In [53]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LassoCV
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder

In [54]:
# Artifact locations, relative to the notebook's working directory.
MODEL = "model.pkl"        # joblib file the standalone estimator is loaded from
PIPELINE = "pipeline.pkl"  # joblib file the prediction pipeline is loaded from

In [55]:
def pipe(num_attr, cat_attr):
    """Build the preprocessing ColumnTransformer for the given columns.

    Parameters
    ----------
    num_attr : list
        Columns routed through median imputation followed by standard scaling.
    cat_attr : list
        Columns routed through most-frequent imputation followed by one-hot
        encoding.

    Returns
    -------
    ColumnTransformer
        Unfitted transformer with a ``'num'`` and a ``'cat'`` branch.
    """
    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("sc", StandardScaler()),
    ]
    # NOTE(review): per the scikit-learn docs, combining drop='first' with
    # handle_unknown='ignore' encodes an unseen category as all zeros, i.e. the
    # same vector as the dropped first category -- confirm this is intended.
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(drop="first", handle_unknown="ignore")),
    ]
    return ColumnTransformer(
        transformers=[
            ("num", Pipeline(steps=numeric_steps), num_attr),
            ("cat", Pipeline(steps=categorical_steps), cat_attr),
        ]
    )


In [58]:
# Train-or-predict cell.
#
# If both saved artifacts exist, load them, predict on the held-out split that
# was written during training, save the predictions and report RMSE / R².
# Otherwise train from Housing.csv and write the artifacts plus the held-out
# split to disk.
#
# Both artifacts are checked: the inference branch loads MODEL *and* PIPELINE,
# so guarding on MODEL alone would crash when only pipeline.pkl is missing.
if os.path.exists(MODEL) and os.path.exists(PIPELINE):
    # NOTE: joblib.load unpickles arbitrary objects; only load files that this
    # notebook itself produced.
    model=joblib.load(MODEL)  # bare estimator; predictions below use the full pipeline
    pipeline=joblib.load(PIPELINE)
    x_test=pd.read_csv('test_features.csv')
    y_test=pd.read_csv('actual_price.csv')
    # The pipeline applies preprocessing and the estimator in one call.
    prediction=pipeline.predict(x_test)
    pd.DataFrame(prediction,columns=['price']).to_csv('predicted_price.csv',index=False)
    print("model loaded prediction and saved to predicted_price.csv")
    rmse=root_mean_squared_error(y_test,prediction)
    r2=r2_score(y_test,prediction)
    print(f"RMSE: {rmse:.2f}, R²: {r2:.2f}")
else:
    df=pd.read_csv('Housing.csv')
    df1=df.copy()
    X=df1.drop(columns=['price'])
    y=df1['price']
    x_train,x_test,y_train,y_test=train_test_split(X,y,random_state=42,test_size=0.2)
    # Persist the held-out split so the inference branch evaluates on exactly
    # the rows that were excluded from training.
    x_test.to_csv("test_features.csv",index=False)
    y_test.to_csv("actual_price.csv",index=False)
    # Column routing is derived from the training frame's dtypes.
    num=x_train.select_dtypes(include=np.number).columns.tolist()
    cat=x_train.select_dtypes(exclude=np.number).columns.tolist()
    preprocessed=pipe(num,cat)
    pipeline=Pipeline([
        ('preprocessor',preprocessed),
        ('model',LassoCV(alphas=[0.0001,0.001,0.01,0.1,1],cv=5, random_state=42, max_iter=10000))
    ])
    # Train the model.
    pipeline.fit(x_train,y_train)
    # Write through the same constants the load branch reads, so the save and
    # load paths cannot drift apart.
    joblib.dump(pipeline,PIPELINE)  # preprocessor + estimator together
    # Bind `model` here as well so the name exists whichever branch ran.
    model=pipeline.named_steps['model']
    joblib.dump(model,MODEL)  # estimator only; expects already-preprocessed input
    print("model is trained and saved")

model loaded prediction and saved to predicted_price.csv
RMSE: 1324508.13, R²: 0.65
