# Importing necessary libraries


In [157]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_log_error
import math
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
import joblib
from sklearn import preprocessing

# Model Building

In [158]:
houses_pred = pd.read_csv('train (2).csv')

## Model training 

### Dataset loading and splitting into train and test

In [159]:
X = houses_pred

In [160]:
X = X.drop("SalePrice", axis=1)

In [161]:
y = houses_pred["SalePrice"]

In [162]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

### Preprocessing and feature engineering of the train set

In [163]:
X_train = X_train[['MSSubClass', 'MSZoning','Utilities', 'OverallQual', 'OverallCond', 'GarageArea', 'BldgType']]

In [164]:
mmscaler = MinMaxScaler()
cols = ['GarageArea']
mmscaler_obj = mmscaler.fit(X_train[cols])
X_train[cols] = mmscaler_obj.transform(X_train[cols])

In [165]:
encoder = OneHotEncoder(handle_unknown='ignore')
onehotenc_obj = encoder.fit(X_train[['MSZoning', 'BldgType', 'Utilities']])
filtered_transform_1 = pd.DataFrame(onehotenc_obj.transform(X_train[['MSZoning', 'BldgType', 'Utilities']]).toarray())
filtered_transform_1.columns = onehotenc_obj.get_feature_names(['MSZoning', 'BldgType', 'Utilities'])
X_train = pd.concat([X_train, filtered_transform_1], axis =1)



In [166]:
X_train = X_train.drop(columns = ['MSZoning', 'Utilities', 'BldgType'], axis=1)

In [167]:
X_train = X_train.dropna()
new_df_2 = pd.merge(X_train,y_train,how='inner',right_index = True, left_index=True)
y = new_df_2['SalePrice']
X = new_df_2.drop(columns = ['SalePrice'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

### Model training

In [169]:
model = LinearRegression()
model.fit(X_train, y_train)

In [172]:
joblib.dump(mmscaler_obj, 'models/mmscaler.joblib')

['models/mmscaler.joblib']

In [173]:
joblib.dump(onehotenc_obj, 'models/onehotencoder.joblib')

['models/onehotencoder.joblib']

In [174]:
joblib.dump(model, 'models/model.joblib')

['models/model.joblib']

###  Model evaluation

In [170]:
y_pred = abs(model.predict(X_test))

In [171]:
def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)

compute_rmsle(y_test, y_pred)

0.25

## Model Inference

### Reading data from a given file (test.csv file in your case)

In [192]:
test_df = pd.read_csv('test (1).csv')

### Preprocessing and feature engineering of this data

In [193]:
X_test_df = test_df[['MSSubClass', 'MSZoning','Utilities', 'OverallQual', 'OverallCond', 'GarageArea', 'BldgType']]

In [194]:
mmscaler = joblib.load('models/mmscaler.joblib')

In [195]:
X_test_df[cols] = mmscaler.transform(X_test_df[cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_df[cols] = mmscaler.transform(X_test_df[cols])


In [196]:
onehotencoder = joblib.load('models/onehotencoder.joblib')

In [197]:
filtered_transform_1 = pd.DataFrame(onehotencoder.transform(X_test_df[['MSZoning', 'BldgType', 'Utilities']]).toarray())
filtered_transform_1.columns = onehotencoder.get_feature_names(['MSZoning', 'BldgType', 'Utilities'])
X_test_df = pd.concat([X_test_df, filtered_transform_1], axis =1)



In [198]:
X_test_df = X_test_df.drop(columns = ['MSZoning', 'Utilities', 'BldgType'], axis=1)

In [199]:
X_test_df = X_test_df.dropna()

In [200]:
lin_reg_model = joblib.load('models/model.joblib')

### Predicting the house prices of this data

In [201]:
y_pred_test = abs(lin_reg_model.predict(X_test_df))
y_pred_test

array([167362.15500866, 166342.18961277, 141498.5084672 , ...,
       156909.41014508,  87434.59932243, 232343.06763317])