In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [3]:
# read the dataset
df = pd.read_csv('C:\\Users\\Vishaly\\OneDrive - Hong Kong Baptist University\\Miscellaneous\\Vishaly\\Coding\\Data-Science-Projects\\house-prices-advanced-regression-techniques\\data\\test.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [4]:
df.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MiscVal            int64
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
Length: 80, dtype: object

In [5]:
# identify the missing values
df.isna().sum()

Id                 0
MSSubClass         0
MSZoning           4
LotFrontage      227
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           1
SaleCondition      0
Length: 80, dtype: int64

In [6]:
# encode the ExterQual with 0,1,2,3 and 4
df['ExterQual'] = df['ExterQual'].map({'Po':0, 'Fa':1, 'TA':2, 'Gd':3, 'Ex':4})

In [7]:
# encode the ExterCond with 0,1,2,3 and 4
df['ExterCond'] = df['ExterCond'].map({'Po':0, 'Fa':1, 'TA':2, 'Gd':3, 'Ex':4})

In [8]:
# encode the BsmtQual with 0,1,2,3,4 and 5
df['BsmtQual'] = df['BsmtQual'].map({'NA':0 ,'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5})

In [9]:
# encode the BsmtCond with 0,1,2,3,4 and 5
df['BsmtCond'] = df['BsmtCond'].map({'NA':0 ,'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5})

In [10]:
# encode BsmtExposure with 0,1,2,3,4 and 5
df['BsmtExposure'] = df['BsmtExposure'].map({'NA':0 ,'No':1, 'Mn':2, 'Av':3, 'Gd':4})

In [11]:
# encode BsmtFinType1 with 0,1,2,3,4,5 and 6
df['BsmtFinType1'] = df['BsmtFinType1'].map({'NA':0 ,'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6})

In [12]:
# encode HeatingQC with 0,1,2,3,4 and 5
df['HeatingQC'] = df['HeatingQC'].map({'Po':0, 'Fa':1, 'TA':2, 'Gd':3, 'Ex':4})

In [13]:
# encode CentralAir with 0 and 1
df['CentralAir'] = df['CentralAir'].map({'N':0, 'Y':1})

In [14]:
# encode KitchenQual with 0,1,2,3,and 4
df['KitchenQual'] = df['KitchenQual'].map({'Po':0, 'Fa':1, 'TA':2, 'Gd':3, 'Ex':4})

In [15]:
# encode BsmtFinType2 with 0,1,2,3,4,5 and 6
df['BsmtFinType2'] = df['BsmtFinType2'].map({'NA':0 ,'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6})

In [16]:
# encode FireplaceQu with 0,1,2,3,4, and 5
df['FireplaceQu'] = df['FireplaceQu'].map({'NA':0 ,'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5})

In [17]:
# encode GarageQual with 0,1,2,3,4, and 5
df['GarageQual'] = df['GarageQual'].map({'NA':0 ,'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5})

In [18]:
# encode GarageCond with 0,1,2,3,4, and 5
df['GarageCond'] = df['GarageCond'].map({'NA':0 ,'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5})

In [19]:
# encode PoolQC with 0,1,2,3, and 4
df['PoolQC'] = df['PoolQC'].map({'NA':0 ,'Fa':1, 'TA':2, 'Gd':3, 'Ex':4})

In [20]:
# Assuming df is your original DataFrame
# Select only the numerical columns
numerical_cols = df.select_dtypes(include=[np.number]).columns

# Normalize the selected numerical columns
df_norm = (df[numerical_cols] - df[numerical_cols].mean()) / df[numerical_cols].std()

# Display the normalized DataFrame
df_norm.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,MiscVal,MoSold,YrSold
0,-1.730271,-0.874411,0.510333,0.363804,-0.750844,0.400629,-0.340828,-1.072517,-0.566973,-0.677869,...,0.366553,-0.701387,-0.360615,-0.088796,1.818336,-0.057207,,-0.092212,-0.038268,1.713318
1,-1.727897,-0.874411,0.555022,0.897553,-0.054858,0.400629,-0.439544,-1.214492,0.041046,-0.677869,...,2.347063,-0.178765,-0.360615,-0.088796,-0.30144,-0.057207,,19.723675,-0.038268,1.713318
2,-1.725524,0.06133,0.242199,0.809368,-0.750844,-0.497247,0.84377,0.678509,-0.566973,-0.677869,...,0.930176,-0.207799,-0.360615,-0.088796,-0.30144,-0.057207,,-0.092212,-1.140224,1.713318
3,-1.72315,0.06133,0.420955,0.032053,-0.054858,0.400629,0.876675,0.678509,-0.454377,-0.677869,...,2.088735,-0.178765,-0.360615,-0.088796,-0.30144,-0.057207,,-0.092212,-0.038268,1.713318
4,-1.720777,1.464941,-1.143162,-0.971475,1.337113,-0.497247,0.679242,0.394559,-0.566973,1.027322,...,-0.729382,0.489031,-0.360615,-0.088796,2.242291,-0.057207,,-0.092212,-1.874861,1.713318


In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression

# Generate a regression dataset for demonstration purposes
X, y = make_regression(n_samples=100, n_features=2, noise=0.1)

# Fit the model
model = LinearRegression()
model.fit(X, y)

# Define your test data
# Note: Replace this with your actual test data
Xtest = [
    [-1.07296862, -0.52817175],
    # Add more test samples here
]

# Make predictions
predictions = model.predict(Xtest)

# Print the inputs and predicted outputs
for i, x in enumerate(Xtest):
    print(f"X={x}, Predicted={predictions[i]}")


X=[-1.07296862, -0.52817175], Predicted=-70.85156345239794


In [22]:
# save the model
import pickle
pickle.dump(model, open('model.pkl', 'wb'))

In [23]:
# load the trained model
import pickle
with open('model.pkl', 'rb') as f:
    lr = pickle.load(f)

# predict the test data
y_pred = lr.predict(Xtest)

# print the predicted values
print(y_pred)

[-70.85156345]


In [24]:
# how many values are predicted
len(y_pred)

1

In [25]:
# fit the entire data set to the model
lr.fit(X, y)