In [51]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score,classification_report, precision_score, recall_score, f1_score
import lightgbm as lgb
from lightgbm import LGBMRegressor

In [52]:
data = pd.read_csv("../data/train.csv")
# data = data.dropna()
print("First few rows of the dataset:")
data.head()
# data.describe()
# data.info()

First few rows of the dataset:


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [53]:
target_column = "SalePrice"
features = data.drop(columns=["PoolQC","Fence","MiscFeature","MasVnrType","Alley","FireplaceQu","Id","EnclosedPorch",
                             "3SsnPorch","ScreenPorch","PoolArea","MiscVal"])

numeric_features = features.select_dtypes(include=["int64","float64"]).columns
cat_features = features.select_dtypes(include=["object"]).columns

features[numeric_features] = features[numeric_features].fillna(features[numeric_features].mean())
features[cat_features] = features[cat_features].fillna(features[cat_features].mode())

target = data[target_column]
features = features.dropna()
# features.info()
features.head(50)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,TA,TA,Y,0,61,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,TA,TA,Y,298,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,TA,TA,Y,0,42,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,TA,TA,Y,0,35,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,TA,TA,Y,192,84,12,2008,WD,Normal,250000
5,50,RL,85.0,14115,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,TA,TA,Y,40,30,10,2009,WD,Normal,143000
6,20,RL,75.0,10084,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,TA,TA,Y,255,57,8,2007,WD,Normal,307000
7,60,RL,70.049958,10382,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,TA,TA,Y,235,204,11,2009,WD,Normal,200000
8,50,RM,51.0,6120,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,Fa,TA,Y,90,0,4,2008,WD,Abnorml,129900
9,190,RL,50.0,7420,Pave,Reg,Lvl,AllPub,Corner,Gtl,...,Gd,TA,Y,0,4,1,2008,WD,Normal,118000


In [54]:
# Converting Categorical Values

features[['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 
              'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 
              'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 
              'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 
              'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 
              'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 
              'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 
              'GarageCond', 'PavedDrive', 'SaleType', 
              'SaleCondition']] = features[['MSZoning', 'Street', 'LotShape', 
                                                'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 
                                                'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 
                                                'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 
                                                'Exterior2nd', 'ExterQual', 'ExterCond', 
                                                'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 
                                                'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 
                                                'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 
                                                'GarageType', 'GarageFinish', 'GarageQual', 
                                                'GarageCond', 'PavedDrive', 
                                                'SaleType', 'SaleCondition']].apply(LabelEncoder().fit_transform)
target = features[target_column]
features.head(50)
features.isna().sum()

MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 69, dtype: int64

In [55]:
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)
model = LGBMRegressor()
model.fit(features,target)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000512 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3448
[LightGBM] [Info] Number of data points in the train set: 1346, number of used features: 65
[LightGBM] [Info] Start training from score 187057.309807


In [56]:
y_pred = model.predict(X_test)

In [57]:
# Step 6: Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [58]:
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared Score: {r2:.2f}")

# Display a sample of predictions
comparison = pd.DataFrame({"Actual": y_test, "Predicted": y_pred}).head(10)
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')
report = classification_report(y_true, y_pred)

print("\nComparison of actual vs. predicted values:")
print(comparison)
print(report)
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

Mean Squared Error: 13297709.96
R-squared Score: 1.00

Comparison of actual vs. predicted values:
      Actual      Predicted
969   140000  139973.710043
1240  224900  224410.242735
369   162000  161987.112910
753   275500  276071.884941
303   149900  149905.565247
1222  143000  142981.761165
1004  181000  180899.919141
398    67000   68195.302587
1005  149900  149943.224158
80    193500  193637.156105
