**IMPORTS**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler,LabelEncoder

**READING DATA FROM FILE**

In [2]:
df = pd.read_csv('/kaggle/input/home-data-for-ml-course/train.csv')

****FILLING IN THE GAPS****

In [3]:
df[df.select_dtypes(include=['int', 'float']).columns] = df.select_dtypes(include=['int', 'float']).fillna(value=0)
df[df.select_dtypes(include=['object']).columns] = df.select_dtypes(include=['object']).fillna(value='')

**DATA ANALYSIS**

In [4]:
df.head(3)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1460 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          1460 non-null   object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [6]:
df.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,57.623288,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.117123,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,34.664304,9981.264932,1.382997,1.112799,30.202904,20.645407,180.731373,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,0.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,42.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,63.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,79.0,11601.5,7.0,6.0,2000.0,2004.0,164.25,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [7]:
# Finding int and float type columns
obj_columns = df.select_dtypes(include=['object']).columns.tolist()
int_columns = df.select_dtypes(include=['int']).columns.tolist()    # Find integer columns
float_columns = df.select_dtypes(include=['float']).columns.tolist()  # Find float columns

# Printing the results
print("Object Columns:", obj_columns)  # Print object columns
print("Integer Columns:", int_columns)  # Print integer columns
print("Float Columns:", float_columns)    # Print float columns


Object Columns: ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
Integer Columns: ['Id', 'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPor

**CONVERTING DATA TO NUMERIC**

In [8]:
le = LabelEncoder()
for col in obj_columns:
    df[col] = le.fit_transform(df[col])

**SEPARATING DATA**

In [9]:
# Finding columns where more than 75% of the values are zero
threshold = 0.75  # 75% threshold
unimp_num_features = [col for col in df.columns if (df[col] == 0).sum() / len(df) > threshold]

# Printing the results
print("Unimportant numerical features (unimp_num_features):", len(unimp_num_features), ":", unimp_num_features)


Unimportant numerical features (unimp_num_features): 15 : ['Alley', 'Utilities', 'LandSlope', 'BldgType', 'BsmtFinSF2', 'LowQualFinSF', 'BsmtHalfBath', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal']


****PREPARING THE TRAINING DATA****

In [10]:
y = df['SalePrice']
X = df.drop(['Id','SalePrice']+unimp_num_features, axis=1)

In [11]:
X_train, x_test, y_train, y_test = train_test_split(X, y, random_state=60,train_size=0.9)

In [12]:
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,LotConfig,Neighborhood,Condition1,...,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,MoSold,YrSold,SaleType,SaleCondition
0,60,3,65.0,8450,1,3,3,4,5,2,...,548,5,5,2,0,61,2,2008,8,4
1,20,3,80.0,9600,1,3,3,2,24,1,...,460,5,5,2,298,0,5,2007,8,4
2,60,3,68.0,11250,1,0,3,4,5,2,...,608,5,5,2,0,42,9,2008,8,4
3,70,3,60.0,9550,1,0,3,0,6,2,...,642,5,5,2,0,35,2,2006,8,0
4,60,3,84.0,14260,1,0,3,2,15,2,...,836,5,5,2,192,84,12,2008,8,4


In [13]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 64 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   int64  
 2   LotFrontage    1460 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   int64  
 5   LotShape       1460 non-null   int64  
 6   LandContour    1460 non-null   int64  
 7   LotConfig      1460 non-null   int64  
 8   Neighborhood   1460 non-null   int64  
 9   Condition1     1460 non-null   int64  
 10  Condition2     1460 non-null   int64  
 11  HouseStyle     1460 non-null   int64  
 12  OverallQual    1460 non-null   int64  
 13  OverallCond    1460 non-null   int64  
 14  YearBuilt      1460 non-null   int64  
 15  YearRemodAdd   1460 non-null   int64  
 16  RoofStyle      1460 non-null   int64  
 17  RoofMatl       1460 non-null   int64  
 18  Exterior

**TRAINING THE MODEL**
* LGB Model

In [14]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
# Creating a LightGBM Dataset
train_data = lgb.Dataset(X, label=y)

# Define Model Parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'dart',
    'num_leaves': 64,
    'learning_rate': 0.25,
    'feature_fraction': 0.6,
    'verbose': -1
}

# Train the Model
lgb_model = lgb.train(params, train_data, num_boost_round=1000)
# Make Predictions on the Test Set
y_pred = lgb_model.predict(x_test, num_iteration=lgb_model.best_iteration)
# Evaluate the Performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

Mean Squared Error: 5316154.331322902
R² Score: 0.9993859149678057


**PROCEDURES REQUIRED FOR TESTING**

In [15]:
df_test = pd.read_csv('/kaggle/input/home-data-for-ml-course/test.csv.gz')
df_test[df_test.select_dtypes(include=['object']).columns] = df_test.select_dtypes(include=['object']).fillna(value='')
df_test[df_test.select_dtypes(include=['int', 'float']).columns] = df_test.select_dtypes(include=['int', 'float']).fillna(value=0)
for col in obj_columns:
    df_test[col] = le.fit_transform(df_test[col])
x_test_t2 = df_test.drop(['Id']+unimp_num_features, axis=1)

**SUMMIT**

In [16]:
predictions = lgb_model.predict(x_test_t2)

ids = range(1461, 1461 + len(predictions))

submission = pd.DataFrame({'Id': ids, 'SalePrice': predictions})

submission.to_csv('submission.csv', index=False)