In [162]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
#%matplotlib inline

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

import statsmodels.api as sm

In [163]:
dftrain1=pd.read_csv('train.csv')
dftest1=pd.read_csv('test.csv')

In [164]:
dftrain=dftrain1.copy()
dftest=dftest1.copy()

#Know Your Data

In [165]:
dftrain.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [166]:
dftest.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [167]:
dftrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [168]:
dftest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

#Data Cleaning
- Identification and Treatment of Null Values
- Identification and Treatment of Duplicates
- Converting Data Type as required

In [169]:
# Find columns with null values and their counts
null_counts = dftrain.isnull().sum()
null_info = null_counts[null_counts > 0].reset_index()
null_info.columns = ['Column_Name', 'Null_Count']

# Display the result
print(null_info)

     Column_Name  Null_Count
0    LotFrontage         259
1          Alley        1369
2     MasVnrType         872
3     MasVnrArea           8
4       BsmtQual          37
5       BsmtCond          37
6   BsmtExposure          38
7   BsmtFinType1          37
8   BsmtFinType2          38
9     Electrical           1
10   FireplaceQu         690
11    GarageType          81
12   GarageYrBlt          81
13  GarageFinish          81
14    GarageQual          81
15    GarageCond          81
16        PoolQC        1453
17         Fence        1179
18   MiscFeature        1406


In [170]:
print(f'duplicated values: {dftrain.duplicated().sum().item()}')

duplicated values: 0


In [171]:
# Find columns with null values and their counts
null_counts = dftest.isnull().sum()
null_info = null_counts[null_counts > 0].reset_index()
null_info.columns = ['Column_Name', 'Null_Count']

# Display the result
print(null_info)

     Column_Name  Null_Count
0       MSZoning           4
1    LotFrontage         227
2          Alley        1352
3      Utilities           2
4    Exterior1st           1
5    Exterior2nd           1
6     MasVnrType         894
7     MasVnrArea          15
8       BsmtQual          44
9       BsmtCond          45
10  BsmtExposure          44
11  BsmtFinType1          42
12    BsmtFinSF1           1
13  BsmtFinType2          42
14    BsmtFinSF2           1
15     BsmtUnfSF           1
16   TotalBsmtSF           1
17  BsmtFullBath           2
18  BsmtHalfBath           2
19   KitchenQual           1
20    Functional           2
21   FireplaceQu         730
22    GarageType          76
23   GarageYrBlt          78
24  GarageFinish          78
25    GarageCars           1
26    GarageArea           1
27    GarageQual          78
28    GarageCond          78
29        PoolQC        1456
30         Fence        1169
31   MiscFeature        1408
32      SaleType           1


In [172]:
print(f'duplicated values: {dftest.duplicated().sum().item()}')

duplicated values: 0


In [173]:
dftrain.drop(columns=['Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature'], inplace=True)

In [174]:
dftest.drop(columns=['Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature'], inplace=True)

In [175]:
with pd.option_context('display.max_columns', None):
    print(dftrain)

      MSSubClass MSZoning  LotFrontage  LotArea Street LotShape LandContour  \
0             60       RL         65.0     8450   Pave      Reg         Lvl   
1             20       RL         80.0     9600   Pave      Reg         Lvl   
2             60       RL         68.0    11250   Pave      IR1         Lvl   
3             70       RL         60.0     9550   Pave      IR1         Lvl   
4             60       RL         84.0    14260   Pave      IR1         Lvl   
...          ...      ...          ...      ...    ...      ...         ...   
1455          60       RL         62.0     7917   Pave      Reg         Lvl   
1456          20       RL         85.0    13175   Pave      Reg         Lvl   
1457          70       RL         66.0     9042   Pave      Reg         Lvl   
1458          20       RL         68.0     9717   Pave      Reg         Lvl   
1459          20       RL         75.0     9937   Pave      Reg         Lvl   

     Utilities LotConfig LandSlope Neighborhood Con

In [176]:
dftest.sample(5)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
827,20,RL,91.0,11825,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,112,0,0,0,0,0,6,2007,New,Partial
1272,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,205,0,0,0,0,6,2006,WD,Normal
601,20,RL,75.0,10425,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,0,4,2008,WD,Normal
649,30,RL,55.0,7111,Pave,IR1,Bnk,AllPub,Inside,Gtl,...,0,0,0,0,0,0,7,2008,WD,Normal
1265,80,RL,80.0,9600,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,78,0,0,0,0,0,7,2006,WD,Normal


In [177]:
# Create a DataFrame with column name, number of unique values, and datatype
column_summary = pd.DataFrame({
    'Column_Name': dftrain.columns,
    'Unique_Values': dftrain.nunique(),
    'Data_Type': dftrain.dtypes
}).reset_index(drop=True)

In [178]:
# Assuming your DataFrame is named df
with pd.option_context('display.max_rows', None):
    print(column_summary)

      Column_Name  Unique_Values Data_Type
0      MSSubClass             15     int64
1        MSZoning              5    object
2     LotFrontage            110   float64
3         LotArea           1073     int64
4          Street              2    object
5        LotShape              4    object
6     LandContour              4    object
7       Utilities              2    object
8       LotConfig              5    object
9       LandSlope              3    object
10   Neighborhood             25    object
11     Condition1              9    object
12     Condition2              8    object
13       BldgType              5    object
14     HouseStyle              8    object
15    OverallQual             10     int64
16    OverallCond              9     int64
17      YearBuilt            112     int64
18   YearRemodAdd             61     int64
19      RoofStyle              6    object
20       RoofMatl              8    object
21    Exterior1st             15    object
22    Exter

In [179]:
'''from google.colab import files

# Save the DataFrame to an Excel file
column_summary.to_excel('column_summary.xlsx', index=False)

# Download the Excel file to your laptop
files.download('column_summary.xlsx')'''

"from google.colab import files\n\n# Save the DataFrame to an Excel file\ncolumn_summary.to_excel('column_summary.xlsx', index=False)\n\n# Download the Excel file to your laptop\nfiles.download('column_summary.xlsx')"

In [180]:
dftrain['MSSubClass']=dftrain['MSSubClass'].astype('category')
dftrain['MoSold']=dftrain['MoSold'].astype('category')

In [181]:
dftest['MSSubClass']=dftest['MSSubClass'].astype('category')
dftest['MoSold']=dftest['MoSold'].astype('category')

In [182]:
# Convert all 'object' columns in dftrain and dftest to 'category'
dftrain = dftrain.apply(lambda x: x.astype('category') if x.dtype == 'object' else x)
dftest = dftest.apply(lambda x: x.astype('category') if x.dtype == 'object' else x)

In [183]:
from sklearn.impute import SimpleImputer

In [184]:
# Step 1: Separate columns, excluding the target variable
target = 'SalePrice'
num_cols = dftrain.select_dtypes(include=['float64', 'int64']).columns.drop(target, errors='ignore')
cat_cols = dftrain.select_dtypes(include=['category']).columns

# Step 2: Impute missing values for numerical columns using the median
imputer_median = SimpleImputer(strategy='median')
dftrain[num_cols] = imputer_median.fit_transform(dftrain[num_cols])

# Step 3: Impute missing values for categorical columns using most_frequent
imputer_most = SimpleImputer(strategy='most_frequent')
dftrain[cat_cols] = imputer_most.fit_transform(dftrain[cat_cols])

# Step 4: Transform the test data
# Ensure only matching columns between train and test are transformed
num_cols_test = [col for col in num_cols if col in dftest.columns]
cat_cols_test = [col for col in cat_cols if col in dftest.columns]

dftest[num_cols_test] = imputer_median.transform(dftest[num_cols_test])
dftest[cat_cols_test] = imputer_most.transform(dftest[cat_cols_test])

In [185]:
# Convert all 'object' columns in dftrain and dftest to 'category'
dftrain = dftrain.apply(lambda x: x.astype('category') if x.dtype == 'object' else x)
dftest = dftest.apply(lambda x: x.astype('category') if x.dtype == 'object' else x)

In [186]:
dftrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 76 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   MSSubClass     1460 non-null   category
 1   MSZoning       1460 non-null   category
 2   LotFrontage    1460 non-null   float64 
 3   LotArea        1460 non-null   float64 
 4   Street         1460 non-null   category
 5   LotShape       1460 non-null   category
 6   LandContour    1460 non-null   category
 7   Utilities      1460 non-null   category
 8   LotConfig      1460 non-null   category
 9   LandSlope      1460 non-null   category
 10  Neighborhood   1460 non-null   category
 11  Condition1     1460 non-null   category
 12  Condition2     1460 non-null   category
 13  BldgType       1460 non-null   category
 14  HouseStyle     1460 non-null   category
 15  OverallQual    1460 non-null   float64 
 16  OverallCond    1460 non-null   float64 
 17  YearBuilt      1460 non-null   fl

In [187]:
dftest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 75 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   MSSubClass     1459 non-null   category
 1   MSZoning       1459 non-null   category
 2   LotFrontage    1459 non-null   float64 
 3   LotArea        1459 non-null   float64 
 4   Street         1459 non-null   category
 5   LotShape       1459 non-null   category
 6   LandContour    1459 non-null   category
 7   Utilities      1459 non-null   category
 8   LotConfig      1459 non-null   category
 9   LandSlope      1459 non-null   category
 10  Neighborhood   1459 non-null   category
 11  Condition1     1459 non-null   category
 12  Condition2     1459 non-null   category
 13  BldgType       1459 non-null   category
 14  HouseStyle     1459 non-null   category
 15  OverallQual    1459 non-null   float64 
 16  OverallCond    1459 non-null   float64 
 17  YearBuilt      1459 non-null   fl

#Data Preprocessing
- Encoding
- Scaling

In [188]:
!pip install category_encoders



In [189]:
import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import chi2

In [190]:
# Define a function to calculate correlation between categorical columns and the target 'SalePrice'
def calculate_correlation(df, target):
    # For categorical columns, use Chi-squared test to find correlation with the target
    categorical_cols = df.select_dtypes(include=['category', 'object']).columns
    correlation_dict = {}

    for col in categorical_cols:
        # Apply Chi-squared test for independence
        contingency_table = pd.crosstab(df[col], df[target])
        chi2_stat, p_value = chi2(contingency_table, contingency_table.sum(axis=1))
        correlation_dict[col] = p_value[0]  # p-value tells us the correlation strength

    return correlation_dict

# Define encoding strategies
def encode_columns(dftrain, dftest, correlation_dict, target_col='SalePrice'):
    encoded_dftrain = dftrain.copy()
    encoded_dftest = dftest.copy()

    for col in dftrain.select_dtypes(include=['category', 'object']).columns:
        unique_values = len(dftrain[col].unique())

        # Default to One-Hot Encoding if cardinality is low (<= 10)
        if unique_values <= 10:
            print(f"Applying One-Hot Encoding on {col}")
            one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')
            train_encoded = one_hot_encoder.fit_transform(dftrain[[col]])
            test_encoded = one_hot_encoder.transform(dftest[[col]])

            # Get the column names for the one-hot encoding
            encoded_colnames = one_hot_encoder.get_feature_names_out([col])
            train_encoded_df = pd.DataFrame(train_encoded, columns=encoded_colnames)
            test_encoded_df = pd.DataFrame(test_encoded, columns=encoded_colnames)

            # Merge the one-hot encoded columns
            encoded_dftrain = encoded_dftrain.join(train_encoded_df)
            encoded_dftest = encoded_dftest.join(test_encoded_df)
            encoded_dftrain.drop(columns=[col], inplace=True)
            encoded_dftest.drop(columns=[col], inplace=True)

        elif unique_values > 10:
            # Check correlation with the target variable
            if correlation_dict.get(col, 1) < 0.05:  # assuming p-value < 0.05 means strong correlation
                print(f"Applying Target Encoding on {col}")
                encoder = ce.TargetEncoder(cols=[col])
                encoded_dftrain[col] = encoder.fit_transform(dftrain[col], dftrain[target_col])
                encoded_dftest[col] = encoder.transform(dftest[col])
            else:
                # Weak correlation, use Frequency Encoding
                print(f"Applying Frequency Encoding on {col}")
                freq_map = dftrain[col].value_counts().to_dict()
                encoded_dftrain[col] = dftrain[col].map(freq_map)
                encoded_dftest[col] = dftest[col].map(freq_map)

    return encoded_dftrain, encoded_dftest

# 1. Calculate correlation of categorical columns with the target 'SalePrice'
correlation_dict = calculate_correlation(dftrain, target='SalePrice')

# 2. Encode the categorical columns based on correlation
dftrain_encoded, dftest_encoded = encode_columns(dftrain, dftest, correlation_dict)

# Check the new dataset with encoded columns
print(dftrain_encoded.head())
print(dftest_encoded.head())

Applying Frequency Encoding on MSSubClass
Applying One-Hot Encoding on MSZoning
Applying One-Hot Encoding on Street
Applying One-Hot Encoding on LotShape
Applying One-Hot Encoding on LandContour
Applying One-Hot Encoding on Utilities
Applying One-Hot Encoding on LotConfig
Applying One-Hot Encoding on LandSlope
Applying Frequency Encoding on Neighborhood
Applying One-Hot Encoding on Condition1
Applying One-Hot Encoding on Condition2
Applying One-Hot Encoding on BldgType
Applying One-Hot Encoding on HouseStyle
Applying One-Hot Encoding on RoofStyle
Applying One-Hot Encoding on RoofMatl
Applying Frequency Encoding on Exterior1st
Applying Frequency Encoding on Exterior2nd
Applying One-Hot Encoding on MasVnrType
Applying One-Hot Encoding on ExterQual
Applying One-Hot Encoding on ExterCond
Applying One-Hot Encoding on Foundation
Applying One-Hot Encoding on BsmtQual
Applying One-Hot Encoding on BsmtCond
Applying One-Hot Encoding on BsmtExposure
Applying One-Hot Encoding on BsmtFinType1
Apply

In [191]:
dftrain_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Columns: 186 entries, MSSubClass to SaleCondition_Partial
dtypes: category(2), float64(180), int64(4)
memory usage: 2.1 MB


In [192]:
dftest_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Columns: 185 entries, MSSubClass to SaleCondition_Partial
dtypes: category(2), float64(181), int64(2)
memory usage: 2.0 MB


In [193]:
# Display all columns with the data type 'category'
categorical_columns = dftrain_encoded.select_dtypes(include='category')
print(categorical_columns)

     MSSubClass MoSold
0           299     52
1           536    204
2           299     63
3            60     52
4           299     59
...         ...    ...
1455        299    122
1456        536     52
1457         60    204
1458        536    141
1459        536    253

[1460 rows x 2 columns]


In [194]:
freq_map = dftrain_encoded['MSSubClass'].value_counts(normalize=True).to_dict()
dftrain_encoded['MSSubClass'] = dftrain_encoded['MSSubClass'].map(freq_map)
dftest_encoded['MSSubClass'] = dftest_encoded['MSSubClass'].map(freq_map).fillna(0)  # Fallback for unseen

In [195]:
dftest_encoded['MoSold'] = dftest_encoded['MoSold'].astype(str)  # Convert to object type
dftrain_encoded['MoSold'] = dftrain_encoded['MoSold'].astype(str)

freq_map = dftrain_encoded['MoSold'].value_counts(normalize=True).to_dict()

dftrain_encoded['MoSold'] = dftrain_encoded['MoSold'].map(freq_map)
dftest_encoded['MoSold'] = dftest_encoded['MoSold'].map(freq_map).fillna(0)  # Ensure no NaN

In [196]:
freq_map = dftrain_encoded['MoSold'].value_counts(normalize=True).to_dict()

dftrain_encoded['MoSold'] = dftrain_encoded['MoSold'].map(freq_map)
dftest_encoded['MoSold'] = dftest_encoded['MoSold'].map(freq_map)

# Fill any NaN values caused by unseen categories
dftest_encoded['MoSold'] = dftest_encoded['MoSold'].fillna(0)

In [197]:
print("NaN in MSSubClass (train):", dftrain_encoded['MSSubClass'].isnull().sum())
print("NaN in MSSubClass (test):", dftest_encoded['MSSubClass'].isnull().sum())
print("NaN in MoSold (train):", dftrain_encoded['MoSold'].isnull().sum())
print("NaN in MoSold (test):", dftest_encoded['MoSold'].isnull().sum())

NaN in MSSubClass (train): 0
NaN in MSSubClass (test): 0
NaN in MoSold (train): 0
NaN in MoSold (test): 0


In [198]:
dftrain_encoded['MSSubClass'] = dftrain_encoded['MSSubClass'].astype(float)  # Or int if appropriate
dftest_encoded['MSSubClass'] = dftest_encoded['MSSubClass'].astype(float)

dftrain_encoded['MoSold'] = dftrain_encoded['MoSold'].astype(float)  # Or int
dftest_encoded['MoSold'] = dftest_encoded['MoSold'].astype(float)

In [199]:
dftrain_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Columns: 186 entries, MSSubClass to SaleCondition_Partial
dtypes: float64(182), int64(4)
memory usage: 2.1 MB


In [200]:
dftest_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Columns: 185 entries, MSSubClass to SaleCondition_Partial
dtypes: category(1), float64(182), int64(2)
memory usage: 2.1 MB


In [201]:
# Display all columns with the data type 'category'
categorical_columns = dftest_encoded.select_dtypes(include='category')
print(categorical_columns)

     Exterior2nd
0            504
1            197
2            504
3            504
4            207
...          ...
1454          60
1455          60
1456         504
1457          38
1458         207

[1459 rows x 1 columns]


In [202]:
print(dftrain_encoded['Exterior2nd'].head())
print(dftest_encoded['Exterior2nd'].head())

0    504
1    214
2    504
3     38
4    504
Name: Exterior2nd, dtype: int64
0    504
1    197
2    504
3    504
4    207
Name: Exterior2nd, dtype: category
Categories (15, int64): [20, 3, 7, 25, ..., 26, 504, 197, 38]


In [203]:
dftest_encoded['Exterior2nd'] = dftest_encoded['Exterior2nd'].astype(float)

In [204]:
dftest_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Columns: 185 entries, MSSubClass to SaleCondition_Partial
dtypes: float64(183), int64(2)
memory usage: 2.1 MB


**Scaling**

In [205]:
# Separate predictor and target variables
x = dftrain_encoded.drop(columns=['SalePrice'])
y = dftrain_encoded[['SalePrice']]

In [206]:
scaler = MinMaxScaler()

# Fit the scaler on the training data and transform both training and testing data
dftrain_scaled = scaler.fit_transform(x)
dftest_scaled = scaler.transform(dftest_encoded)

In [207]:
!pip install catboost



In [208]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

In [209]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(dftrain_scaled, y, test_size=0.2, random_state=42)

In [210]:
# Define models
models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(random_state=42),
    "XGBRegressor": XGBRegressor(random_state=42),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42),
    "LGBMRegressor": LGBMRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42)
}

In [211]:
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_val)

    # Handle invalid predictions
    if np.any(preds <= 0) or np.isnan(preds).any():
        print(f"{name} produced invalid predictions. Skipping RMSE calculation.")
        continue

    # Compute RMSE
    preds = np.clip(preds, a_min=1, a_max=None)
    rmse = np.sqrt(mean_squared_error(np.log1p(y_val), np.log1p(preds)))
    print(f"{name} RMSE: {rmse:.5f}")


LinearRegression produced invalid predictions. Skipping RMSE calculation.


  return fit_method(estimator, *args, **kwargs)


RandomForest RMSE: 0.15337
XGBRegressor RMSE: 0.14189
CatBoost RMSE: 0.13807
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001405 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3109
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 120
[LightGBM] [Info] Start training from score 181441.541952


  y = column_or_1d(y, warn=True)


LGBMRegressor RMSE: 0.14904


  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


GradientBoosting RMSE: 0.14459


In [212]:
best_model = CatBoostRegressor(verbose=0, random_state=42)  # Selecting the best model
best_model.fit(dftrain_scaled, y)

<catboost.core.CatBoostRegressor at 0x784a2bdb7100>

In [213]:
test_preds = best_model.predict(dftest_scaled)

In [214]:
test_preds

array([123720.98540891, 164975.17673791, 186806.94032301, ...,
       166927.55978968, 116285.17130774, 232659.60001279])

In [216]:
len(test_preds)

1459

In [217]:
submission = pd.DataFrame({
    "ID": range(1461, 1461 + len(test_preds)),  # Generate IDs for test data
    "SalePrice": test_preds  # Predicted house prices
})

# Verify the submission file
print(submission.head())
print(submission.describe())

     ID      SalePrice
0  1461  123720.985409
1  1462  164975.176738
2  1463  186806.940323
3  1464  190563.169797
4  1465  182813.145035
                ID      SalePrice
count  1459.000000    1459.000000
mean   2190.000000  178888.673705
std     421.321334   76127.901582
min    1461.000000   44288.469506
25%    1825.500000  128578.607799
50%    2190.000000  157328.997056
75%    2554.500000  211167.833214
max    2919.000000  534507.953473


In [218]:
submission.to_csv("submission.csv", index=False)
print("Submission file saved as 'submission.csv'")

Submission file saved as 'submission.csv'
