In [1]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
import matplotlib as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from scipy.ndimage import shift
from IPython.display import clear_output
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder
import seaborn
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.linear_model import LinearRegression

In [2]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [3]:
train_data = pd.read_csv(Path("/kaggle/input/house-prices-advanced-regression-techniques/train.csv"))
test_data = pd.read_csv(Path("/kaggle/input/house-prices-advanced-regression-techniques/test.csv"))
sample_submission = pd.read_csv(Path("/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv"))
train_data = train_data.drop("Id",axis=1)
test_data = test_data.drop("Id",axis=1)

In [4]:
X_train_data = train_data.drop("SalePrice",axis=1)
y_train_data = train_data["SalePrice"]

In [5]:
X_train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [6]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1459 non-null   int64  
 1   MSZoning       1455 non-null   object 
 2   LotFrontage    1232 non-null   float64
 3   LotArea        1459 non-null   int64  
 4   Street         1459 non-null   object 
 5   Alley          107 non-null    object 
 6   LotShape       1459 non-null   object 
 7   LandContour    1459 non-null   object 
 8   Utilities      1457 non-null   object 
 9   LotConfig      1459 non-null   object 
 10  LandSlope      1459 non-null   object 
 11  Neighborhood   1459 non-null   object 
 12  Condition1     1459 non-null   object 
 13  Condition2     1459 non-null   object 
 14  BldgType       1459 non-null   object 
 15  HouseStyle     1459 non-null   object 
 16  OverallQual    1459 non-null   int64  
 17  OverallCond    1459 non-null   int64  
 18  YearBuil

In [7]:
X_train_data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


**Prepairing the data :**

In [8]:
imputer = SimpleImputer(strategy="most_frequent")
imputer.fit(pd.concat([X_train_data,test_data]))

In [9]:
c = X_train_data.columns
i_train = X_train_data.index
i_test = test_data.index
X_train_data = pd.DataFrame(imputer.transform(X_train_data),columns = c,index = i_train)
test_data = pd.DataFrame(imputer.transform(test_data),columns = c,index=i_test)

In [10]:
 str_cat = [ "MSZoning"      ,
  "Street"        ,
  "Alley"         ,
  "LotShape"      ,
  "LandContour"   ,
  "Utilities"     ,
  "LotConfig"     ,
  "LandSlope"     ,
  "Neighborhood"  ,
  "Condition1"    ,
  "Condition2"    ,
  "BldgType"      ,
  "HouseStyle"    ,
  "RoofStyle"     ,
  "RoofMatl"      ,
  "Exterior1st"   ,
  "Exterior2nd"   ,
  "MasVnrType"    ,
  "ExterQual"     ,
  "ExterCond"     ,
  "Foundation"    ,
  "BsmtQual"      ,
  "BsmtCond"      ,
  "BsmtExposure"  ,
  "BsmtFinType1"  ,
  "BsmtFinType2"  ,
  "Heating"       ,
  "HeatingQC"     ,
  "CentralAir"    ,
  "Electrical"    ,
  "KitchenQual"   ,
  "Functional"    ,
  "FireplaceQu"   ,
  "GarageType"    ,
  "GarageFinish"  ,
  "GarageQual"    ,
  "GarageCond"    ,
  "PavedDrive"    ,
  "PoolQC"        ,
  "Fence"         ,
  "MiscFeature"   ,
  "SaleCondition" ,
  "SaleType"      ]

In [11]:
def OneHotE(dataframe,cat_col,cat_encoder):
    dataframe_num = dataframe.drop(cat_col,axis=1)
    dataframe_cat = dataframe[cat_col]
    dataframe_cat = cat_encoder.transform(dataframe_cat)
    dataframe_cat = pd.DataFrame(dataframe_cat.toarray(),columns=cat_encoder.get_feature_names_out(),index=dataframe.index)
    dataframe = pd.concat([dataframe_num,dataframe_cat],axis=1)    
    return dataframe

In [12]:
cat_encoder = OneHotEncoder()

In [13]:
cat_encoder.fit(pd.concat([X_train_data[str_cat],test_data[str_cat]]))

In [14]:
X_train_data = OneHotE(X_train_data,str_cat,cat_encoder)
test_data = OneHotE(test_data,str_cat,cat_encoder)

In [15]:
scaler = StandardScaler()
for column in X_train_data.columns:
    X_train_data[column] = scaler.fit_transform(np.array(X_train_data[column]).reshape(-1,1))

**Training a model:**

In [16]:
pipeline = Pipeline([("classifier",LinearRegression())])

In [17]:
param_grid = [{"classifier__fit_intercept":[True,False]}] 

In [18]:
grid_search = GridSearchCV(pipeline,param_grid,cv=3,scoring="r2")

In [19]:
grid_search.fit(X_train_data,y_train_data)

In [20]:
grid_search.best_params_


{'classifier__fit_intercept': True}

In [21]:
cv_res = pd.DataFrame(grid_search.cv_results_)
cv_res.sort_values(by="mean_test_score", ascending=False, inplace=True)
cv_res.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__fit_intercept,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.118429,0.028323,0.022316,0.003377,True,{'classifier__fit_intercept': True},-6.722176e+24,-8.903528e+25,-1.2625e+21,-3.191957e+25,4.048e+25,1
1,0.130498,0.011383,0.019915,0.002773,False,{'classifier__fit_intercept': False},-4.167458e+25,-7.241799e+21,-1.608103e+26,-6.749736e+25,6.813964e+25,2


**Prediction :**

In [22]:
final_evaluation = grid_search.predict(test_data)

In [23]:
d = {'Id':np.arange(len(final_evaluation)) + 1461,'SalePrice':final_evaluation}
df = pd.DataFrame(data=d)
df

Unnamed: 0,Id,SalePrice
0,1461,2.214269e+19
1,1462,5.630712e+19
2,1463,5.696008e+19
3,1464,5.533491e+19
4,1465,5.082372e+19
...,...,...
1454,2915,3.446998e+19
1455,2916,3.554393e+19
1456,2917,5.394381e+19
1457,2918,3.951578e+19


In [24]:
df.to_csv('out.csv',index=False)