**Importing basic packages**

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

**Importing Data**

In [748]:
 train = pd.read_csv('train.csv')
 test = pd.read_csv('test.csv')
 df = train
#  df = pd.concat([train,test],ignore_index=True)
 df.dtypes.unique()
 df['SalePrice']

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64

**Feature Selection**

In [749]:
corr = df.corr()
cats = []
count = 0 
c=0
print('Shape of numeric features before feature selection:',df.shape)
for i in corr['SalePrice']:
  if(i<0.5):
    cats.append(corr.columns[count])
    c+=1    
  count+=1
df.drop(columns=cats,inplace=True)
print(c)
print('Shape of numeric features after feature selection:',df.shape)


Shape of numeric features before feature selection: (1460, 81)
27
Shape of numeric features after feature selection: (1460, 54)


**Preprocessing Numeric features**

In [750]:
#obtaining numeric features (int64 and float64)
df_float = df.loc[:,df.dtypes == np.float64]
df_int=(df.loc[:,df.dtypes == np.int64])
df_num=pd.concat([df_float,df_int],axis=1,ignore_index=False)

#missing data handling
print(df_num.shape)
nulls = pd.DataFrame(df_num.isnull().sum().sort_values(ascending=False).head())
print("Null count in each column initially: ",nulls)
df_num = df_num.interpolate(method='linear', limit_direction='forward', axis=0)
nulls = pd.DataFrame(df_num.isnull().sum().sort_values(ascending=False).head())
print("Null count in each column after handling missing data: ",nulls)


(1460, 11)
Null count in each column initially:                0
SalePrice     0
GarageArea    0
GarageCars    0
TotRmsAbvGrd  0
FullBath      0
Null count in each column after handling missing data:                0
SalePrice     0
GarageArea    0
GarageCars    0
TotRmsAbvGrd  0
FullBath      0


###**Preprocessing Categorical Features**

**Handling missing data in categorical data**

In [751]:
#obtaining categorical features
df_cat=df.select_dtypes(exclude=[np.int64,np.float64])
df_cat=df_cat.astype('str')

#handling missing data in categorical features
nulls = pd.DataFrame(df_cat.isnull().sum().sort_values(ascending=False).head())
print("Null count in each column initially:",nulls) #clearly there are no null values
len(np.unique(df_cat.values))

Null count in each column initially:                0
SaleCondition  0
Condition2     0
ExterQual      0
MasVnrType     0
Exterior2nd    0


175

**Encoding Categorical data**

In [0]:
# # from sklearn.compose import ColumnTransformer
# # from sklearn.preprocessing import OneHotEncoder
# print(df_cat.shape)
# # for i in range(0,len(df_cat.columns)):
# #   ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(),[i])], remainder='passthrough')
# #   df_cat = pd.DataFrame(ct.fit_transform(df_cat))
# # print(df_cat.columns)

# df_cat = pd.get_dummies(df_cat)
# df_cat.shape

from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
df_cat = df_cat.apply(lambda col: labelencoder.fit_transform(col.astype(str)))

**Splitting Train and Test Data**

In [0]:
#concatenating numerical and categorical features
df_num.drop(columns='SalePrice',inplace=True)
df_total = pd.concat([df_num,df_cat],axis=1,ignore_index=False)

In [754]:
#Splitting as X, y
X = df_total
y = df['SalePrice']

#Splitting train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=5)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(1095, 53) (365, 53) (1095,) (365,)


**Feature Scaling**

In [0]:
y_train= y_train.values.reshape(-1,1)
y_test= y_test.values.reshape(-1,1)

from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)
y_train = sc_X.fit_transform(y_train)
y_test = sc_y.fit_transform(y_test)


##**Training the model using various algortims**

**1. Training using Multi Linear Regression**

In [756]:
from sklearn.linear_model import LinearRegression
regressor1 = LinearRegression()
regressor1.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

**2. Training using Polynomial Regression**

In [0]:
# from sklearn.preprocessing import PolynomialFeatures
# plr = PolynomialFeatures(degree=2)
# X_poly = plr.fit_transform(X_train)
# regressor2 = LinearRegression()
# regressor2.fit(X_poly,y_train)

**3. Training using Decision Tree Regressor**

In [758]:
from sklearn.tree import DecisionTreeRegressor
regressor3 = DecisionTreeRegressor()
regressor3.fit(X_train,y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

**4. Training using Random Forest Regressor**

In [759]:
from sklearn.ensemble import RandomForestRegressor
regressor4 = RandomForestRegressor(n_estimators=10,random_state=1)
regressor4.fit(X_train,y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=None, oob_score=False,
                      random_state=1, verbose=0, warm_start=False)

**5. Training using Support Vector Regressor**

In [760]:
from sklearn.svm import SVR
regressor5 = SVR(kernel='rbf')
regressor5.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

##**Testing above models**

**1. Testing using Multi Linear Regression**

In [0]:
y_pred1 = regressor1.predict(X_test)

**2. Testing using Polynomial Regression**

In [0]:
# y_pred2 = regressor2.predict(plr.fit_transform(X_test))

**3. Testing using Decision Tree Regression**

In [0]:
y_pred3 = regressor3.predict(X_test)

**4. Testing using Random Forest Regression**

In [0]:
y_pred4 = regressor4.predict(X_test)

**5. Testing using Support Vector Regression**

In [0]:
y_pred5 = regressor5.predict(X_test)

##**Evaluating each model using R-squared test**

In [0]:
from sklearn.metrics import r2_score, accuracy_score

**1. Evaluating the Multi Linear Regression**

In [767]:
r2_score(y_test, y_pred1)

0.8287837372976723

**2. Evaluating the Polynomial Regression**

In [0]:
# r2_score(y_test, y_pred2)

**3. Evaluating the Decision Tree Regression**

In [769]:
r2_score(y_test, y_pred3)

0.7904622433120827

**4. Evaluating the Random Forest Regression**

In [770]:
r2_score(y_test, y_pred4)

0.8563583620570294

**5. Evaluating the Support Vector Regression**

In [771]:
r2_score(y_test, y_pred5)

0.7424619956135614

##**Predicting the prices using the Best Model**