In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
#warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
# Load the three competition CSV files from the Kaggle input directory:
# the training rows, the test rows and the sample submission file.
df_train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
df_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
sample_sub = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv')


In [None]:
print(df_train.shape)
print(df_test.shape)

In [None]:
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 85)

In [None]:
df = pd.concat([df_train,df_test])
df

In [None]:
# `sns.distplot` is deprecated (and removed in recent seaborn releases);
# a density-scaled histogram with a KDE overlay draws the equivalent figure.
sns.histplot(df['LotFrontage'], kde=True, stat='density');

In [None]:
df = df.set_index("Id")
df

In [None]:
# The training data has SalePrice but the test data does not, which is why SalePrice is empty in the lower rows. We concatenate both because we want to apply feature scaling and feature transformation to them at once.

In [None]:
df.shape

In [None]:
sample_sub

In [None]:
df.info()

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
df.columns

In [None]:
df.shape

In [None]:
df.duplicated().sum()


# **FINDING**
# **INFORMATION** **ABOUT** **MISSING** **VALUES**

In [None]:
df.isnull().sum().sort_values(ascending=False)

In [None]:
# Fraction of missing values per column, computed once and reused below.
null_fraction = df.isnull().mean()

# Columns that contain at least one missing value.
feature_with_na = [feature for feature in df.columns if null_fraction[feature] > 0]

for feature in feature_with_na:
    print(feature, np.round(null_fraction[feature], 4))

In [None]:
feature_with_na

In [None]:
missing_values =df.isnull().sum()
missing_values = missing_values[missing_values >0]
missing_values.sort_values(inplace =True)

In [None]:
missing_values

In [None]:
# Count and share of missing values per column, most incomplete first.
# The share is taken over the actual number of rows instead of a hard-coded
# 2919, so the table stays correct if the input data changes.
total = df.isnull().sum().sort_values(ascending=False)
missing_percent = (df.isnull().sum() / len(df)).sort_values(ascending=False)
missing_data = pd.concat([total, missing_percent], axis=1, keys=['Total', 'Percentage'])
missing_data

In [None]:
missing_values.plot.bar(figsize =(20,6))

In [None]:
df['SalePrice'].describe()

In [None]:
# Plain count histogram of the target; `histplot` replaces the deprecated
# `distplot(..., kde=False)` call.
sns.histplot(df['SalePrice'], bins=20);

In [None]:
sns.kdeplot(df['SalePrice'])

In [None]:
df['SalePrice'].skew()

In [None]:
df['SalePrice'].kurt()

In [None]:
# Correlation of every numeric column with SalePrice. Non-numeric columns are
# dropped explicitly: `DataFrame.corr` raises on string columns in
# pandas >= 2.0 (older versions silently ignored them).
df.select_dtypes(include=[np.number]).corr()['SalePrice']

# separating numerical and categorical features

In [None]:
numeric_features = df.select_dtypes(include= [np.number])
numeric_features

In [None]:
len(numeric_features.columns)

In [None]:
missing_num =numeric_features.isnull().sum()
missing_num = missing_num[missing_num>0]
missing_num

# finding correlation between numerical features

In [None]:
num_corr = numeric_features.corr()
print(num_corr['SalePrice'].sort_values(ascending = False))

In [None]:
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(num_corr,cmap = "coolwarm",linewidths=0.3)

In [None]:
# String-valued (categorical) columns. The `np.object` alias was removed in
# NumPy 1.24, so the dtype is named by its string alias instead.
categorical_features = df.select_dtypes(include=["object"])
categorical_features.columns

In [None]:
missing_cat =categorical_features.isnull().sum()
missing_cat = missing_cat[missing_cat >0]
missing_cat

In [None]:
['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea','TotalBsmtSF','1stFlrSF','TotRmsAbvGrd', 'FullBath', 'YearRemodAdd','YearBuilt']

In [None]:
# Pairwise scatter plots of the target against a hand-picked set of numeric features.
cols = ['SalePrice', 'OverallQual', 'GrLivArea','GarageYrBlt','GarageCars', 'GarageArea','TotalBsmtSF','1stFlrSF','YearRemodAdd', 'FullBath', 'YearBuilt']
# `size` was renamed to `height` in seaborn 0.9 and the old keyword was later removed.
sns.pairplot(df[cols], height=2.5)
plt.show()

In [None]:
yr_feature = [feature for feature in df.columns if 'Yr' in feature or 'yr' in feature or 'Year' in feature or 'year' in feature]
yr_feature

In [None]:
df.groupby('YrSold')['SalePrice'].median().plot()
plt.xlabel('Year Sold')
plt.ylabel('Median House Price')
plt.title("House Price vs YearSold")

In [None]:
df.groupby('YearBuilt')['SalePrice'].median().plot()
plt.xlabel('YearBuilt')
plt.ylabel('Median House Price')
plt.title("House Price vs YearBuilt")

In [None]:
df.groupby('YearRemodAdd')['SalePrice'].median().plot()
plt.xlabel('YearRemodAdd')
plt.ylabel('Median House Price')
plt.title("House Price vs YearRemodAdd")

In [None]:
for feature in yr_feature :
        plt.scatter(df[feature],df['SalePrice'])
        plt.xlabel(feature)
        plt.ylabel('SalePrice')
        plt.show()


In [None]:
# Numeric features with fewer than 30 distinct values, excluding year columns.
# The previous filter chained `'Yr' not in feature or 'yr' not in feature ...`,
# which is true for every name, so year columns such as 'YrSold' slipped through.
discrete_feature = [
    feature
    for feature in numeric_features.columns
    if len(df[feature].unique()) < 30 and feature not in yr_feature
]
discrete_feature

In [None]:
df[discrete_feature]

In [None]:
for feature in discrete_feature :
    df.groupby(feature)['SalePrice'].median().plot.bar()
    plt.xlabel(feature)
    plt.ylabel('SalePrice')
    plt.title(feature)
    plt.show()

In [None]:
continuous_feature = [feature for feature in  numeric_features  if  feature not in  discrete_feature + yr_feature + ['Id']]
continuous_feature

In [None]:
for feature in continuous_feature:
  df[feature].hist(bins=30)
  plt.xlabel(feature)
  plt.ylabel("Count")
  plt.title(feature)
  plt.show()



In [None]:
# Skewness is present in the data, so inspect each continuous feature against
# the target on a log-log scale.
df_copy = df.copy()

# Take the log of the target exactly once. Doing it inside the loop (as before)
# re-applied the log on every iteration and compounded the transform.
df_copy['SalePrice'] = np.log(df_copy['SalePrice'])

for feature in continuous_feature:
    if feature == 'SalePrice':
        # The target is already transformed above.
        continue
    if (df[feature] <= 0).any():
        # log is undefined for zero/negative values (it would yield -inf/NaN).
        continue

    df_copy[feature] = np.log(df_copy[feature])

    plt.scatter(df_copy[feature], df_copy['SalePrice'])
    plt.xlabel(feature)
    plt.ylabel('log(SalePrice)')
    plt.title(feature)
    plt.show()

OUTLIER DETECTION

In [None]:
# Box plots of the log-transformed continuous features, to spot outliers.
df_copy = df.copy()
for feature in continuous_feature:
    if (df[feature] <= 0).any():
        # log is undefined for zero/negative values; skip those features.
        continue

    # Each column is transformed exactly once (the previous loop also re-logged
    # SalePrice on every pass) and the plot is drawn from the transformed copy;
    # previously it was drawn from the raw `df`, so the transform had no effect.
    df_copy[feature] = np.log(df_copy[feature])

    df_copy.boxplot(column=feature)
    plt.ylabel(feature)
    plt.show()


In [None]:
categorical_features.head()

In [None]:
for feature in categorical_features:
  print('Feature is  {}   and the number of categories are {}'.format(feature,len(categorical_features[feature].unique())))

In [None]:
# Median sale price per category, one bar chart per categorical feature.
# The frame is only read here, so the full `df.copy()` that used to run on
# every iteration has been dropped.
for feature in categorical_features:
    groupby_data = df.groupby(feature)['SalePrice']
    groupby_data.median().plot.bar()
    plt.xlabel(feature)
    plt.ylabel('SalePrice')
    plt.title(feature)
    plt.show()

# DROP  FEATURES

In [None]:
missing_feature = missing_percent[missing_percent > 0] * 100
print(missing_feature )
print('no of missing features = ' ,len(missing_feature))
# as per our domain knowledge we can't drop these values because for eg. in 'PoolQC' the nan values indicates that there is no pool which affects the saleprice

In [None]:
categorical_features['PoolQC'].value_counts()

In [None]:
categorical_features['Alley'].value_counts()

In [None]:
categorical_features['Fence'].value_counts()

In [None]:
categorical_features['FireplaceQu'].value_counts()

# as per our domain knowledge we can't drop these values because for eg. in 'PoolQC' the nan values indicates that there is no pool which affects the saleprice

In [None]:
categorical_features['GarageCond'].value_counts()

# Missing Value Imputation

In [None]:
missing_feature.keys().isin(categorical_features)

In [None]:
categorical_nan_feature = categorical_features.isnull().sum()
categorical_nan_feature = categorical_nan_feature [categorical_nan_feature > 0]
print(categorical_nan_feature)
print(len(categorical_nan_feature))

In [None]:
numerical_nan_feature = numeric_features.isnull().sum()
numerical_nan_feature = numerical_nan_feature [numerical_nan_feature > 0]
print(numerical_nan_feature)
print(len(numerical_nan_feature))

Handling Categorical NaN Values

In [None]:
for feature in categorical_features:
  print('Feature is  {}   and the number of categories are {}'.format(feature,len(categorical_features[feature].unique())))

In [None]:
df['MSZoning'].value_counts()

In [None]:
"""sns.countplot(df['MSZoning'])"""

In [None]:
df['Alley'].value_counts()

In [None]:
df['Utilities'].value_counts()

In [None]:
sns.countplot(x ='MSZoning', data = df)

In [None]:
cat_feature=['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
        'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
        'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
        'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
        'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
        'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
        'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
        'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
        'SaleType', 'SaleCondition']
for i in cat_feature:
    sns.countplot(x = i, data = df)
    plt.show()

In [None]:
len(cat_feature)

Handling MSZoning

In [None]:
df_copy = df.copy()

In [None]:
# Fill missing MSZoning with the most frequent zone.
# Assign the result back: the chained `df_copy[col].replace(..., inplace=True)`
# form acts on a temporary Series and is a no-op under pandas Copy-on-Write.
mszoning = df_copy['MSZoning'].mode()[0]
df_copy['MSZoning'] = df_copy['MSZoning'].fillna(mszoning)
df_copy['MSZoning'].isnull().sum()

In [None]:
def countplot(data, new_df, feature):
    """Draw side-by-side count plots of `feature` before and after imputation.

    Parameters
    ----------
    data : DataFrame holding the original (un-imputed) values.
    new_df : DataFrame holding the imputed values.
    feature : name of the categorical column to plot.
    """
    plt.subplot(1, 2, 1)
    # Plot the frame that was passed in; the previous body ignored `data`
    # and always read the global `df`.
    sns.countplot(x=feature, data=data)
    plt.title("old_data")
    plt.subplot(1, 2, 2)
    sns.countplot(x=feature, data=new_df)
    plt.title("new_data")
countplot(df, df_copy, "MSZoning")    # not much difference observed after replacing missing values

Handling Alley

In [None]:
# A missing Alley value is filled with the explicit placeholder category "Na".
# The result is assigned back because chained in-place replacement does not
# reliably modify `df_copy` (it is a no-op under pandas Copy-on-Write).
alley = "Na"
df_copy['Alley'] = df_copy['Alley'].fillna(alley)
df_copy['Alley'].isnull().sum()

In [None]:
countplot(df, df_copy, "Alley")

Handling LotFrontage

In [None]:
"""plt.figure(figsize=(16, 5))
plt.subplot(1, 2, 1)
sns.boxplot(x='LotFrontage', data=df)
plt.subplot(1, 2, 2)
sns.distplot(df['LotFrontage'], bins=50)"""

In [None]:
def creating_box_hist(feature, df, figsize=(10,5)):
    """Show a box plot and a density histogram of `feature` side by side.

    Parameters
    ----------
    feature : name of the numeric column to plot.
    df : DataFrame that contains the column.
    figsize : (width, height) of the figure in inches.
    """
    plt.figure(figsize=figsize)
    plt.subplot(121)
    sns.boxplot(data=df, x=feature)
    plt.subplot(122)
    # `distplot` is deprecated/removed in seaborn; this is its documented replacement.
    sns.histplot(df[feature], kde=True, stat="density")
    plt.show()
creating_box_hist("LotFrontage",df)

In [None]:
# Fill missing LotFrontage with the column mean.
# Assigned back rather than replaced in place through a chained selection,
# which is a no-op under pandas Copy-on-Write.
lotfontage = df["LotFrontage"].mean()
df_copy["LotFrontage"] = df_copy["LotFrontage"].fillna(lotfontage)
df_copy["LotFrontage"].isnull().sum()


In [None]:
def OldNewBoxDist_plot(data, new_df,feature,figsize=(15,7)):
    """Compare a numeric feature before and after imputation.

    Draws a 2x2 grid: box plot and density histogram of `feature` for the
    original frame (top row) and for the imputed frame (bottom row).

    Parameters
    ----------
    data : DataFrame holding the original values.
    new_df : DataFrame holding the imputed values.
    feature : name of the numeric column to plot.
    figsize : (width, height) of the figure in inches.
    """
    plt.figure(figsize=figsize)

    # Top row: original data. The previous body ignored `data` and always
    # read the global `df`.
    plt.subplot(2, 2, 1)
    sns.boxplot(data=data, x=feature)
    plt.title("old_data")
    plt.subplot(2, 2, 2)
    # `histplot` replaces the deprecated `distplot`.
    sns.histplot(data[feature], kde=True, stat="density")
    plt.title("old_data")

    # Bottom row: imputed data.
    plt.subplot(2, 2, 3)
    sns.boxplot(data=new_df, x=feature)
    plt.title("new_data")
    plt.subplot(2, 2, 4)
    sns.histplot(new_df[feature], kde=True, stat="density")
    plt.title("new_data")

OldNewBoxDist_plot(df, df_copy, "LotFrontage")

Handling Utilities

In [None]:
df["Utilities"].value_counts()


In [None]:
df["Utilities"].isnull().sum()

In [None]:
# Fill missing Utilities with the most frequent value. The result is assigned
# back; chained in-place replacement is a no-op under pandas Copy-on-Write.
utilities = df["Utilities"].mode()[0]
df_copy["Utilities"] = df_copy["Utilities"].fillna(utilities)
df_copy["Utilities"].isnull().sum()

Handling Exterior1st and Exterior2nd

In [None]:
df["Exterior1st"].value_counts()

In [None]:
df["Exterior2nd"].value_counts()

In [None]:
print(df["Exterior1st"].isnull().sum())
print(df["Exterior2nd"].isnull().sum())

In [None]:
# Fill the missing exterior-covering values with each column's mode.
# Results are assigned back instead of using chained `replace(..., inplace=True)`,
# which is a no-op under pandas Copy-on-Write.
Exterior1st = df["Exterior1st"].mode()[0]
Exterior2nd = df["Exterior2nd"].mode()[0]
df_copy["Exterior1st"] = df_copy["Exterior1st"].fillna(Exterior1st)
df_copy["Exterior2nd"] = df_copy["Exterior2nd"].fillna(Exterior2nd)
print(df_copy["Exterior1st"].isnull().sum())
print(df_copy["Exterior2nd"].isnull().sum())

Handling MasVnrArea and MasVnrType

In [None]:
df["MasVnrArea"].value_counts()
# numerical data

In [None]:
df["MasVnrType"].value_counts()

In [None]:
df["MasVnrArea"].isnull().sum()

In [None]:
df["MasVnrType"].isnull().sum()

In [None]:
# Density histogram with KDE overlay; `histplot` replaces the deprecated `distplot`.
sns.histplot(df["MasVnrArea"], kde=True, stat="density");

In [None]:
MasVnrArea_mode = df_copy["MasVnrArea"].mode()[0]
MasVnrArea_mode

In [None]:
# Missing veneer area is filled with 0. The result is assigned back; chained
# in-place replacement is a no-op under pandas Copy-on-Write.
MasVnrArea_mode = 0
df_copy["MasVnrArea"] = df_copy["MasVnrArea"].fillna(MasVnrArea_mode)
df_copy["MasVnrArea"].isnull().sum()

In [None]:
df_copy["MasVnrArea"].isnull().sum()

In [None]:

# MasVnrType is categorical, so fill it with a category label rather than the
# integer 0 used for MasVnrArea (that mixed ints into a string column and
# produced a stray "MasVnrType_0" dummy later on).
# NOTE(review): "None" is assumed to be the dataset's label for "no veneer" -
# confirm against data_description.txt.
df_copy["MasVnrType"] = df_copy["MasVnrType"].fillna("None")
df_copy["MasVnrType"].isnull().sum()

Handling  BASEMENT FEATURES


In [None]:
num_bsmt =['BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF','BsmtFullBath', 'BsmtHalfBath']
cat_bsmt = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']

In [None]:
for feature in num_bsmt:
  print(df[feature].isnull().sum())


In [None]:
for feature in cat_bsmt:
  print(df[feature].isnull().sum())

In [None]:
df["BsmtQual"].value_counts()

In [None]:
for feature in cat_bsmt:
  val=df[feature].value_counts()
  print(feature,val,  sep='\n' )


In [None]:
# Missing categorical basement values get the placeholder category "Na".
# Assigned back in one step; the chained per-column `replace(..., inplace=True)`
# is a no-op under pandas Copy-on-Write.
df_copy[cat_bsmt] = df_copy[cat_bsmt].fillna("Na")


In [None]:
for feature in cat_bsmt:
 missing = df_copy[feature].isnull().sum()
 print(missing)

In [None]:
# Missing numeric basement values are filled with 0.
# Assigned back in one step instead of chained in-place replacement, which is
# a no-op under pandas Copy-on-Write.
bsmt_constant = 0
df_copy[num_bsmt] = df_copy[num_bsmt].fillna(bsmt_constant)

In [None]:
for feature in num_bsmt:
 missing_num = df_copy[feature].isnull().sum()
 print(missing_num)

Handling Electrical

In [None]:
df["Electrical"].value_counts()

In [None]:
df["Electrical"].isnull().sum()

In [None]:
# Fill missing Electrical with the most frequent value. The result is assigned
# back; chained in-place replacement is a no-op under pandas Copy-on-Write.
electrical = df_copy["Electrical"].mode()[0]
df_copy["Electrical"] = df_copy["Electrical"].fillna(electrical)
df_copy["Electrical"].isnull().sum()

Handling KitchenQual

In [None]:
df["KitchenQual"].isnull().sum()

In [None]:


df['KitchenQual'].value_counts()

In [None]:
df["KitchenAbvGr"].isnull().sum()

In [None]:

df["KitchenAbvGr"].value_counts()

In [None]:
# KitchenQual is a categorical quality grade, so fill the gap with the most
# frequent grade. The previous fill value was the integer 0, which is not a
# valid category and mixed ints into a string column.
kitchen_qual_mode = df_copy['KitchenQual'].mode()[0]
df_copy['KitchenQual'] = df_copy['KitchenQual'].fillna(kitchen_qual_mode)
df_copy["KitchenQual"].isnull().sum()

In [None]:
"""Functional         2   MODE
   FireplaceQu     1420   NA
   PoolQC          2909   NA
   Fence           2348   NA
   MiscFeature     2814   NA
   SaleType           1   mode  """

In [None]:
df["Fireplaces"].isnull().sum()

In [None]:
df["Functional"].value_counts()

In [None]:
df["Functional"].isnull().sum()

In [None]:
# Fill missing Functional with the most frequent value. The result is assigned
# back; chained in-place replacement is a no-op under pandas Copy-on-Write.
functional = df_copy["Functional"].mode()[0]
df_copy["Functional"] = df_copy["Functional"].fillna(functional)
df_copy["Functional"].isnull().sum()

In [None]:
df["SaleType"].value_counts()

In [None]:
df["SaleType"].isnull().sum()

In [None]:
# Fill missing SaleType with the most frequent value. The mode gets its own
# name instead of overwriting `functional`, which holds the Functional mode.
# Assigned back; chained in-place replacement is a no-op under Copy-on-Write.
sale_type_mode = df_copy["SaleType"].mode()[0]
df_copy["SaleType"] = df_copy["SaleType"].fillna(sale_type_mode)
df_copy["SaleType"].isnull().sum()

In [None]:
df["PoolQC"].value_counts()

In [None]:
other_cat_feature = ['FireplaceQu','PoolQC','Fence','MiscFeature']
for feature in other_cat_feature:
  val=df[feature].value_counts()
  print(feature,val,sep='\n' )

In [None]:
for feature in other_cat_feature:
  missing=df[feature].isnull().sum()
  print(missing )

In [None]:
df["PoolQC"].isnull().sum()

In [None]:
# Missing values in these columns get the placeholder category "Na".
# Assigned back in one step (chained in-place replacement is a no-op under
# pandas Copy-on-Write), and the null check now covers every column in the
# list rather than only the last one visited by the loop.
df_copy[other_cat_feature] = df_copy[other_cat_feature].fillna("Na")
df_copy[other_cat_feature].isnull().sum()

In [None]:
df_copy['Fence'].isnull().sum()

Handling Garage Features

In [None]:
"""GarageType
GarageFinish
GarageQual
GarageCond


GarageYrBlt
GarageCars
GarageArea """

In [None]:
cat_garage_feature = ['GarageType', 'GarageFinish','GarageQual','GarageCond']
num_garage_feature = ['GarageYrBlt','GarageCars','GarageArea']
df_garage = df[cat_garage_feature + num_garage_feature]
df_garage[df_garage.isnull().any(axis=1)]


In [None]:
# Missing categorical garage values get the placeholder category "Na".
# Assigned back in one step (chained in-place replacement is a no-op under
# pandas Copy-on-Write); the null check covers all garage columns, not just
# the last one.
df_copy[cat_garage_feature] = df_copy[cat_garage_feature].fillna("Na")

df_copy[cat_garage_feature].isnull().sum()

In [None]:
df_copy['GarageCars'].isnull().sum()

In [None]:
# Missing numeric garage values are filled with 0.
# The fill value is now actually taken from `num` (it used to be defined but
# unused), the result is assigned back instead of using chained in-place
# replacement, and the null check covers every numeric garage column.
num = 0
df_copy[num_garage_feature] = df_copy[num_garage_feature].fillna(num)

df_copy[num_garage_feature].isnull().sum()

In [None]:
df_copy.isnull().sum()

# Feature Transformation


Converting numerical feature to Categorical feature

In [None]:
num_to_cat_feat = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt','MoSold','YrSold','MSSubClass','MSZoning']
df_copy[num_to_cat_feat].dtypes

In [None]:
# Replace each year column with the number of years between that year and the
# year of sale.
# NOTE(review): the right-hand side reads the raw `df`, not `df_copy`, so for
# GarageYrBlt the zero-fill applied to `df_copy` earlier is overwritten with
# values derived from the un-imputed column (NaN where it was missing) -
# confirm this is intended.
for feature in ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']:
  df_copy[feature] = df['YrSold'] - df[feature]



In [None]:
df_copy.head()

In [None]:
"""MSSubClass( the type of dwelling involved in the sale) and MoSold(Month Sold) are actually categorical features but in data they are represented as numerical features/  we can't treate them as numerical because we can't assign a num to the months beacuse as there are  no proper ordring of months(which one will have higher value and which one has lower)"""

In [None]:
 df_copy["MSSubClass"].value_counts()

In [None]:
import calendar

In [None]:
calendar.month_abbr[6]

In [None]:
df_copy["MoSold"]=df_copy["MoSold"].apply(lambda x :  calendar.month_abbr[x])
df_copy["MoSold"].unique()

In [None]:
"""for i in num_to_cat_feat:
  df_copy[i] = df_copy[i].astype(str)"""


In [None]:
df_copy[num_to_cat_feat].dtypes

In [None]:
df_copy['MSSubClass'] = df_copy['MSSubClass'].astype(str)
df_copy['MSSubClass'].dtypes

Converting categorical columns to numerical column

>>>ordinal encoding

In [None]:
# Categorical columns whose values have a natural order.
# Fixed relative to the previous list: 'PaveDrive' was a typo for the real
# column 'PavedDrive', and 'BsmtFinSF1' (a numeric area column, imputed with 0
# above) did not belong here.
ordinal_features = ['ExterQual', 'ExterCond', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
       'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC', 'Functional', 'GarageFinish', 'PavedDrive', 'Utilities']

In [None]:
from pandas.api.types import CategoricalDtype

In [None]:
df_copy['ExterQual'].unique()

In [None]:
#df_copy['ExterQual']= df_copy['ExterQual'].astype(CategoricalDtype(categories = ['Po','Fa','TA','Gd','Ex'], ordered = True)).cat.codes

In [None]:
#df_copy['ExterQual'].value_counts()

In [None]:
# Ordinal-encode the ordered categorical columns of the imputed frame.
#
# Fixes relative to the previous version:
#   * the codes are written to `df_copy` (the frame that is one-hot encoded and
#     modelled below) instead of the raw `df`, where they were never used;
#   * the "absent" category is spelled "Na", matching the placeholder written by
#     the imputation cells ('NA' matched nothing, so those rows became -1);
#   * BsmtExposure gains its 'No' level and the Utilities labels are spelled
#     'NoSeWa' / 'NoSewr'.
# NOTE(review): the 'No', 'NoSeWa' and 'NoSewr' labels are taken from the
# dataset description - confirm against data_description.txt.
# Any value not listed in a scale is encoded as -1 by `.cat.codes`.
quality_scale = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
quality_scale_na = ['Na'] + quality_scale
bsmt_finish_scale = ['Na', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']

ordinal_categories = {
    'BsmtFinType1': bsmt_finish_scale,
    'BsmtFinType2': bsmt_finish_scale,
    'BsmtQual': quality_scale_na,
    'BsmtExposure': ['Na', 'No', 'Mn', 'Av', 'Gd'],
    'BsmtCond': quality_scale_na,
    'ExterQual': quality_scale,
    'ExterCond': quality_scale,
    'Functional': ['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
    'GarageCond': quality_scale_na,
    'GarageQual': quality_scale_na,
    'GarageFinish': ['Na', 'Unf', 'RFn', 'Fin'],
    'HeatingQC': quality_scale,
    'KitchenQual': quality_scale,
    'PavedDrive': ['N', 'P', 'Y'],
    'Utilities': ['ELO', 'NoSeWa', 'NoSewr', 'AllPub'],
}

for feature, categories in ordinal_categories.items():
    if pd.api.types.is_numeric_dtype(df_copy[feature]):
        # Already encoded (the cell was re-run); encoding again would map
        # every value to -1.
        continue
    df_copy[feature] = df_copy[feature].astype(
        CategoricalDtype(categories=categories, ordered=True)
    ).cat.codes

One Hot Encoding for Nominal Categorical Data

In [None]:
df_copy.info()

In [None]:
df_encd = df_copy.copy()
object_feat = df_encd.select_dtypes(include="object").columns.tolist()
print(len(object_feat))

In [None]:
object_feat

In [None]:
df_encd = pd.get_dummies(df_encd, columns = object_feat,drop_first = True )

In [None]:
df_encd.shape

In [None]:
df_encd.head(5)

In [None]:
df_encd.isnull().sum().sort_values(ascending=False)

In [None]:
df_encd.drop("GarageYrBlt",axis='columns', inplace = True)

In [None]:
df_encd.isnull().sum().sort_values(ascending=False)

In [None]:
df_encd.YrSold

In [None]:
df_encd.drop('YrSold', axis=1, inplace=True)

Split Data

In [None]:
# Split the combined frame back into the labelled training rows and the
# unlabelled submission rows. The boundary is the length of the original
# training file rather than a hard-coded 1460.
n_train = len(df_train)
x_train = df_encd.iloc[:n_train].drop('SalePrice', axis=1)
y_train = df_encd['SalePrice'].iloc[:n_train]
x_test_forsub = df_encd.iloc[n_train:].drop('SalePrice', axis=1)

In [None]:
print("shape of x_train : ", x_train.shape)
print("shape of y_train : ",  y_train.shape)
print("shape of x_test_forsub : ",x_test_forsub.shape)

Log Transformation

In [None]:
import numpy as np
# Columns of df_encd to replace with their natural logarithm.
# NOTE(review): x_train, y_train and x_test_forsub were already sliced out of
# df_encd in the "Split Data" cell, and the assignment below replaces columns
# on df_encd only, so this transform does not appear to reach the data the
# models are trained on - confirm whether it should run before the split.
num_features=['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea', 'SalePrice']

for feature in num_features:
    df_encd[feature]=np.log(df_encd[feature])

In [None]:
df_encd.head()

In [None]:
df_encd.info()

In [None]:
df_encd.isnull().sum()

## FEATURE SCALING

In [None]:
# Scale every predictor to [0, 1]. The scaler is fitted on the training rows
# only and then applied to both the training and the submission features.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(x_train)

# The submission features must go through the same scaler as the training
# data; previously they were left unscaled, so the fitted models were asked
# to predict on inputs from a different scale. The result is kept as a
# DataFrame because later cells read its `.index`, `.iloc` and `.values`.
x_test_forsub = pd.DataFrame(
    scaler.transform(x_test_forsub),
    index=x_test_forsub.index,
    columns=x_test_forsub.columns,
)

# From here on x_train is a NumPy array.
x_train = scaler.transform(x_train)


In [None]:
# Columns to scale: every column of df_encd except one named 'Id'.
feature_scale=[feature for feature in df_encd.columns if feature not in ['Id']]

from sklearn.preprocessing import MinMaxScaler
# NOTE(review): this rebinds `scaler`, replacing the scaler that was fitted on
# x_train in the previous cell.
scaler=MinMaxScaler()
# Fitted on df_encd itself, i.e. on the training and submission rows together.
scaler.fit(df_encd[feature_scale])

# NOTE(review): `transform` returns a NumPy array, so df_encd stops being a
# DataFrame here, and the only later use of df_encd in this notebook is a
# display cell - confirm whether this cell is still needed.
df_encd = scaler.transform(df_encd[feature_scale])


In [None]:
df_encd

In [None]:
x_train

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test,Y_train, Y_test = train_test_split(x_train,y_train ,
                                   random_state=1,
                                   test_size=0.2,
                                   shuffle=True)

In [None]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
# Fit on the training fold only. Fitting on all of x_train (as before) meant
# the model had already seen the X_test rows it is scored on below, which
# inflates the reported R^2.
lr.fit(X_train, Y_train)

In [None]:
from sklearn.metrics import  r2_score

In [None]:
Y_pred = lr.predict(X_test)

In [None]:
r2_score(Y_test,Y_pred)

In [None]:
from sklearn.linear_model import SGDRegressor
sgdr = SGDRegressor(max_iter=1000,tol=1e-3, random_state=5)
# Fit on the training fold only so the score on X_test is a genuine
# hold-out estimate (X_test is a subset of x_train).
sgdr.fit(X_train, Y_train)


In [None]:
y_pred_sgdr = sgdr.predict(X_test)

In [None]:
r2_score(Y_test,y_pred_sgdr)

In [None]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

In [None]:
lr = LinearRegression()
lasso = Lasso()
dstr = DecisionTreeRegressor()
rfr = RandomForestRegressor()
sgdr = SGDRegressor()
gbr = GradientBoostingRegressor(n_estimators=1800, loss='squared_error', subsample = 0.35, learning_rate = 0.05,random_state=1)
xgbr = XGBRegressor()

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import make_scorer, r2_score

def test_model(model, x_train,y_train):
    """Cross-validate `model` and return its mean R^2 as a one-element list.

    Uses 8 shuffled folds with a fixed random state, so repeated calls use
    the same splits.
    """
    folds = KFold(n_splits=8, shuffle=True, random_state=45)
    fold_scores = cross_val_score(
        model, x_train, y_train, cv=folds, scoring=make_scorer(r2_score)
    )
    return [fold_scores.mean()]

In [None]:
models = {"a":["LinearRegression",lr],
          "b":["Lasso",lasso],
          "c":["DecisionTreeRegressor",dstr],
          "d":["SGDRegressor",sgdr],
          "e":["GradientBoostingRegressor",gbr],
          "f":["XGBRegressor",xgbr],
         "g":["RandomForestRegressor",rfr]}

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import make_scorer, r2_score

def test_model(model, X_train=x_train, Y_train=y_train):
    """Cross-validate `model` and return its mean R^2 as a one-element list.

    Parameters
    ----------
    model : unfitted scikit-learn estimator.
    X_train : feature matrix to cross-validate on (defaults to the x_train
        that exists when this function is defined).
    Y_train : target values aligned with `X_train`.

    Returns
    -------
    list with a single float: the mean R^2 over 3 shuffled folds.
    """
    cv = KFold(n_splits=3, shuffle=True, random_state=45)
    r2 = make_scorer(r2_score)
    # Use the arguments that were passed in; the previous body ignored them
    # and always read the global x_train / y_train.
    r2_val_score = cross_val_score(model, X_train, Y_train, cv=cv, scoring=r2)
    score = [r2_val_score.mean()]
    return score

In [None]:
# Cross-validate every candidate model and collect [name, mean R^2] pairs.
models_score = []
for model in models:
    name, estimator = models[model]
    print("Training model:", name)
    score = test_model(estimator, x_train, y_train)
    print("score of model :", score)
    models_score.append([name, score[0]])

In [None]:
gbr = GradientBoostingRegressor(n_estimators=1800, loss='squared_error', subsample = 0.35, learning_rate = 0.05,random_state=1)
# Evaluate on rows the model has not seen: fit on the training fold and
# predict the hold-out fold. Fitting on all of x_train first (as before)
# leaked the X_test rows into training and inflated the reported R^2.
gbr.fit(X_train, Y_train)
y_pred_gbr = gbr.predict(X_test)
# Refit on every labelled row, because the submission cell predicts with `gbr`.
gbr.fit(x_train, y_train)

In [None]:
r2_score(Y_test,y_pred_gbr)

In [None]:
dtr = DecisionTreeRegressor(random_state=1)
# Fit on the training fold only; X_test is a subset of x_train, so fitting on
# x_train would score the tree on rows it has memorised.
dtr.fit(X_train, Y_train)
y_pred_dtr = dtr.predict(X_test)
r2_score(Y_test,y_pred_dtr)

In [None]:
rfr = RandomForestRegressor(random_state=1)
# Fit on the training fold only so the hold-out score is not inflated by
# rows the forest has already seen.
rfr.fit(X_train, Y_train)
y_pred_rfr = rfr.predict(X_test)
# Refit on every labelled row: a later cell predicts the submission rows with `rfr`.
rfr.fit(x_train, y_train)
r2_score(Y_test,y_pred_rfr)

In [None]:
x_test_forsub

In [None]:
df.index = pd.Index(range(len(df)))

In [None]:
x_test_forsub_copy = x_test_forsub.copy()

In [None]:
x_test_forsub_copy.index = pd.Index(range(len(x_test_forsub)))

In [None]:
x_test_forsub_copy['ID'] = x_test_forsub.index

In [None]:
x_test_forsub_copy

In [None]:
x_test_forsub

In [None]:
x_test_array = x_test_forsub.values
x_test_array

In [None]:
test_predictions = rfr.predict(x_test_array)

In [None]:
x_test_forsub.iloc[:, 0]

In [None]:
# Predict the submission rows with the gradient-boosting model and write the
# two-column (Id, SalePrice) submission file.
test_predictions = gbr.predict(x_test_array)
pred = pd.DataFrame(test_predictions)
datasets = pd.DataFrame({
    'Id': x_test_forsub_copy['ID'],
    'SalePrice': pred[0],
})
datasets.to_csv('Sample_Submission.csv', index=False)

In [None]:
datasets