In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
train_data = pd.read_csv('/content/train.csv')
test_data = pd.read_csv('/content/test.csv')

In [3]:
train_data.shape

(1460, 81)

In [4]:
test_data.shape

(1459, 80)

In [11]:
train_data.isnull().sum()/len(train_data)*100

Id                0.000000
MSSubClass        0.000000
MSZoning          0.000000
LotFrontage      17.739726
LotArea           0.000000
                   ...    
MoSold            0.000000
YrSold            0.000000
SaleType          0.000000
SaleCondition     0.000000
SalePrice         0.000000
Length: 81, dtype: float64

In [10]:
test_data.isnull().sum()/len(test_data)*100

Id                0.000000
MSSubClass        0.000000
MSZoning          0.274160
LotFrontage      15.558602
LotArea           0.000000
                   ...    
MiscVal           0.000000
MoSold            0.000000
YrSold            0.000000
SaleType          0.068540
SaleCondition     0.000000
Length: 80, dtype: float64

In [7]:
train_data.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object

In [9]:
train_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,1460.0,730.5,421.610009,1.0,365.75,730.5,1095.25,1460.0
MSSubClass,1460.0,56.89726,42.300571,20.0,20.0,50.0,70.0,190.0
LotFrontage,1201.0,70.049958,24.284752,21.0,59.0,69.0,80.0,313.0
LotArea,1460.0,10516.828082,9981.264932,1300.0,7553.5,9478.5,11601.5,215245.0
OverallQual,1460.0,6.099315,1.382997,1.0,5.0,6.0,7.0,10.0
OverallCond,1460.0,5.575342,1.112799,1.0,5.0,5.0,6.0,9.0
YearBuilt,1460.0,1971.267808,30.202904,1872.0,1954.0,1973.0,2000.0,2010.0
YearRemodAdd,1460.0,1984.865753,20.645407,1950.0,1967.0,1994.0,2004.0,2010.0
MasVnrArea,1452.0,103.685262,181.066207,0.0,0.0,0.0,166.0,1600.0
BsmtFinSF1,1460.0,443.639726,456.098091,0.0,0.0,383.5,712.25,5644.0


In [16]:
def missing_values_table(df):
        # Total missing values
        mis_val = df.isnull().sum()
        
        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        
        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        
        # Print some summary information
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        
        # Return the data frame with missing values
        return mis_val_table_ren_columns

In [19]:
missing_values_table(train_data)

Your selected dataframe has 76 columns.
There are 14 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
LotFrontage,259,17.7
GarageType,81,5.5
GarageYrBlt,81,5.5
GarageFinish,81,5.5
GarageQual,81,5.5
GarageCond,81,5.5
BsmtExposure,38,2.6
BsmtFinType2,38,2.6
BsmtQual,37,2.5
BsmtCond,37,2.5


In [18]:
# drop columns with more than 40% missing values
train_data.drop(['PoolQC','MiscFeature','Alley','Fence','FireplaceQu'], axis = 1, inplace = True)

In [20]:
# drop columns with more than 40% missing values
test_data.drop(['PoolQC','MiscFeature','Alley','Fence','FireplaceQu'], axis = 1, inplace = True)

In [21]:
train_data.dropna(subset=['Electrical','MasVnrArea','MasVnrType'], inplace = True, how = 'any')

In [22]:
missing_values_table(train_data)

Your selected dataframe has 76 columns.
There are 11 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
LotFrontage,257,17.7
GarageType,81,5.6
GarageYrBlt,81,5.6
GarageFinish,81,5.6
GarageQual,81,5.6
GarageCond,81,5.6
BsmtExposure,38,2.6
BsmtFinType2,38,2.6
BsmtQual,37,2.5
BsmtCond,37,2.5


Rest all data need to be imputed

In [23]:
# categorical and numerical
from sklearn.impute import SimpleImputer
from sklearn_pandas import CategoricalImputer

In [33]:
cat_imputer = CategoricalImputer()
mean_imputer = SimpleImputer(strategy = 'mean')

In [27]:
num_cols = train_data.select_dtypes(include = np.number).columns
cat_cols = train_data.select_dtypes(include = np.object).columns
Tnum_cols = test_data.select_dtypes(include = np.number).columns
Tcat_cols = test_data.select_dtypes(include = np.object).columns

In [30]:
print(num_cols,'\n',cat_cols)

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold', 'SalePrice'],
      dtype='object') 
 Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'Heating

In [31]:
num_train = train_data[num_cols]
cat_train = train_data[cat_cols]

In [32]:
num_test = test_data[Tnum_cols]
cat_test = test_data[Tcat_cols]

In [34]:
mean_imputer.fit_transform(num_train)
mean_imputer.fit_transform(num_test)

array([[1.461e+03, 2.000e+01, 8.000e+01, ..., 0.000e+00, 6.000e+00,
        2.010e+03],
       [1.462e+03, 2.000e+01, 8.100e+01, ..., 1.250e+04, 6.000e+00,
        2.010e+03],
       [1.463e+03, 6.000e+01, 7.400e+01, ..., 0.000e+00, 3.000e+00,
        2.010e+03],
       ...,
       [2.917e+03, 2.000e+01, 1.600e+02, ..., 0.000e+00, 9.000e+00,
        2.006e+03],
       [2.918e+03, 8.500e+01, 6.200e+01, ..., 7.000e+02, 7.000e+00,
        2.006e+03],
       [2.919e+03, 6.000e+01, 7.400e+01, ..., 0.000e+00, 1.100e+01,
        2.006e+03]])

In [37]:
test_data['GarageType'] = cat_imputer.fit_transform(test_data['GarageType'])
test_data['GarageYrBlt'] = cat_imputer.fit_transform(test_data['GarageYrBlt'])
test_data['GarageFinish'] = cat_imputer.fit_transform(test_data['GarageFinish'])

test_data['GarageQual'] = cat_imputer.fit_transform(test_data['GarageQual'])

test_data['GarageCond'] = cat_imputer.fit_transform(test_data['GarageCond'])

test_data['BsmtCond'] = cat_imputer.fit_transform(test_data['BsmtCond'])

test_data['BsmtQual'] = cat_imputer.fit_transform(test_data['BsmtQual'])

test_data['BsmtExposure'] = cat_imputer.fit_transform(test_data['BsmtExposure'])

test_data['BsmtFinType1'] = cat_imputer.fit_transform(test_data['BsmtFinType1'])

test_data['BsmtFinType2'] = cat_imputer.fit_transform(test_data['BsmtFinType2'])

test_data['MasVnrType'] = cat_imputer.fit_transform(test_data['MasVnrType'])

test_data['MasVnrArea'] = cat_imputer.fit_transform(test_data['MasVnrArea'])

test_data['MSZoning'] = cat_imputer.fit_transform(test_data['MSZoning'])

test_data['BsmtFullBath'] = cat_imputer.fit_transform(test_data['BsmtFullBath'])
test_data['BsmtHalfBath'] = cat_imputer.fit_transform(test_data['BsmtHalfBath'])
test_data['Utilities'] = cat_imputer.fit_transform(test_data['Utilities'])
test_data['KitchenQual'] = cat_imputer.fit_transform(test_data['KitchenQual'])
test_data['Exterior2nd'] = cat_imputer.fit_transform(test_data['Exterior2nd'])
test_data['Exterior1st'] = cat_imputer.fit_transform(test_data['Exterior1st'])
test_data['SaleType'] = cat_imputer.fit_transform(test_data['SaleType'])



In [38]:
train_data['GarageType'] = cat_imputer.fit_transform(train_data['GarageType'])
train_data['GarageYrBlt'] = cat_imputer.fit_transform(train_data['GarageYrBlt'])
train_data['GarageFinish'] = cat_imputer.fit_transform(train_data['GarageFinish'])

train_data['GarageQual'] = cat_imputer.fit_transform(train_data['GarageQual'])

train_data['GarageCond'] = cat_imputer.fit_transform(train_data['GarageCond'])

train_data['BsmtCond'] = cat_imputer.fit_transform(train_data['BsmtCond'])

train_data['BsmtQual'] = cat_imputer.fit_transform(train_data['BsmtQual'])

train_data['BsmtExposure'] = cat_imputer.fit_transform(train_data['BsmtExposure'])

train_data['BsmtFinType1'] = cat_imputer.fit_transform(train_data['BsmtFinType1'])

train_data['BsmtFinType2'] = cat_imputer.fit_transform(train_data['BsmtFinType2'])


In [54]:
missing_values_table(test_data)

Your selected dataframe has 75 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


In [46]:
missing_values_table(train_data)

Your selected dataframe has 76 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


In [42]:
train_data['LotFrontage'] = mean_imputer.fit_transform(train_data[['LotFrontage']])

In [44]:
test_data['LotFrontage'] = mean_imputer.fit_transform(test_data[['LotFrontage']])

In [47]:
test_data['BsmtFinSF1'] = mean_imputer.fit_transform(test_data[['BsmtFinSF1']])

In [48]:
test_data['BsmtFinSF2'] = mean_imputer.fit_transform(test_data[['BsmtFinSF2']])

In [50]:
test_data['TotalBsmtSF'] = mean_imputer.fit_transform(test_data[['TotalBsmtSF']])

In [49]:
test_data['BsmtUnfSF'] = mean_imputer.fit_transform(test_data[['BsmtUnfSF']])

In [51]:
test_data['GarageCars'] = mean_imputer.fit_transform(test_data[['GarageCars']])

In [52]:
test_data['GarageArea'] = mean_imputer.fit_transform(test_data[['GarageArea']])

In [53]:
test_data['Functional'] = cat_imputer.fit_transform(test_data['Functional'])

In [55]:
train_data.shape

(1451, 76)

In [56]:
combine = pd.concat([train_data, test_data], ignore_index = True)

In [57]:
combine.shape

(2910, 76)

In [58]:
dummy_data = pd.get_dummies(combine)

In [59]:
dummy_data.shape

(2910, 272)

In [60]:
train_data = dummy_data.iloc[:1451]
test_data = dummy_data.iloc[1451:]

In [61]:
test_data = test_data.drop(['SalePrice'], axis = 1)

In [62]:
test_data.shape

(1459, 271)

In [63]:
train_data.shape

(1451, 272)

In [68]:
X_train = train_data.drop(['Id','SalePrice'], axis = 1)
y_train = train_data['SalePrice']

In [69]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 10)
principalcomp = pca.fit_transform(X_train)
principaltraindata = pd.DataFrame(data = principalcomp)

In [65]:
principaltestcomp = pca.fit_transform(test_data)
principaltestdata = pd.DataFrame(data = principaltestcomp)

In [70]:
from xgboost import XGBRegressor
model = XGBRegressor()

In [71]:
model.fit(principaltraindata,y_train)



XGBRegressor()

In [72]:
pred = model.predict(principaltestdata)

In [74]:
test_pred = pd.DataFrame(pred)

In [76]:
test_pred.columns = ['SalePrice']

In [86]:
test_pred.head()

Unnamed: 0,SalePrice,Id
1451,116490.445312,1461
1452,434708.0625,1462
1453,256523.859375,1463
1454,214086.8125,1464
1455,181455.390625,1465


In [83]:
test_pred.index = test_data.index

In [85]:
test_pred['Id'] = test_data['Id']

In [87]:
test_pred.shape

(1459, 2)

In [89]:
test_pred.to_csv('submission.csv', index = False)