In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge, RidgeCV
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import VotingRegressor
from xgboost.sklearn import XGBRegressor, XGBRFRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from collections import Counter
import matplotlib.pyplot as plt


In [2]:
# Locations of the input CSV files, relative to the notebook.
train_path = './data/train.csv'
test_path = './data/test.csv'

# Read the training data and split the regression target off the features.
train_df = pd.read_csv(train_path)
train_target = train_df['SalePrice']
train_df = train_df.drop(columns=['SalePrice'])

# The test file is read as-is; no target column is separated here.
test_df = pd.read_csv(test_path)


In [None]:
# Exploratory view: seaborn's grid of pairwise plots over the columns of train_df.
sns.pairplot(train_df)

In [None]:
# Print the frequency of every distinct value, column by column.
# Series.value_counts(dropna=False) is used rather than collections.Counter:
# iterating a float column yields a fresh `nan` object per missing cell, and
# since nan != nan a Counter reports each missing value as its own key instead
# of one aggregated count.
for col in train_df.columns:
    print(train_df[col].value_counts(dropna=False))
    

In [None]:
# TODO: this exploratory loop was left without a body, which is a SyntaxError
# and stops "Restart & Run All". `pass` keeps the cell valid until the
# per-column logic is written (or the cell is removed).
for col in train_df.columns:
    pass

In [None]:
# Names of the training columns that contain at least one missing value.
print(train_df.columns[train_df.isna().any()])

In [None]:
# Names of the test columns that contain at least one missing value.
print(test_df.columns[test_df.isna().any()])

**All of the columns above appear to have missing values, but the data description shows that, for some features, NA is itself a categorical value rather than a missing entry.
So we need to encode these categorical features first, before handling the values that are genuinely missing.**

In [3]:
def encoding(data):
    """Replace every object-dtype column of ``data`` with integer category codes.

    Each object column is converted through ``pd.Categorical`` and stored as
    its codes. Missing values receive code ``-1`` from pandas; when a column
    contains any, all of its codes are shifted up by one, so missing becomes
    ``0`` and the categories start at ``1``. Columns without missing values
    keep codes starting at ``0``.

    The categories are derived from ``data`` alone, so the same label is not
    guaranteed to get the same code in two different frames.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with its object columns replaced.
    """
    text_columns = [name for name in data.columns if data[name].dtype == 'object']
    for name in text_columns:
        codes = pd.Categorical(data[name]).codes
        # -1 marks a missing value; shift so that no negative code remains.
        if (codes == -1).any():
            codes = codes + 1
        data[name] = codes
    return data

In [4]:
# Integer-encode the object columns of both frames (train first, then test).
train_df, test_df = encoding(train_df), encoding(test_df)

In [None]:
# After encoding, list the columns that still hold missing values:
# training frame first, then the test frame.
for frame in (train_df, test_df):
    print(frame.columns[frame.isna().any()])

In [5]:
# Remove the 'Id' column from the training features.
train_df = train_df.drop(columns=['Id'])

In [None]:
# Correlation of LotFrontage with the other lot-related columns, annotated
# to two decimals.
lot_columns = ['LotFrontage', 'LandContour', 'LotShape', 'LotConfig', 'LandSlope']
lot_corr = train_df[lot_columns].corr()
sns.heatmap(lot_corr, cmap = 'BrBG', fmt = '.2f', annot = True)

In [7]:
def lotFront_nanVal(data):
    """Fill missing ``LotFrontage`` values with the mean of comparable rows.

    Rows are comparable when they share the same ``LotConfig`` and
    ``LotShape`` values. A missing ``LotFrontage`` is replaced by the mean of
    the observed ``LotFrontage`` values in its group. When the group has no
    observed value, the mean of all observed ``LotFrontage`` values is used.

    The lookup is by group rather than by row position, so it works for any
    index, not only the default ``0..n-1`` one.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``LotFrontage``, ``LotConfig`` and
        ``LotShape``. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``LotFrontage`` filled.
    """
    missing = data['LotFrontage'].isnull().to_numpy()
    print("Number of nan values: ", int(missing.sum()))
    if not missing.any():
        return data

    # Computed before any value is filled, so only observed values contribute.
    overall_mean = data['LotFrontage'].mean()
    group_mean = (
        data.groupby(['LotConfig', 'LotShape'])['LotFrontage']
        .transform('mean')
        .to_numpy()
    )

    fill_values = group_mean[missing]
    # A group with no observed value has a NaN mean: fall back to the overall mean.
    fill_values = np.where(np.isnan(fill_values), overall_mean, fill_values)
    data.loc[missing, 'LotFrontage'] = fill_values
    return data

In [8]:
# Fill the missing LotFrontage values; the function prints how many were missing.
train_df = lotFront_nanVal(train_df)

Number of nan values:  259


In [9]:
# Sanity check: number of LotFrontage values still missing.
train_df['LotFrontage'].isna().sum()

0

In [10]:
def MasVnrArea_nanVal(data):
    """Fill missing ``MasVnrArea`` values with the mean for the same ``MasVnrType``.

    A missing ``MasVnrArea`` is replaced by the mean of the observed areas of
    the rows sharing its ``MasVnrType``. When that group has no observed
    value, the mean of all observed ``MasVnrArea`` values is used.

    Only missing values are filled; zero areas are left unchanged. The
    earlier ``nan_index.append(...)`` statement discarded its result, so
    zeros were never part of the fill. It is removed here rather than
    "repaired", because its argument was the full row index and using it
    would have overwritten every row.

    The lookup is by group rather than by row position, so it works for any
    index, not only the default ``0..n-1`` one.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``MasVnrArea`` and ``MasVnrType``. It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``MasVnrArea`` filled.
    """
    missing = data['MasVnrArea'].isnull().to_numpy()
    print("Number of nan values: ", int(missing.sum()))
    if not missing.any():
        return data

    # Computed before any value is filled, so only observed values contribute.
    overall_mean = data['MasVnrArea'].mean()
    type_mean = data.groupby('MasVnrType')['MasVnrArea'].transform('mean').to_numpy()

    fill_values = type_mean[missing]
    # A type with no observed area has a NaN mean: fall back to the overall mean.
    fill_values = np.where(np.isnan(fill_values), overall_mean, fill_values)
    data.loc[missing, 'MasVnrArea'] = fill_values
    return data

In [11]:
# Group-based fill for MasVnrArea first (prints how many values were missing),
# then a median fill for anything still missing in the three numeric columns.
# The columns are independent of one another, so the order of the fills does
# not affect the result.
train_df = MasVnrArea_nanVal(train_df)
for col in ('LotFrontage', 'MasVnrArea', 'GarageYrBlt'):
    train_df[col] = train_df[col].fillna(train_df[col].median())



Number of nan values:  8


In [12]:
# Sanity check: number of MasVnrArea values still missing.
train_df['MasVnrArea'].isna().sum()

0

In [13]:
# Total number of missing cells left anywhere in the training frame.
train_df.isna().sum().sum()

0

In [14]:
# Alias for the target (the SalePrice column separated from the training frame).
y = train_target

In [15]:
# Number of feature columns currently in the training frame.
print(train_df.shape[1])

80


In [None]:
# Fit a 60-component PCA on the training features and project them onto it.
# NOTE(review): the PCA is fitted on all training rows before the
# train/validation split and on unscaled columns — confirm this is intended.
pca = PCA(n_components = 60).fit(train_df)
x = pca.transform(train_df)

In [None]:
# Hold out 12% of the rows for validation, with a fixed seed for a repeatable split.
X_train, X_val, y_train, y_val = train_test_split(
    x, train_target, test_size = 0.12, random_state = 0
)
print(len(y_train), len(y_val))

# Standardise with statistics learned from the training split only, then
# apply the same transform to the validation split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_val = sc.transform(X_val)

In [None]:
# Gradient-boosted trees: this is the model scored below and used for the
# final predictions. The name `vr` is kept because a later cell calls
# `vr.predict`.
vr = XGBRegressor(random_state = 0).fit(X_train, y_train)

# Random-forest flavour of XGBoost, fitted for comparison. `random_state` is
# the seed keyword; the previous `random_forest = 0` is not an XGBoost
# parameter, so no seed was being set.
lr7 = XGBRFRegressor(random_state = 0).fit(X_train, y_train)

# Root-mean-squared error of `vr` on the held-out validation split.
pred = vr.predict(X_val)
print(np.sqrt(mean_squared_error(y_val,pred)))

In [None]:
# Encode any remaining object columns of the test frame, then fill the listed
# numeric columns with their own (test-set) median.
test_df = encoding(test_df)
for col in ('LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'GarageYrBlt'):
    test_df[col] = test_df[col].fillna(test_df[col].median())

# Keep the ids for the submission file; they are not used as a feature.
Ids = test_df['Id']
test_df = test_df.drop(columns=['Id'])


In [None]:
# Total number of missing cells left in the test frame.
print(test_df.isna().sum().sum())

In [None]:
# Test columns that still contain at least one missing value.
print(test_df.columns[test_df.isna().any()])

In [None]:
# Fill the remaining numeric gaps in the test frame with each column's own median.
for col in (
    'TotalBsmtSF',
    'GarageArea',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'BsmtFullBath',
    'BsmtHalfBath',
    'GarageCars',
):
    test_df[col] = test_df[col].fillna(test_df[col].median())

In [None]:
# Apply the same preprocessing the model was trained on: PCA projection
# followed by the fitted StandardScaler. `vr` was fitted on scaled
# components, so predicting on unscaled ones (as before) feeds it features on
# a different scale from training.
# The result goes into a new name so `test_df` is not overwritten and this
# cell can be re-run.
test_features = sc.transform(pca.transform(test_df))

predictions = vr.predict(test_features)

# Submission file: one row per test id with its predicted sale price.
output = pd.DataFrame({'Id': Ids, 'SalePrice': predictions})
output.to_csv('out.csv', index=False)

output.head()