In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
pd.pandas.set_option('Display.max_columns',None)
pd.pandas.set_option('Display.max_rows',None)

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train_Id = pd.DataFrame(train.Id)
test_Id = pd.DataFrame(test.Id )
print(train.shape)
print(test.shape)

(1460, 81)
(1459, 80)


### All Numerical and Categorical Features

In [4]:
# train
numerical_features_train = [feature for feature in train.columns if train[feature].dtype != 'O']
year_feature_train = [feature for feature in numerical_features_train if 'Yr' in feature or "Year" in feature]
discrete_feature_train = [feature for feature in numerical_features_train if len(train[feature].unique()) <=25 
                    and feature not in year_feature_train]
continuous_feature_train = [feature for feature in numerical_features_train 
                      if feature not in discrete_feature_train and feature not in year_feature_train] 
categorical_feature_train = [feature for feature in train.columns if train[feature].dtype == 'O']

In [5]:
#test
numerical_features_test = [feature for feature in test.columns if test[feature].dtype != 'O']
year_feature_test = [feature for feature in numerical_features_test if 'Yr' in feature or "Year" in feature]
discrete_feature_test = [feature for feature in numerical_features_test if len(test[feature].unique()) <=25 
                    and feature not in year_feature_test]
continuous_feature_test = [feature for feature in numerical_features_test 
                      if feature not in discrete_feature_test and feature not in year_feature_test] 
categorical_feature_test = [feature for feature in test.columns if test[feature].dtype == 'O']

### Missing Values in Year Feature

In [6]:
# converting the year to ages
for feature in year_feature_train:
    if feature != 'YrSold':
        train[feature] = train['YrSold'] - train[feature]

# fillng GarageYrBlt value with median
train['GarageYrBlt'] = train['GarageYrBlt'].fillna(train.GarageYrBlt.median())

# droping the YrSold feature
train = train.drop('YrSold',axis=1)

# new list after removing YrSold
year_feature_train_new = year_feature_train[:-1]

In [7]:
for feature in year_feature_test:
    if feature != 'YrSold':
        test[feature] = test['YrSold'] - test[feature]
        
# fillng GarageYrBlt value with median
test['GarageYrBlt'] = test['GarageYrBlt'].fillna(test.GarageYrBlt.median())

# droping the YrSold feature
test = test.drop('YrSold',axis=1)

# new list after removing YrSold
year_feature_test_new = year_feature_test[:-1]

### Missing Value in Discrete Feature

In [8]:
# there are no missing value in Discrete Feature of train dataset 

In [9]:
# test discrete feature nan
discrete_feature_test_nan = [feature for feature in discrete_feature_test if test[feature].isnull().sum() >=1 ]

# filling nan with median
for feature in discrete_feature_test_nan:
    test[feature] = test[feature].fillna(test[feature].median())

### Missing Value in Continuous Feature

In [10]:
# train
# only missing value in one feature LotFrontage
train['LotFrontage_Nan'] = np.where(train['LotFrontage'].isnull(),1,0)
train['LotFrontage'] = train['LotFrontage'].fillna(train.LotFrontage.median())

In [11]:
# test
continuous_feature_test_nan = [feature for feature in continuous_feature_test if test[feature].isnull().sum() >=1 ]

for feature in continuous_feature_test_nan:
    # capturing the importance of Nan value
    # creating the new feature which contain the Nan value
    test[feature+'_Nan'] = np.where(test[feature].isnull(),1,0)
    
    # replacing with median
    test[feature] = test[feature].fillna(test[feature].median())

### Missing value in Categorical Features

In [12]:
# train
categorical_feature_train_nan = [feature for feature in categorical_feature_train if train[feature].isnull().sum() >=1 ]

for feature in categorical_feature_train_nan:
    # replacing value with new label
    train[feature] = train[feature].fillna('Missing')

In [13]:
# test
categorical_feature_test_nan = [feature for feature in categorical_feature_test if test[feature].isnull().sum() >=1 ]

for feature in categorical_feature_test_nan:
    # replacing value with new label
    test[feature] = test[feature].fillna('Missing')

## Continuous Features Log Transformation

In [14]:
# train
for feature in continuous_feature_train:
    train[feature] = np.log1p(train[feature])

In [15]:
# test
for feature in continuous_feature_test:
    test[feature] = np.log1p(test[feature])

## Handling Rare Categorical Feature
We will remove categorical variables that are present less than 1% of the observations

In [16]:
# train
for feature in categorical_feature_train:
    temp = train[feature].value_counts()/len(train)
    temp_df = temp[temp > 0.01].index
    train[feature] = np.where(train[feature].isin(temp_df),train[feature],'Rare_var')
    
# test
for feature in categorical_feature_test:
    temp = test[feature].value_counts()/len(test)
    temp_df = temp[temp > 0.01].index
    test[feature] = np.where(test[feature].isin(temp_df),test[feature],'Rare_var')
    

## Conerting Categorical to Numerical Variable
1. One Hot Encoding for which have less then 6 categories  (but not used)
2. Frequency Label Encoding for which has More then 6 categories

In [17]:
#train
# label encoding for other features based on frequentness
for feature in categorical_feature_train:
    label_ordered = train[feature].value_counts().index
    label_ordered = {k:i for i,k in enumerate(label_ordered,0)}
    train[feature] = train[feature].map(label_ordered)

In [18]:
# test
# label encoding for other features based on frequentness
for feature in categorical_feature_test:
    label_ordered = test[feature].value_counts().index
    label_ordered = {k:i for i,k in enumerate(label_ordered,0)}
    test[feature] = test[feature].map(label_ordered)

In [19]:
train.shape

(1460, 81)

In [20]:
test.shape

(1459, 86)

## Feature Scaling
1. Standardization(StandardScaler) : follows Standard Normal Distribution where mean = 0 and std = 1
2. Normalization(MinMAxScaler) : convert values in range 0 to 1.

In [21]:
# train
scaled_feature = [feature for feature in train.columns if feature not in ['SalePrice']] 
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(train[scaled_feature])

# transform the train and test set, and add on the Id and SalePrice variables
train = pd.concat([train[['SalePrice']].reset_index(drop=True),
                    pd.DataFrame(scaler.transform(train[scaled_feature]), columns=scaled_feature)],
                    axis=1)

## got some missing value here
train['MasVnrArea'] = train['MasVnrArea'].fillna(train.MasVnrArea.median())
train.to_csv('processed_train')

In [22]:
# test 
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(test)

# transform the train and test set, and add on the Id and SalePrice variables
test = pd.DataFrame(scaler.transform(test), columns=test.columns)

test.to_csv('processed_test')

## Apply Feature Selection
1. first, I specify the Lasso Regression model, and I
2. select a suitable alpha (equivalent of penalty).
3. The bigger the alpha the less features that will be selected. 4 Then I use the selectFromModel object from sklearn, which will select the features which coefficients are non-zero

In [23]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [24]:
X_train_main = train.drop(['SalePrice'],axis=1)
X_test_main = test
y_train_main = train['SalePrice']

In [25]:
y_train_main.head()

0    12.247699
1    12.109016
2    12.317171
3    11.849405
4    12.429220
Name: SalePrice, dtype: float64

In [26]:
feature_Sel_model = SelectFromModel(Lasso(alpha=0.005, random_state=0))
feature_Sel_model.fit(X_train_main,y_train_main)

SelectFromModel(estimator=Lasso(alpha=0.005, random_state=0))

In [27]:
feature_Sel_model.get_support()

array([False,  True,  True, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
       False,  True,  True, False, False, False, False, False,  True,
       False, False, False, False, False, False, False,  True, False,
       False, False, False, False,  True,  True, False,  True, False,
       False,  True, False, False,  True, False, False, False,  True,
       False, False,  True, False, False, False, False,  True, False,
       False, False, False,  True,  True, False, False, False, False,
       False, False, False, False, False, False, False, False])

In [28]:
# this is how we can make a list of the selected features
selected_features = X_train_main.columns[(feature_Sel_model.get_support())]

print('total features: {}'.format((X_train_main.shape[1])))
print('selected features: {}'.format(len(selected_features)))
print('features with coefficients shrank to zero: {}'.format(
    np.sum(feature_Sel_model.estimator_.coef_ == 0)))

total features: 80
selected features: 18
features with coefficients shrank to zero: 62


In [29]:
X_train_main = X_train_main[selected_features]
X_test_main = X_test_main[selected_features]

## Model Building
1. Linear Regression
2. Ridge Regression 
3. Lasso Regression 
4. Decision Tree Regression 
5. Random Forest
6. KNN Model 
7. Support Vector Machines (SVM)

## Train test split

In [30]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_train_main,y_train_main,test_size=0.2)

In [31]:
from sklearn.metrics import r2_score

In [44]:
from sklearn.svm import SVR
svr = SVR(kernel='rbf')
svr.fit(X_train,y_train)
print(f"Train Score :{svr.score(X_train,y_train)}")
y_pred = svr.predict(X_test)
print(f"R2 score : {r2_score(y_test,y_pred)}")

Train Score :0.9307244153719043
R2 score : 0.8514957449600788


In [33]:
from sklearn.linear_model import Ridge

ridge = Ridge()
ridge.fit(X_train,y_train)
print(f"Train Score :{ridge.score(X_train,y_train)}")
y_pred = ridge.predict(X_test)
print(f"R2 score : {r2_score(y_test,y_pred)}")

Train Score :0.8775523377429749
R2 score : 0.8564952903702293


In [62]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
rfr.fit(X_train,y_train)
print(f"Train Score :{rfr.score(X_train,y_train)}")
y_pred = rfr.predict(X_test)
print(f"R2 score : {r2_score(y_test,y_pred)}")

Train Score :0.9810434746974479
R2 score : 0.8521392457333195


In [36]:
from xgboost import XGBRegressor
xgbr = XGBRegressor()
xgbr.fit(X_train,y_train)
print(f"Train Score :{xgbr.score(X_train,y_train)}")
y_pred = xgbr.predict(X_test)
print(f"R2 score : {r2_score(y_test,y_pred)}")

Train Score :0.9986238592708129
R2 score : 0.840065896141658


In [37]:
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train,y_train)
print(f"Train Score :{dt.score(X_train,y_train)}")
y_pred = dt.predict(X_test)
print(f"R2 score : {r2_score(y_test,y_pred)}")

Train Score :0.9999764189533585
R2 score : 0.7143816151116931


### selected model Xgboost and RandomForest

In [63]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
rfr.fit(X_train_main,y_train_main)
y_predicted = rfr.predict(X_test_main)

In [64]:
pred = pd.DataFrame(y_predicted)
sub_df = pd.read_csv('sample_submission.csv')
datasets = pd.concat([sub_df['Id'],pred],axis=1)
datasets.columns=['Id','SalePrice']
datasets.to_csv('house3_submission_RFR.csv',index=False)

In [65]:
from xgboost import XGBRegressor
xgbr = XGBRegressor()
xgbr.fit(X_train_main,y_train_main)
y_predicted = xgbr.predict(X_test_main)

In [66]:
pred = pd.DataFrame(y_predicted)
sub_df = pd.read_csv('sample_submission.csv')
datasets = pd.concat([sub_df['Id'],pred],axis=1)
datasets.columns=['Id','SalePrice']
datasets.to_csv('house3_submission_XGBR.csv',index=False)