# Installing Dependencies

In [None]:
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install seaborn
!pip install xgboost
!pip install pickle
!pip install scipy
!pip install sklearn

# EDA

In [None]:
import math
import csv
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from xgboost import XGBRegressor
from scipy import stats,special
from sklearn.svm import SVR
from sklearn.decomposition import PCA,TruncatedSVD
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Apply seaborn's default plotting theme
sns.set()
# Lift pandas' display limits so every column and every row is shown
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# Load the training data used for the exploratory analysis.
# There are 81 columns: 79 Features + Id + SalePrice
dataset = pd.read_csv("train.csv")
# Summary statistics of the numerical columns
dataset.describe()

In [None]:
# Partition the column names by dtype: "object" columns are treated as
# categorical, everything else as numerical.
_is_object = (dataset.dtypes == "object").values

numerical_features = dataset.columns[~_is_object]
print("Number of Numerical features: ", len(numerical_features))

categorical_features = dataset.columns[_is_object]
print("Number of Categorical features: ", len(categorical_features))

In [None]:
# Show the numerical column names, a separator line, then the categorical ones
separator = "*" * 100
print(dataset.loc[:, numerical_features].columns)
print(separator)
print(dataset.loc[:, categorical_features].columns)

In [None]:
# Per-feature count and percentage of missing values, largest first
null_mask = dataset.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat(
    [total, percent * 100],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data.head(20)

In [None]:
# Bucket every feature that has at least one missing value by its type.
# There are 19 such features: 16 categorical and 3 numerical.
categorical_null = []
numerical_null = []
for feature, n_missing in missing_data['Total'].items():
    if n_missing == 0:
        continue
    bucket = categorical_null if feature in categorical_features else numerical_null
    bucket.append(feature)

In [None]:
# Categorical features with nulls, a separator line, then the numerical ones
print(categorical_null, "*" * 100, numerical_null, sep="\n")

In [None]:
# Distribution of the raw SalePrice (it is not normally distributed)
sns.set_style("white")
sns.set_color_codes(palette='deep')
f, ax = plt.subplots(figsize=(8, 7))
sns.distplot(dataset['SalePrice'], ax=ax)
ax.xaxis.grid(False)
ax.set(
    ylabel="Frequency",
    xlabel="SalePrice(in $)",
    title="SalePrice distribution",
)
sns.despine(trim=True, left=True)
plt.show()

In [None]:
# Scatter plot of every numerical feature against SalePrice to visualise outliers.
# The figure is created bare and axes are added one at a time with plt.subplot:
# plt.subplots(nrows=0) is rejected by current matplotlib (the row count must be
# a positive integer), and the axes it returned were never used anyway.
fig = plt.figure(figsize=(12, 120))
plt.subplots_adjust(right=2, top=2)
for i, feature in enumerate(list(dataset[numerical_features]), 1):
    # Grid of len(numerical_features) rows x 3 columns; panel i is filled row-wise
    plt.subplot(len(list(numerical_features)), 3, i)
    sns.scatterplot(x=feature, y='SalePrice', hue='SalePrice', palette='Blues', data=dataset)

    plt.xlabel('{}'.format(feature), size=15, labelpad=12.5)
    plt.ylabel('SalePrice', size=15, labelpad=12.5)

    plt.tick_params(axis='x', labelsize=12)
    plt.tick_params(axis='y', labelsize=12)

    plt.legend(loc='best', prop={'size': 10})

plt.show()

In [None]:
# Frequency of each distinct value, one categorical feature at a time
divider = '#' * 50
for feature_name in categorical_features:
    counts = dataset[feature_name].value_counts()
    print(counts)
    print(divider)

# Feature Engineering

In [None]:
# Load both the training set and the test set from the working directory

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [None]:
# Id is unique per row, so it is removed from the feature frames;
# the removed columns are kept in train_ID / test_ID.
train_ID = train.pop('Id')
test_ID = test.pop('Id')

In [None]:
# Remove outliers: drop every training row in which at least one numerical
# feature lies below its 1st percentile or above its 99th percentile.
# (Missing values compare as False on both sides, so they never trigger a drop.)
cols = [feature for feature in numerical_features if feature not in ('Id', 'SalePrice')]
Q1 = train[cols].quantile(0.01)
Q99 = train[cols].quantile(0.99)

too_low = train[cols].lt(Q1)
too_high = train[cols].gt(Q99)
is_outlier_row = (too_low | too_high).any(axis=1)

# Keep the remaining rows and renumber them from 0
train = train[~is_outlier_row].reset_index(drop=True)

In [None]:
# Normalise the dependent variable: replace SalePrice with log(1 + SalePrice)
train["SalePrice"] = np.log1p(train["SalePrice"])

# Check the new distribution
sns.set_style("white")
sns.set_color_codes(palette='deep')
f, ax = plt.subplots(figsize=(8, 7))
sns.distplot(train['SalePrice'], ax=ax)
ax.xaxis.grid(False)
ax.set(
    ylabel="Frequency",
    xlabel="SalePrice",
    title="SalePrice distribution",
)
sns.despine(trim=True, left=True)
plt.show()

In [None]:
# Separate the label column from the training features
train_labels = train['SalePrice'].reset_index(drop=True)
train_features = train.drop(columns=['SalePrice'])
test_features = test

# Stack train on top of test so that the feature transformation pipeline
# is applied to the entire dataset in one go
all_features = pd.concat([train_features, test_features], ignore_index=True)
all_features.shape

### Dealing with Missing Values

In [None]:
# Bar chart of the percentage of missing values per feature (ascending),
# restricted to features that actually have missing values
sns.set_style("white")
f, ax = plt.subplots(figsize=(8, 7))
sns.set_color_codes(palette='deep')
missing_pct = all_features.isnull().mean().mul(100).round(2)
missing = missing_pct[missing_pct > 0].sort_values()
missing.plot.bar(color="b", ax=ax)
# Cosmetic tweaks
ax.xaxis.grid(False)
ax.set(
    ylabel="Percent of missing values",
    xlabel="Features",
    title="Percent missing data by feature",
)
sns.despine(trim=True, left=True)

Check for missing attributes

In [None]:
# Impute missing values in the combined train+test feature frame.
# Statement order matters: the generic catch-all fills at the bottom only touch
# whatever the column-specific rules above them left missing.

# Fill these three categorical columns with fixed values
all_features['Functional'] = all_features['Functional'].fillna('Typ')
all_features['Electrical'] = all_features['Electrical'].fillna("SBrkr")
all_features['KitchenQual'] = all_features['KitchenQual'].fillna("TA")

# Replace the missing values in each of the columns below with their mode
all_features['Exterior1st'] = all_features['Exterior1st'].fillna(all_features['Exterior1st'].mode()[0])
all_features['Exterior2nd'] = all_features['Exterior2nd'].fillna(all_features['Exterior2nd'].mode()[0])
all_features['SaleType'] = all_features['SaleType'].fillna(all_features['SaleType'].mode()[0])
# MSZoning is filled with the mode of the rows sharing the same MSSubClass
all_features['MSZoning'] = all_features.groupby('MSSubClass')['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))

# The data description states that NA refers to "No Pool"
all_features["PoolQC"] = all_features["PoolQC"].fillna("NA")
# Replacing the missing values with 0, since no garage = no cars in garage
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    all_features[col] = all_features[col].fillna(0)                              # TODO: check whether 0 is sensible for GarageYrBlt
# Replacing the missing values of the categorical garage features with 'None'
for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:
    all_features[col] = all_features[col].fillna('None')
# NaN values for these categorical basement features mean there's no basement
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    all_features[col] = all_features[col].fillna('None')
# Replacing NaN values of SaleType with 'Oth'.
# NOTE(review): SaleType was already filled with its mode above, so this looks like a no-op - confirm.
all_features['SaleType'] = all_features['SaleType'].fillna('Oth')
# Missing values in these columns are filled with the literal string 'NA'
all_features['FireplaceQu'] = all_features['FireplaceQu'].fillna('NA')
all_features['Fence'] = all_features['Fence'].fillna('NA')
all_features['Alley'] = all_features['Alley'].fillna('NA')
all_features['MiscFeature'] = all_features['MiscFeature'].fillna('NA')

# Group by neighborhood, and fill in missing LotFrontage with the median LotFrontage of the neighborhood
all_features['LotFrontage'] = all_features.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))

# We have no particular intuition around how to fill in the rest of the categorical features
# So we replace their missing values with 'None'
objects = []
for i in all_features.columns:
    if all_features[i].dtype == object:
        objects.append(i)
all_features.update(all_features[objects].fillna('None'))

# And we do the same thing for the remaining numerical features, but this time with 0s
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric = []
for i in all_features.columns:
    if all_features[i].dtype in numeric_dtypes:
        numeric.append(i)
all_features.update(all_features[numeric].fillna(0))

In [None]:
# Verify the imputation: percentage of missing values per column, largest first
data = pd.DataFrame(all_features)
df_cols = list(pd.DataFrame(data))
dict_x = {
    column: round(data[column].isnull().mean()*100,2)
    for column in df_cols
}
df_miss = sorted(dict_x.items(), key=lambda item: item[1], reverse=True)
print('Percent of missing data')
df_miss

In [None]:
# for feature in ['YearBuilt','YearRemodAdd','GarageYrBlt']:
#     all_features[feature] = all_features['YrSold'] - all_features[feature]
#     all_features[feature < 0] = 0

In [None]:
# Skewness of every numerical feature (Id and SalePrice excluded), largest first
skew_candidates = [
    feature for feature in numerical_features
    if feature not in ('Id', 'SalePrice')
]
skew_features = (
    all_features[skew_candidates]
    .apply(lambda column: stats.skew(column))
    .sort_values(ascending=False)
)

# Features whose skew exceeds 0.5 are the ones transformed later on
high_skew = skew_features[skew_features > 0.5]
skew_index = high_skew.index

print("There are {} numerical features with Skew > 0.5 :".format(high_skew.shape[0]))
skewness = pd.DataFrame({'Skew' :high_skew})
skew_features.head(26)

In [None]:
# Reduce the skew of the highly skewed features with a Box-Cox transform of 1 + x:
#   y = ((1+x)**lmbda - 1) / lmbda  if lmbda != 0
#       log(1+x)                    if lmbda == 0
# lmbda is estimated separately for each feature with boxcox_normmax.

for feature_name in skew_index:
    shifted = all_features[feature_name] + 1
    lmbda = stats.boxcox_normmax(shifted)
    all_features[feature_name] = special.boxcox1p(all_features[feature_name], lmbda)

In [None]:
# Print the skew of every non-object column after the transformation
_non_object = (all_features.dtypes != "object").values
num_features = all_features.columns[_non_object]
for col in num_features:
    skew = all_features[col].skew()
    name_cell = '{:15}'.format(col)
    skew_cell = '{:05.2f}'.format(skew)
    print(name_cell, skew_cell)

In [None]:
# One-hot encode the categorical columns, because the models can only work
# with numerical data; the row index is renumbered afterwards.
encoded_features = pd.get_dummies(all_features)
all_features = encoded_features.reset_index(drop=True)
all_features.shape

In [None]:
# Keep only the first occurrence of each column name
is_duplicate_column = all_features.columns.duplicated()
all_features = all_features.loc[:, ~is_duplicate_column]
all_features.shape

In [None]:
# Undo the train/test stacking: the first len(train_labels) rows are the training rows
n_train_rows = len(train_labels)
X_train = all_features.iloc[:n_train_rows]
X_test = all_features.iloc[n_train_rows:]

In [None]:
# Heatmap of the correlations between the numerical features and SalePrice,
# ordered by absolute correlation with SalePrice
heatmap_features = [
    feature for feature in num_features
    if feature not in ('Id', 'SalePrice')
]
data = X_train[heatmap_features].copy().join(train_labels)
nr_feats = data.shape[1]

corr = data.corr()
corr_abs = corr.abs()
cols = corr_abs.nlargest(nr_feats, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(data[cols].values.T)

figure_side = nr_feats/1.5
plt.figure(figsize=(figure_side, figure_side))
sns.set(font_scale=1.25)
sns.heatmap(
    cm,
    linewidths=1.5,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 12},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
plt.show()

Remove correlated features

In [None]:
# Rank all encoded features by absolute correlation with SalePrice and
# split them at the 0.3 threshold
corr = X_train.join(train_labels).corr()
corr_abs = corr.abs()

ser_corr = corr_abs.nlargest(len(X_train.columns), 'SalePrice')['SalePrice']

above_limit = ser_corr.values > 0.3
below_limit = ser_corr.values <= 0.3
cols_abv_corr_limit = list(ser_corr.index[above_limit])
cols_bel_corr_limit = list(ser_corr.index[below_limit])

rule = "*"*30
print(ser_corr.head(30))
print(rule)
print("List of numerical features with r above 0.3 :")
print(cols_abv_corr_limit)
print(rule)
print("List of numerical features with r below 0.3 :")
print(cols_bel_corr_limit)

Variance vs Components Plot

In [None]:
def plot(dim_red):
  """Plot cumulative explained variance against the number of components.

  Parameters:
    dim_red: a fitted dimensionality-reduction object that exposes
      `explained_variance_ratio_` (the notebook passes fitted PCA and
      TruncatedSVD instances).
  """
  plt.plot(np.cumsum(dim_red.explained_variance_ratio_))
  plt.xlabel('number of components')
  plt.ylabel('cumulative explained variance')
  # The original ended with an argument-less plt.plot(), which draws nothing;
  # plt.show() is what actually renders the figure.
  plt.show()

Getting feature sets using different methods (correlation filter, PCA, SVD)

In [None]:
# Correlation-based feature set: keep only the columns whose absolute
# correlation with SalePrice is above the limit, then persist both splits
corr_selected = [f for f in cols_abv_corr_limit if f != "SalePrice"]
X_train_corr = X_train[corr_selected]
X_test_corr = X_test[corr_selected]
X_train_corr.to_csv('x_train_corr_removed.csv', index=False)
X_test_corr.to_csv('x_test_corr_removed.csv', index=False)

In [None]:
# PCA feature set: keep as many components as needed to explain 99.998% of
# the variance; the projection is fitted on the training rows only
pca = PCA(n_components=0.99998, svd_solver='full')
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

train_pca_frame = pd.DataFrame(X_train_pca)
test_pca_frame = pd.DataFrame(X_test_pca)
train_pca_frame.to_csv('x_train_pca_removed.csv', index=False)
test_pca_frame.to_csv('x_test_pca_removed.csv', index=False)
plot(pca)

In [None]:
# SVD feature set: project onto 50 components, fitted on the training rows only.
# random_state is fixed because TruncatedSVD uses a randomized solver by default.
svd = TruncatedSVD(n_components=50, random_state=42)
X_train_svd = svd.fit_transform(X_train)
X_test_svd = svd.transform(X_test)
# Persist the SVD projections (the original wrote the correlation-filtered
# frames into these files by mistake)
pd.DataFrame(X_train_svd).to_csv('x_train_svd_removed.csv', index=False)
pd.DataFrame(X_test_svd).to_csv('x_test_svd_removed.csv', index=False)
plot(svd)

In [None]:
# The training target is the log1p-transformed SalePrice; persist it next to the feature sets
y_train = train_labels
y_train.to_csv('y_train_removed.csv', index=False)

### Time to train

In [None]:
def timeToTrain(model, X, y):
  """Fit `model` on (X, y) and return how long the fit took, in seconds.

  Parameters:
    model: any object with a `fit(X, y)` method; it is fitted in place.
    X: training features, passed to `fit` unchanged.
    y: training targets, passed to `fit` unchanged.

  Returns:
    float: elapsed seconds spent inside `model.fit`.
  """
  # perf_counter is monotonic and high-resolution; time.time() can jump when
  # the system clock is adjusted and so can report wrong (even negative) durations.
  startTime = time.perf_counter()
  model.fit(X, y)
  return time.perf_counter() - startTime

### Metrics

In [None]:
def get_metrics(X_test, y_test, y_pred):
  """Print regression metrics for a set of predictions.

  Parameters:
    X_test: 2-D feature matrix the predictions were made on; only its shape
      (n_samples, n_features) is used, for the adjusted R2.
    y_test: true target values.
    y_pred: predicted target values.

  Prints MSE, RMSE, MAE, R2 and adjusted R2. Returns nothing.
  """
  n_samples, n_features = X_test.shape[0], X_test.shape[1]
  r2 = r2_score(y_test, y_pred)

  # Adjusted R2 divides by (n_samples - n_features - 1). With as many features
  # as samples this is zero (ZeroDivisionError) or negative (meaningless),
  # so report NaN in that case instead of crashing or printing a bogus value.
  degrees_of_freedom = n_samples - n_features - 1
  if degrees_of_freedom > 0:
    adj_r2 = 1 - ((1 - r2) * ((n_samples - 1) / degrees_of_freedom))
  else:
    adj_r2 = float('nan')

  mse = mean_squared_error(y_test, y_pred)

  print("\nMetrics")
  print("MSE =", mse)
  print("RMSE =", math.sqrt(mse))
  print("MAE =", mean_absolute_error(y_test, y_pred))
  print("R2 Score =", r2)
  print("Adjusted R2 Score =", adj_r2)

### Save and Load Models

In [None]:
def saveModel(model, filename):
    """Pickle `model` to the file at `filename`, overwriting any existing file.

    Parameters:
        model: any picklable object.
        filename: path of the file to write.
    """
    # The context manager closes (and flushes) the file even if pickling fails
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)

def loadModel(filename):
    """Load and return the object pickled in the file at `filename`.

    Parameters:
        filename: path of a pickle file.

    Returns:
        The unpickled object.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    # The context manager closes the file even if unpickling fails
    with open(filename, 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    return loaded_model

# Training Different Models

### Ridge Regression

In [None]:
def RidgeRegression(X, y, X__test, fileName):
  """Tune and evaluate a Ridge model, and write a prediction file.

  The data (X, y) is split 80/20; `alpha` is grid-searched with
  cross-validation on the 80% part and the metrics are printed for the
  held-out 20%.

  Parameters:
    X: training features.
    y: training targets (log1p-transformed SalePrice in this notebook).
    X__test: features of the rows in "test.csv", in the same row order.
    fileName: path of the "Id,SalePrice" CSV to write.

  Returns:
    The best estimator found by the grid search.
  """
  ridge = Ridge(max_iter=3000, tol=0.2)
  parameters = {'alpha': np.logspace(-4, 4, 40)}
  clf = GridSearchCV(ridge, parameters)
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  clf.fit(X_train, y_train)

  y_pred = clf.predict(X_test)

  # The target was transformed with np.log1p, so np.expm1 (not np.exp) is its inverse
  y__pred = np.expm1(clf.predict(X__test))

  # Pair every data row of test.csv (first field = Id) with its prediction.
  # Both files are closed by the context manager even if writing fails.
  with open("test.csv", 'r') as file, open(fileName, 'w') as newfile:
    count = 0
    for row in csv.reader(file):
      if row[0] == "Id":
        newfile.write("Id,SalePrice\n")
      else:
        newfile.write(row[0]+","+str(y__pred[count])+"\n")
        count += 1

  print("Best Parameter: ", clf.best_estimator_)
  print('Feature Coefficients: ', clf.best_estimator_.coef_)
  get_metrics(X_test, y_test, y_pred)
  print('Time:', timeToTrain(clf.best_estimator_,X_train,y_train), 'sec')
  print()

  return clf.best_estimator_

# Ridge regression on the correlation-filtered features
ridgeRegressionRemoveModelCorr = RidgeRegression(
    X=X_train_corr,
    y=y_train,
    X__test=X_test_corr,
    fileName='prediction_remove_ridge_corr.csv',
)
saveModel(model=ridgeRegressionRemoveModelCorr, filename='ridgeRegressionRemoveModelCorr')

# Ridge regression on the PCA components
ridgeRegressionRemoveModelPCA = RidgeRegression(
    X=X_train_pca,
    y=y_train,
    X__test=X_test_pca,
    fileName='prediction_remove_ridge_pca.csv',
)
saveModel(model=ridgeRegressionRemoveModelPCA, filename='ridgeRegressionRemoveModelPCA')

# Ridge regression on the SVD components
ridgeRegressionRemoveModelSVD = RidgeRegression(
    X=X_train_svd,
    y=y_train,
    X__test=X_test_svd,
    fileName='prediction_remove_ridge_svd.csv',
)
saveModel(model=ridgeRegressionRemoveModelSVD, filename='ridgeRegressionRemoveModelSVD')

### Lasso Regression

In [None]:
def LassoRegression(X, y, X__test, fileName):
  """Tune and evaluate a Lasso model, and write a prediction file.

  The data (X, y) is split 80/20; `alpha` is grid-searched with
  cross-validation on the 80% part and the metrics are printed for the
  held-out 20%.

  Parameters:
    X: training features.
    y: training targets (log1p-transformed SalePrice in this notebook).
    X__test: features of the rows in "test.csv", in the same row order.
    fileName: path of the "Id,SalePrice" CSV to write.

  Returns:
    The best estimator found by the grid search.
  """
  lasso = Lasso(max_iter= 3000, tol = 0.2)
  parameters = {'alpha': np.logspace(-4, 4, 40)}
  clf = GridSearchCV(lasso, parameters)
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  clf.fit(X_train, y_train)

  y_pred = clf.predict(X_test)

  # The target was transformed with np.log1p, so np.expm1 (not np.exp) is its inverse
  y__pred = np.expm1(clf.predict(X__test))

  # Pair every data row of test.csv (first field = Id) with its prediction.
  # Both files are closed by the context manager even if writing fails.
  with open("test.csv", 'r') as file, open(fileName, 'w') as newfile:
    count = 0
    for row in csv.reader(file):
      if row[0] == "Id":
        newfile.write("Id,SalePrice\n")
      else:
        newfile.write(row[0]+","+str(y__pred[count])+"\n")
        count += 1

  print("Best Parameter: ", clf.best_estimator_)
  print('Feature Coefficients: ', clf.best_estimator_.coef_)
  get_metrics(X_test, y_test, y_pred)
  print('Time:', timeToTrain(clf.best_estimator_,X_train,y_train), 'sec')
  print()

  return clf.best_estimator_

# Lasso regression on the correlation-filtered features
lassoRegressionRemoveModelCorr = LassoRegression(
    X=X_train_corr,
    y=y_train,
    X__test=X_test_corr,
    fileName='prediction_remove_lasso_corr.csv',
)
saveModel(model=lassoRegressionRemoveModelCorr, filename="lassoRegressionRemoveModelCorr")

# Lasso regression on the PCA components
lassoRegressionRemoveModelPCA = LassoRegression(
    X=X_train_pca,
    y=y_train,
    X__test=X_test_pca,
    fileName='prediction_remove_lasso_pca.csv',
)
saveModel(model=lassoRegressionRemoveModelPCA, filename='lassoRegressionRemoveModelPCA')

# Lasso regression on the SVD components
lassoRegressionRemoveModelSVD = LassoRegression(
    X=X_train_svd,
    y=y_train,
    X__test=X_test_svd,
    fileName='prediction_remove_lasso_svd.csv',
)
saveModel(model=lassoRegressionRemoveModelSVD, filename="lassoRegressionRemoveModelSVD")

### Elastic Net Regression

In [None]:
def ElasticNetRegression(X, y, X__test, fileName):
  """Tune and evaluate an ElasticNet model, and write a prediction file.

  The data (X, y) is split 80/20; `alpha` and `l1_ratio` are grid-searched
  with cross-validation on the 80% part and the metrics are printed for the
  held-out 20%.

  Parameters:
    X: training features.
    y: training targets (log1p-transformed SalePrice in this notebook).
    X__test: features of the rows in "test.csv", in the same row order.
    fileName: path of the "Id,SalePrice" CSV to write.

  Returns:
    The best estimator found by the grid search.
  """
  elasticNet = ElasticNet(max_iter=3000, tol=0.2)
  # l1_ratio decides the contribution of the L1 and L2 penalties: 0.1, 0.2, ..., 1.0
  l1_ratioArr = [i * 0.1 for i in range(1, 11)]

  parameters = {'l1_ratio': l1_ratioArr, 'alpha': np.logspace(-4, 4, 40)}

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  # By default cv = 5 and refit = True
  clf = GridSearchCV(estimator = elasticNet, param_grid = parameters)
  clf.fit(X_train, y_train)

  y_pred = clf.predict(X_test)

  # The target was transformed with np.log1p, so np.expm1 (not np.exp) is its inverse
  y__pred = np.expm1(clf.predict(X__test))

  # Pair every data row of test.csv (first field = Id) with its prediction.
  # Both files are closed by the context manager even if writing fails.
  with open("test.csv", 'r') as file, open(fileName, 'w') as newfile:
    count = 0
    for row in csv.reader(file):
      if row[0] == "Id":
        newfile.write("Id,SalePrice\n")
      else:
        newfile.write(row[0]+","+str(y__pred[count])+"\n")
        count += 1

  print("Best Parameter: ", clf.best_estimator_)
  print('Feature Coefficients: ', clf.best_estimator_.coef_)
  get_metrics(X_test, y_test, y_pred)
  print('Time:', timeToTrain(clf.best_estimator_,X_train,y_train), 'sec')
  print()

  return clf.best_estimator_

# ElasticNet regression on the correlation-filtered features
elasticnetRegressionRemoveModelCorr = ElasticNetRegression(
    X=X_train_corr,
    y=y_train,
    X__test=X_test_corr,
    fileName='prediction_remove_elastic_corr.csv',
)
saveModel(model=elasticnetRegressionRemoveModelCorr, filename="elasticnetRegressionRemoveModelCorr")

# ElasticNet regression on the PCA components
elasticnetRegressionRemoveModelPCA = ElasticNetRegression(
    X=X_train_pca,
    y=y_train,
    X__test=X_test_pca,
    fileName='prediction_remove_elastic_pca.csv',
)
saveModel(model=elasticnetRegressionRemoveModelPCA, filename="elasticnetRegressionRemoveModelPCA")

# ElasticNet regression on the SVD components
elasticnetRegressionRemoveModelSVD = ElasticNetRegression(
    X=X_train_svd,
    y=y_train,
    X__test=X_test_svd,
    fileName='prediction_remove_elastic_svd.csv',
)
saveModel(model=elasticnetRegressionRemoveModelSVD, filename="elasticnetRegressionRemoveModelSVD")

### Random Forest Regressor

In [None]:
def RandomForestRegression(X, y, X__test, fileName):
  """Tune and evaluate a random forest, and write a prediction file.

  The data (X, y) is split 80/20; 40 random hyper-parameter combinations
  are cross-validated on the 80% part and the metrics are printed for the
  held-out 20%.

  Parameters:
    X: training features.
    y: training targets (log1p-transformed SalePrice in this notebook).
    X__test: features of the rows in "test.csv", in the same row order.
    fileName: path of the "Id,SalePrice" CSV to write.

  Returns:
    The best estimator found by the randomized search.
  """
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  # Number of trees in random forest
  # [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
  n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

  # Split criteria: squared error (MSE) and absolute error (MAE).
  # The old "mse"/"mae" spellings were removed from scikit-learn.
  criterions = ["squared_error", "absolute_error"]

  # Number of features to consider at every split. 1.0 (= all features) is
  # what the removed 'auto' option meant for regressors.
  max_features = [1.0, 'sqrt']

  # Maximum number of levels in tree (None = unlimited)
  max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
  max_depth.append(None)

  # Minimum number of samples required to split a node
  min_samples_split = [2, 5, 10]

  # Minimum number of samples required at each leaf node
  min_samples_leaf = [1, 2, 4]

  # Method of selecting samples for training each tree
  bootstrap = [True]

  # Create the random grid
  random_grid = {
                'n_estimators': n_estimators,
                'criterion': criterions,
                'max_features': max_features,
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf,
                'bootstrap': bootstrap
                 }

  # Seed the forest as well as the search so that a re-run reproduces the result
  rf = RandomForestRegressor(random_state = 42)
  rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 40, random_state = 42)
  rf_random.fit(X_train, y_train)

  y_pred = rf_random.predict(X_test)

  # The target was transformed with np.log1p, so np.expm1 (not np.exp) is its inverse
  y__pred = np.expm1(rf_random.predict(X__test))

  # Pair every data row of test.csv (first field = Id) with its prediction.
  # Both files are closed by the context manager even if writing fails.
  with open("test.csv", 'r') as file, open(fileName, 'w') as newfile:
    count = 0
    for row in csv.reader(file):
      if row[0] == "Id":
        newfile.write("Id,SalePrice\n")
      else:
        newfile.write(row[0]+","+str(y__pred[count])+"\n")
        count += 1

  print("Best Parameter: ", rf_random.best_estimator_)
  get_metrics(X_test, y_test, y_pred)
  print('Time:', timeToTrain(rf_random.best_estimator_,X_train,y_train), 'sec')
  print()

  return rf_random.best_estimator_

# Random forest regression on the correlation-filtered features
randomForestRemoveModelCorr = RandomForestRegression(
    X=X_train_corr,
    y=y_train,
    X__test=X_test_corr,
    fileName='prediction_remove_rf_corr.csv',
)
saveModel(model=randomForestRemoveModelCorr, filename="randomForestRemoveModelCorr")

# Random forest regression on the PCA components
randomForestRemoveModelPCA = RandomForestRegression(
    X=X_train_pca,
    y=y_train,
    X__test=X_test_pca,
    fileName='prediction_remove_rf_pca.csv',
)
saveModel(model=randomForestRemoveModelPCA, filename="randomForestRemoveModelPCA")

# Random forest regression on the SVD components
randomForestRemoveModelSVD = RandomForestRegression(
    X=X_train_svd,
    y=y_train,
    X__test=X_test_svd,
    fileName='prediction_remove_rf_svd.csv',
)
saveModel(model=randomForestRemoveModelSVD, filename="randomForestRemoveModelSVD")

## Support Vector Machine

In [None]:
def SVMRegression(X, y, X__test, fileName):
  """Tune and evaluate a support vector regressor, and write a prediction file.

  The data (X, y) is split 80/20; `C` is grid-searched with cross-validation
  on the 80% part and the metrics are printed for the held-out 20%.

  Parameters:
    X: training features.
    y: training targets (log1p-transformed SalePrice in this notebook).
    X__test: features of the rows in "test.csv", in the same row order.
    fileName: path of the "Id,SalePrice" CSV to write.

  Returns:
    The best estimator found by the grid search.
  """
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  # Create the parameter grid
  param_grid = {'C': [0.01, 0.05, 0.1, 0.2, 0.5, 0.9, 1, 1.2, 1.5 , 2, 5]}

  clf =  GridSearchCV(estimator = SVR(), param_grid = param_grid, refit = True)
  clf.fit(X_train, y_train)

  y_pred = clf.predict(X_test)

  # The target was transformed with np.log1p, so np.expm1 (not np.exp) is its inverse
  y__pred = np.expm1(clf.predict(X__test))

  # Pair every data row of test.csv (first field = Id) with its prediction.
  # Both files are closed by the context manager even if writing fails.
  with open("test.csv", 'r') as file, open(fileName, 'w') as newfile:
    count = 0
    for row in csv.reader(file):
      if row[0] == "Id":
        newfile.write("Id,SalePrice\n")
      else:
        newfile.write(row[0]+","+str(y__pred[count])+"\n")
        count += 1

  print("Best Parameter: ", clf.best_estimator_)
  get_metrics(X_test, y_test, y_pred)
  # Time the tuned model that is returned (the original timed an untuned
  # default SVR), matching the other model functions in this notebook
  print('Time:', timeToTrain(clf.best_estimator_,X_train,y_train), 'sec')
  print()

  return clf.best_estimator_

# SVM regression on the correlation-filtered features
svmRemoveModelCorr = SVMRegression(
    X=X_train_corr,
    y=y_train,
    X__test=X_test_corr,
    fileName='prediction_remove_svm_corr.csv',
)
saveModel(model=svmRemoveModelCorr, filename="svmRemoveModelCorr")

# SVM regression on the PCA components
svmRemoveModelPCA = SVMRegression(
    X=X_train_pca,
    y=y_train,
    X__test=X_test_pca,
    fileName='prediction_remove_svm_pca.csv',
)
saveModel(model=svmRemoveModelPCA, filename="svmRemoveModelPCA")

# SVM regression on the SVD components
svmRemoveModelSVD = SVMRegression(
    X=X_train_svd,
    y=y_train,
    X__test=X_test_svd,
    fileName='prediction_remove_svm_svd.csv',
)
saveModel(model=svmRemoveModelSVD, filename="svmRemoveModelSVD")

## XGBoost


In [None]:
def XGBoost(X, y, X__test, fileName):
  """Tune and evaluate an XGBoost regressor, and write a prediction file.

  The data (X, y) is split 80/20; learning rate, gamma and max depth are
  grid-searched with cross-validation on the 80% part and the metrics are
  printed for the held-out 20%.

  Parameters:
    X: training features.
    y: training targets (log1p-transformed SalePrice in this notebook).
    X__test: features of the rows in "test.csv", in the same row order.
    fileName: path of the "Id,SalePrice" CSV to write.

  Returns:
    The best estimator found by the grid search.
  """
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  # Create the parameter grid
  param_grid = {'learning_rate': [0.05, 0.1, 0.2, 0.3, 0.4, 0.5],
              'gamma': [10, 5, 1, 0.1, 0.01, 0.001, 0.0001],
              'max_depth': [2,3,4,5,6,7]}

  xgboostRegressor = XGBRegressor(objective = 'reg:squarederror')
  clf = GridSearchCV(estimator = xgboostRegressor, param_grid = param_grid, refit = True)
  clf.fit(X_train, y_train)

  y_pred = clf.predict(X_test)

  # The target was transformed with np.log1p, so np.expm1 (not np.exp) is its inverse
  y__pred = np.expm1(clf.predict(X__test))

  # Pair every data row of test.csv (first field = Id) with its prediction.
  # Both files are closed by the context manager even if writing fails.
  with open("test.csv", 'r') as file, open(fileName, 'w') as newfile:
    count = 0
    for row in csv.reader(file):
      if row[0] == "Id":
        newfile.write("Id,SalePrice\n")
      else:
        newfile.write(row[0]+","+str(y__pred[count])+"\n")
        count += 1

  print("Best Parameter: ", clf.best_estimator_)
  get_metrics(X_test, y_test, y_pred)
  # Time the tuned model that is returned (the original timed the untuned
  # base regressor), matching the other model functions in this notebook
  print('Time:', timeToTrain(clf.best_estimator_,X_train,y_train), 'sec')
  print()

  return clf.best_estimator_

# XGBoost regression on the correlation-filtered features
xgboostRemoveModelCorr = XGBoost(
    X=X_train_corr,
    y=y_train,
    X__test=X_test_corr,
    fileName='prediction_remove_xgBoost_corr.csv',
)
saveModel(model=xgboostRemoveModelCorr, filename="xgboostRemoveModelCorr")

# XGBoost regression on the PCA components
xgboostRemoveModelPCA = XGBoost(
    X=X_train_pca,
    y=y_train,
    X__test=X_test_pca,
    fileName='prediction_remove_xgBoost_pca.csv',
)
saveModel(model=xgboostRemoveModelPCA, filename="xgboostRemoveModelPCA")

# XGBoost regression on the SVD components
xgboostRemoveModelSVD = XGBoost(
    X=X_train_svd,
    y=y_train,
    X__test=X_test_svd,
    fileName='prediction_remove_xgBoost_svd.csv',
)
saveModel(model=xgboostRemoveModelSVD, filename="xgboostRemoveModelSVD")