In [None]:
# Suppress unnecessary warnings so that the presentation looks clean

import warnings
warnings.filterwarnings('ignore')

# https://www.kaggle.com/sharmasanthosh/exploratory-study-on-ml-algorithms

In [None]:
# Importing the libraries

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Load the training and test CSVs from the working directory.
dataset_train = pd.read_csv("train.csv")
dataset_test = pd.read_csv("test.csv")

# Keep the test ids aside. pop() returns the 'id' column and removes it from
# dataset_test in the same step, so the frame is left with feature columns only.
ID_test = dataset_test.pop('id')

# Show the first five training rows to get a feel for the data.
print(dataset_train.head(5))

# Learning: cat1 to cat116 contain letter codes

In [None]:
# Size of the training dataframe, printed as a (rows, columns) tuple

print(dataset_train.shape) 

# We can see that there are 188318 instances/observations having 132 attributes

In [None]:
# The 'id' column of the training set only holds serial numbers and is not
# useful for prediction, so it is removed here.
#
# The column is dropped by name (tolerating its absence) rather than by
# position: a positional slice strips one more column every time the cell is
# re-executed, silently shifting every later column index.
dataset_train = dataset_train.drop('id', axis=1, errors='ignore')

# Statistical description of the numeric columns

print(dataset_train.describe())

# Learning :
# No attribute in continuous columns is missing as count is 188318 for all, all rows can be used
# No negative values are present. Tests such as chi2 can be used.
# Statistics not displayed for categorical data.

In [None]:
# Skewness of the distribution of each numeric column.
#
# numeric_only=True is passed explicitly: the frame still holds the string
# valued cat* columns, and skew() only makes sense for (and, on current pandas,
# only works on) numeric data.

print(dataset_train.skew(numeric_only=True))

# Values close to 0 show less skew.
# loss shows the highest skew. Let us visualize it.

In [None]:
# We will visualize all the continuous attributes using Violin Plot - a combination of box and density plots

# Column position at which the continuous features start; the columns before
# it are treated as categorical throughout the notebook.
split = 116

# Number of columns considered from that position onward (used as the loop
# bound when scanning the correlation matrix).
size = 15

# Dataframe holding only the columns from position `split` to the end.
# NOTE(review): this is a positional slice of dataset_train; whether it shares
# memory with dataset_train depends on the pandas version — confirm before
# relying on `data` being unaffected by later edits to dataset_train.
data = dataset_train.iloc[:, split:]
print(data.head(5))


In [None]:
# Column names of the continuous-feature frame
cols = data.columns

# Violin plots are drawn two per figure, seven figures in total (7x2 grid)
n_cols = 2
n_rows = 7

for i in range(n_rows):
    # One figure per grid row; `ax` holds that row's n_cols axes
    fg, ax = plt.subplots(nrows=1, ncols=n_cols, figsize=(12, 8))
    for j, axis in enumerate(ax):
        # Column for grid position (i, j), counted row by row
        sns.violinplot(y=cols[i * n_cols + j], data=dataset_train, ax=axis)


# Learning:
# cont1 has many values close to 0.5
# cont2 has a pattern where there are several spikes at specific points
# cont5 has many values near 0.3
# cont14 has a distinct pattern. 0.22 and 0.82 have a lot of concentration
# loss distribution must be converted to normal

In [None]:
# DATA TRANSFORMATION 
# ---> Skew Correction
#
# The 'loss' column is replaced by log(1 + loss). The models below are trained
# on this transformed target and their predictions are mapped back with expm1.
# NOTE: the column is overwritten in place, so running this cell a second time
# applies the transform twice.

#log1p function applies log(1+x) to all elements of the column
dataset_train["loss"] = np.log1p(dataset_train["loss"])
#visualize the transformed column
sns.violinplot(data=dataset_train,y="loss")  
plt.show()

#Plot shows that skew is corrected to a large extent
print(dataset_train["loss"].head(5))

In [None]:
# DATA INTERACTION
# ---> Correlation
#
# Correlation describes the relation between two attributes. It needs
# continuous data, so the categorical columns are left out and only the
# `data` frame is used.

# Pearson coefficient for every pair of columns in `data`
data_corr = data.corr()
print(data_corr)

# A pair counts as highly correlated when its coefficient reaches this value
# in either direction
threshold = 0.5

print("##########################")
print(data_corr.iloc[0,1])

# Walk the upper triangle of the matrix (j > i) over the first `size` columns
# and keep [coefficient, row index, column index] for every pair that is
# strongly positive (but below 1) or strongly negative.
corr_list = [
    [data_corr.iloc[i, j], i, j]
    for i in range(size)
    for j in range(i + 1, size)
    if (threshold <= data_corr.iloc[i, j] < 1) or data_corr.iloc[i, j] <= -threshold
]

# Strongest pairs first, ranked by absolute coefficient
# (see https://docs.python.org/3/howto/sorting.html for key functions)
s_corr_list = sorted(corr_list, key=lambda entry: abs(entry[0]), reverse=True)

print("##########################")

# Report each retained pair by column name together with its coefficient
for v, i, j in s_corr_list:
    print("%s and %s = %.2f" % (cols[i],cols[j],v))

# LEARNING

# The pairs printed above are strongly correlated.
# This represents an opportunity to reduce the feature set through transformations such as PCA


In [None]:
# Scatter plot of all the highly correlated pairs
for v, i, j in s_corr_list:
    # `height` is the current name of the per-facet size argument (seaborn
    # renamed it from `size`); the variable names are passed as lists, which
    # is the documented form for x_vars / y_vars.
    sns.pairplot(dataset_train, height=6, x_vars=[cols[i]], y_vars=[cols[j]])
    # plt.show has to be *called* to render the figure; a bare `plt.show`
    # only references the function and does nothing.
    plt.show()

# cont11 and cont12 give an almost perfect linear pattern
# cont1 and cont9  give an almost perfect linear pattern
# cont6 and cont10 also show a very good combination

# therefore one of these can be removed from each pair

In [None]:
# DATA VISUALIZATION
# ---> Categorical attributes

# All column names of the training frame; the first 29*4 = 116 of them are
# plotted below.
cols = dataset_train.columns

# Count plots are laid out four per figure, 29 figures in total
n_rows = 29
n_cols = 4

for i in range(n_rows):
    # One figure per grid row; `ax` holds that row's n_cols axes
    fg, ax = plt.subplots(nrows=1, ncols=n_cols, figsize=(16, 8))
    for j, axis in enumerate(ax):
        # Column for grid position (i, j), counted row by row
        sns.countplot(x=cols[i * n_cols + j], data=dataset_train, ax=axis)


# LEARNING
# cat1 to cat72 have only two labels A and B.
# In most of the cases, B has very few entries.
# cat73 to cat108 have more than two labels
# cat109 to cat116 have many labels



In [None]:
# Debugging aid: print the third row with the last 15 columns cut off, to check
# by eye which column positions remain.
print(dataset_train.iloc[2:3, :-15].values) # debugging, manually writing to see correct index of columns

In [None]:
# Shape of the training frame at this point, as (rows, columns)
print(dataset_train.shape)

In [None]:
# DATA PREPARATION

# Turning cat1 to cat116 into numerical data.
# One-hot encoding converts an attribute to a binary vector.

# labels[i] holds every distinct value of categorical column i seen in either
# the training or the test set. Taking the union ensures a value that occurs
# only in the test set still gets a slot in the encoding.
labels = []
for i in range(0, split):
    train = dataset_train[cols[i]].unique()
    test = dataset_test[cols[i]].unique()
    labels.append(list(set(train) | set(test))) # set union of both value sets

print("labels %s" % labels)

#del dataset_test


# OneHotEncoder is no longer used in this cell but stays imported for any
# later cell that references it.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# One hot encode all categorical attributes
cats = []
for i in range(0, split):
    # Label encode: map each value to its integer position among the
    # column's known labels
    label_encoder = LabelEncoder()
    label_encoder.fit(labels[i])
    feature = label_encoder.transform(dataset_train.iloc[:,i])
    # One hot encode: row k of the identity matrix is the indicator vector for
    # code k, so indexing it with the integer codes yields a dense
    # (n_rows, n_labels) float array. This replaces
    # OneHotEncoder(sparse=False, n_values=...), whose arguments are not
    # accepted by newer scikit-learn releases, with the same output.
    feature = np.eye(len(labels[i]))[feature]
    cats.append(feature)



print("################")
print("List of 1D array of cats--> %s" % cats)
print("################")
# Put the per-column encodings side by side in one 2D array
encoded_cats = np.column_stack(cats)
print("2D array of cats--> %s" % encoded_cats)
print("################")
print(encoded_cats.shape)
print("################")

# Concatenate encoded attributes with continuous attributes
dataset_train_encoded = np.concatenate((encoded_cats,dataset_train.iloc[:,split:].values),axis=1)
# Release the large intermediates; only the encoded array is needed from here on
del cats
del dataset_train
del encoded_cats


# Print the shape of the encoded data
print(dataset_train_encoded.shape)
print("################")




In [None]:
# DATA PREPARATION
# ---> Splitting the data into train and test/valid

In [None]:
# Number of rows and columns of the encoded array
r, c = dataset_train_encoded.shape

# Positions of every feature column (all columns except the last)
i_cols = list(range(c - 1))

# The last column is the target Y; everything before it is the feature matrix X
X = dataset_train_encoded[:, :c - 1]
Y = dataset_train_encoded[:, c - 1]

del dataset_train_encoded

# Fraction of the rows held out for validation
val_size = 0.1

# One seed shared by all experiments so the same rows are held out every time
seed = 0

# Split into the training part and the held-out part
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, random_state=seed, test_size=val_size)

del X, Y

# Scoring function used by every model section
from sklearn.metrics import mean_absolute_error

# Names of the algorithm/parameter combinations that were evaluated
comb = []

# Mean Absolute Error of each combination, in the same order as `comb`
# (a list, filled by the model sections)
mae = []

# Feature subsets to evaluate, as [name, column positions] entries.
# Only the full feature set is registered.
n = "All"
X_all = [[n, i_cols]]


In [None]:
# Inspect the registered feature subsets: each entry is [name, column positions]
print(X_all)     # all the columns along with dummy vars
print(type(X_all))

In [None]:
# LINEAR REGRESSION (Linear Algo)

# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
from sklearn.linear_model import LinearRegression

algo = "LR"
lin_reg = LinearRegression(n_jobs=-1) # n_jobs=-1: use all processors

# Fit on every registered feature subset and score on the held-out rows.
# The target was log1p-transformed, so both the truth and the predictions are
# mapped back with expm1 before the MAE is computed.
for name, i_cols_list in X_all:
    print(name)
    lin_reg.fit(X_train[:, i_cols_list], y_train)
    predicted = lin_reg.predict(X_test[:, i_cols_list])
    result = mean_absolute_error(np.expm1(y_test), np.expm1(predicted))
    mae.append(result)
    print(name + " %s" % result)

comb.append(algo)
print(comb)

'''
MODEL OUTPUT: 

All 1276.0276564234468
['LR']
'''    

In [None]:
# KNN (Non-linear Algo)

# Evaluation of various n_neighbors settings for a k-nearest-neighbours regressor
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html
from sklearn.neighbors import KNeighborsRegressor

# Put the n_neighbors values to try in this array. While it is empty the
# training loop below does not run at all.
n_list = np.array([])
'''
With n_list = np.array([5])

All 1434.788795356784
['LR', 'KNN 5']
'''

'''
With n_list = np.array([2])

All 1526.7442802258656
['LR', 'KNN 2']
'''

# Several values can be listed to search for the best n_neighbors, although
# xgboost is usually the better target for parameter tuning.
for n_neighbors in n_list:
    algo = "KNN"
    regressor = KNeighborsRegressor(n_neighbors=n_neighbors, n_jobs=-1)

    # Fit on each registered feature subset and score on the held-out rows,
    # undoing the log1p transform of the target before computing the MAE
    for name, i_cols_list in X_all:
        regressor.fit(X_train[:, i_cols_list], y_train)
        predicted = regressor.predict(X_test[:, i_cols_list])
        result = mean_absolute_error(np.expm1(y_test), np.expm1(predicted))
        mae.append(result)
        print(name + " %s" % result)
    comb.append(algo + " %s" % n_neighbors)

print(comb)

# Nothing was trained: record a previously observed score so the final
# comparison plot still has a KNN entry.
# NOTE(review): the value recorded here is the n=2 run, although the n=5 run
# quoted above scored lower.
if n_list.size == 0:
    mae.append(1527)
    comb.append("KNN" + " %s" % 2 )

# Very high computation time
# NOTE(review): an earlier note here read "Best estimated performance is 1745
# for n=1", which does not match the runs quoted above — confirm.

# LEARNING:
# KNN 5 performed the best of the runs quoted above. Lowest MAE.

In [None]:
# CART (Non-linear Algo)

# Evaluation of various max_depth settings for a decision-tree regressor

#Import the library
from sklearn.tree import DecisionTreeRegressor

# Add the max_depth values to try to this array. While it is empty the
# training loop is skipped and a previously recorded score is used instead.
d_list = np.array([])

for max_depth in d_list:
    #Set the base model
    model = DecisionTreeRegressor(max_depth=max_depth,random_state=seed)

    algo = "CART"

    # Fit on each registered feature subset and score on the held-out rows.
    # The split variables are X_train / y_train / X_test / y_test; the
    # Y_train, X_val and Y_val names used here before were never defined and
    # raised NameError as soon as d_list was non-empty.
    for name,i_cols_list in X_all:
        model.fit(X_train[:,i_cols_list],y_train)
        # Undo the log1p transform of the target before computing the MAE
        result = mean_absolute_error(np.expm1(y_test), np.expm1(model.predict(X_test[:,i_cols_list])))
        mae.append(result)
        print(name + " %s" % result)

    comb.append(algo + " %s" % max_depth )


# since we know the outcome, we can skip the algorithm and append the result
if (len(d_list)==0):
    mae.append(1741)
    comb.append("CART" + " %s" % 5 )

#High computation time
#Best estimated performance is 1741 for depth=5



In [None]:
# SVM (Non-linear Algo)

#Import the library
from sklearn.svm import SVR

# Add the C values to try to this array. While it is empty nothing is trained
# and no score is recorded for this algorithm.
c_list = np.array([])

for C in c_list:
    #Set the base model
    model = SVR(C=C)

    algo = "SVM"

    # Fit on each registered feature subset and score on the held-out rows.
    # The split variables are X_train / y_train / X_test / y_test; the
    # Y_train, X_val and Y_val names used here before were never defined and
    # raised NameError as soon as c_list was non-empty.
    for name,i_cols_list in X_all:
        model.fit(X_train[:,i_cols_list],y_train)
        # Undo the log1p transform of the target before computing the MAE
        result = mean_absolute_error(np.expm1(y_test), np.expm1(model.predict(X_test[:,i_cols_list])))
        mae.append(result)
        print(name + " %s" % result)

    comb.append(algo + " %s" % C )

#very very high computation time, not running

In [None]:
# Bagged Decision Trees (Bagging)

# Evaluation of various n_estimators settings for a bagging regressor

#Import the library
from sklearn.ensemble import BaggingRegressor
#from sklearn.tree import DecisionTreeRegressor

# Add the n_estimators values to try to this array. While it is empty nothing
# is trained and no score is recorded for this algorithm.
n_list = np.array([])

for n_estimators in n_list:
    #Setting the base model
    model = BaggingRegressor(n_jobs=-1,n_estimators=n_estimators)

    algo = "Bag"

    # Fit on each registered feature subset and score on the held-out rows.
    # The split variables are X_train / y_train / X_test / y_test; the
    # Y_train, X_val and Y_val names used here before were never defined and
    # raised NameError as soon as n_list was non-empty.
    for name,i_cols_list in X_all:
        model.fit(X_train[:,i_cols_list],y_train)
        # Undo the log1p transform of the target before computing the MAE
        result = mean_absolute_error(np.expm1(y_test), np.expm1(model.predict(X_test[:,i_cols_list])))
        mae.append(result)
        print(name + " %s" % result)

    comb.append(algo + " %s" % n_estimators )

#very high computation time, not running

In [None]:
# Random Forest (Bagging)

# Evaluation of various n_estimators settings for a random forest regressor

#Import the library
from sklearn.ensemble import RandomForestRegressor

# Add the n_estimators values to try to this array. While it is empty the
# training loop is skipped and a previously recorded score is used instead.
n_list = np.array([])

for n_estimators in n_list:
    #Set the base model
    model = RandomForestRegressor(n_jobs=-1,n_estimators=n_estimators,random_state=seed)

    algo = "RF"

    # Fit on each registered feature subset and score on the held-out rows.
    # The split variables are X_train / y_train / X_test / y_test; the
    # Y_train, X_val and Y_val names used here before were never defined and
    # raised NameError as soon as n_list was non-empty.
    for name,i_cols_list in X_all:
        model.fit(X_train[:,i_cols_list],y_train)
        # Undo the log1p transform of the target before computing the MAE
        result = mean_absolute_error(np.expm1(y_test), np.expm1(model.predict(X_test[:,i_cols_list])))
        mae.append(result)
        print(name + " %s" % result)

    comb.append(algo + " %s" % n_estimators )


# since we know the outcome, we can skip the algorithm and append the result
if (len(n_list)==0):
    mae.append(1213)
    comb.append("RF" + " %s" % 50 )

#Best estimated performance is 1213 when the number of estimators is 50

In [None]:
# Extra Trees (Bagging)

# Evaluation of various n_estimators settings for an extra-trees regressor

#Import the library
from sklearn.ensemble import ExtraTreesRegressor

# Add the n_estimators values to try to this array. While it is empty the
# training loop is skipped and a previously recorded score is used instead.
n_list = np.array([])

for n_estimators in n_list:
    #Set the base model
    model = ExtraTreesRegressor(n_jobs=-1,n_estimators=n_estimators,random_state=seed)

    algo = "ET"

    # Fit on each registered feature subset and score on the held-out rows.
    # The split variables are X_train / y_train / X_test / y_test; the
    # Y_train, X_val and Y_val names used here before were never defined and
    # raised NameError as soon as n_list was non-empty.
    for name,i_cols_list in X_all:
        model.fit(X_train[:,i_cols_list],y_train)
        # Undo the log1p transform of the target before computing the MAE
        result = mean_absolute_error(np.expm1(y_test), np.expm1(model.predict(X_test[:,i_cols_list])))
        mae.append(result)
        print(name + " %s" % result)

    comb.append(algo + " %s" % n_estimators )


# since we know the outcome, we can skip the algorithm and append the result
if (len(n_list)==0):
    mae.append(1254)
    comb.append("ET" + " %s" % 100 )

#Best estimated performance is 1254 for 100 estimators

In [None]:




# AdaBoost (Boosting)

# Evaluation of various n_estimators settings for an AdaBoost regressor

#Import the library
from sklearn.ensemble import AdaBoostRegressor

# Add the n_estimators values to try to this array. While it is empty the
# training loop is skipped and a previously recorded score is used instead.
n_list = np.array([])

for n_estimators in n_list:
    #Set the base model
    model = AdaBoostRegressor(n_estimators=n_estimators,random_state=seed)

    algo = "Ada"

    # Fit on each registered feature subset and score on the held-out rows.
    # The split variables are X_train / y_train / X_test / y_test; the
    # Y_train, X_val and Y_val names used here before were never defined and
    # raised NameError as soon as n_list was non-empty.
    for name,i_cols_list in X_all:
        model.fit(X_train[:,i_cols_list],y_train)
        # Undo the log1p transform of the target before computing the MAE
        result = mean_absolute_error(np.expm1(y_test), np.expm1(model.predict(X_test[:,i_cols_list])))
        mae.append(result)
        print(name + " %s" % result)

    comb.append(algo + " %s" % n_estimators )


# since we know the outcome, we can skip the algorithm and append the result
if (len(n_list)==0):
    mae.append(1678)
    comb.append("Ada" + " %s" % 100 )

#Best estimated performance is 1678 with n=100

In [None]:
# Stochastic Gradient Boosting (Boosting)

# Evaluation of various n_estimators settings for a gradient boosting regressor

#Import the library
from sklearn.ensemble import GradientBoostingRegressor

# Add the n_estimators values to try to this array. While it is empty the
# training loop is skipped and a previously recorded score is used instead.
n_list = np.array([])

for n_estimators in n_list:
    #Set the base model
    model = GradientBoostingRegressor(n_estimators=n_estimators,random_state=seed)

    algo = "SGB"

    # Fit on each registered feature subset and score on the held-out rows.
    # The split variables are X_train / y_train / X_test / y_test; the
    # Y_train, X_val and Y_val names used here before were never defined and
    # raised NameError as soon as n_list was non-empty.
    for name,i_cols_list in X_all:
        model.fit(X_train[:,i_cols_list],y_train)
        # Undo the log1p transform of the target before computing the MAE
        result = mean_absolute_error(np.expm1(y_test), np.expm1(model.predict(X_test[:,i_cols_list])))
        mae.append(result)
        print(name + " %s" % result)

    comb.append(algo + " %s" % n_estimators )

# since we know the outcome, we can skip the algorithm and append the result
if (len(n_list)==0):
    mae.append(1278)
    comb.append("SGB" + " %s" % 50 )

#Best estimated performance is ?

In [None]:
 #XGBoost
    
    
    
    
    
# Evaluation of various n_estimators settings for an XGBoost regressor

#Import the library
from xgboost import XGBRegressor

# Add the n_estimators values to try to this array. While it is empty the
# training loop is skipped and a previously recorded score is used instead.
n_list = np.array([])

for n_estimators in n_list:
    #Set the base model
    model = XGBRegressor(n_estimators=n_estimators,seed=seed)

    algo = "XGB"

    # Fit on each registered feature subset and score on the held-out rows.
    # The split variables are X_train / y_train / X_test / y_test; the
    # Y_train, X_val and Y_val names used here before were never defined and
    # raised NameError as soon as n_list was non-empty.
    for name,i_cols_list in X_all:
        model.fit(X_train[:,i_cols_list],y_train)
        # Undo the log1p transform of the target before computing the MAE
        result = mean_absolute_error(np.expm1(y_test), np.expm1(model.predict(X_test[:,i_cols_list])))
        mae.append(result)
        print(name + " %s" % result)

    comb.append(algo + " %s" % n_estimators )

# since we know the outcome, we can skip the algorithm and append the result
if (len(n_list)==0):
    mae.append(1169)
    comb.append("XGB" + " %s" % 1000 )

#Best estimated performance is 1169 with n=1000

In [None]:
#MLP (Deep Learning)


#Evaluation of various combinations of multi-layer perceptrons

#Import libraries for deep learning
from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense

# define baseline model
def baseline(v):
    """Build and compile the baseline network.

    The network has one ReLU hidden layer whose width equals its input
    dimension, v*(c-1), followed by a single output unit declared without an
    activation argument. `c` is read from the notebook's global scope.

    Returns the model compiled with mean absolute error loss and the adam
    optimizer.
    """
    width = v*(c-1)
    model = Sequential([
        Dense(width, input_dim=width, init='normal', activation='relu'),
        Dense(1, init='normal'),
    ])
    model.compile(loss='mean_absolute_error', optimizer='adam')
    return model

# define smaller model
def smaller(v):
    """Build and compile a network whose hidden layer is half the input width.

    The input dimension is v*(c-1), with `c` read from the notebook's global
    scope. The hidden layer has half that many units and is followed by a
    single ReLU output unit.

    Returns the model compiled with mean absolute error loss and the adam
    optimizer.
    """
    n_inputs = v*(c-1)
    # create model
    model = Sequential()
    # Floor division keeps the unit count an integer; `/` yields a float on
    # Python 3, which is not a valid layer width.
    model.add(Dense(n_inputs // 2, input_dim=n_inputs, init='normal', activation='relu'))
    model.add(Dense(1, init='normal', activation='relu'))
    # Compile model
    model.compile(loss='mean_absolute_error', optimizer='adam')
    return model

# define deeper model
def deeper(v):
    """Build and compile a network with two hidden layers.

    The input dimension is v*(c-1), with `c` read from the notebook's global
    scope. The first hidden layer is as wide as the input, the second is half
    that width, and a single ReLU output unit follows.

    Returns the model compiled with mean absolute error loss and the adam
    optimizer.
    """
    n_inputs = v*(c-1)
    # create model
    model = Sequential()
    model.add(Dense(n_inputs, input_dim=n_inputs, init='normal', activation='relu'))
    # Floor division keeps the unit count an integer; `/` yields a float on
    # Python 3, which is not a valid layer width.
    model.add(Dense(n_inputs // 2, init='normal', activation='relu'))
    model.add(Dense(1, init='normal', activation='relu'))
    # Compile model
    model.compile(loss='mean_absolute_error', optimizer='adam')
    return model

# Optimize using dropout and decay
from keras.optimizers import SGD
from keras.layers import Dropout
from keras.constraints import maxnorm

def dropout(v):
    """Build and compile a two-hidden-layer network regularised with dropout.

    The input dimension is v*(c-1), with `c` read from the notebook's global
    scope. Both hidden layers carry a max-norm weight constraint of 3 and are
    each followed by a Dropout(0.2) layer; a single ReLU output unit follows.

    Returns the model compiled with mean absolute error loss and an SGD
    optimizer (learning rate 0.1, momentum 0.9, no decay).
    """
    n_inputs = v*(c-1)
    #create model
    model = Sequential()
    model.add(Dense(n_inputs, input_dim=n_inputs, init='normal', activation='relu',W_constraint=maxnorm(3)))
    model.add(Dropout(0.2))
    # Floor division keeps the unit count an integer; `/` yields a float on
    # Python 3, which is not a valid layer width.
    model.add(Dense(n_inputs // 2, init='normal', activation='relu', W_constraint=maxnorm(3)))
    model.add(Dropout(0.2))
    model.add(Dense(1, init='normal', activation='relu'))
    # Compile model
    sgd = SGD(lr=0.1,momentum=0.9,decay=0.0,nesterov=False)
    model.compile(loss='mean_absolute_error', optimizer=sgd)
    return model

# define decay model
def decay(v):
    """Build and compile a one-hidden-layer network trained with decaying SGD.

    The hidden layer is as wide as the input, v*(c-1), with `c` read from the
    notebook's global scope; a single ReLU output unit follows.

    Returns the model compiled with mean absolute error loss and an SGD
    optimizer (learning rate 0.1, momentum 0.8, decay 0.01).
    """
    width = v*(c-1)
    model = Sequential([
        Dense(width, input_dim=width, init='normal', activation='relu'),
        Dense(1, init='normal', activation='relu'),
    ])
    optimizer = SGD(lr=0.1,momentum=0.8,decay=0.01,nesterov=False)
    model.compile(loss='mean_absolute_error', optimizer=optimizer)
    return model


# (name, builder function) pairs to evaluate. While the list is empty nothing
# is trained and a previously recorded score is used instead.
est_list = []
#uncomment the below if you want to run the algo
#est_list = [('MLP',baseline),('smaller',smaller),('deeper',deeper),('dropout',dropout),('decay',decay)]

for name, est in est_list:

    algo = name

    # Fit on each registered feature subset and score on the held-out rows.
    # The split variables are X_train / y_train / X_test / y_test; the
    # Y_train, X_val and Y_val names used here before were never defined and
    # raised NameError as soon as est_list was non-empty.
    for m,i_cols_list in X_all:
        # NOTE(review): nb_epoch is the Keras 1 spelling of the epoch count;
        # confirm it is accepted by the installed Keras version.
        model = KerasRegressor(build_fn=est, v=1, nb_epoch=10, verbose=0)
        model.fit(X_train[:,i_cols_list],y_train)
        # Undo the log1p transform of the target before computing the MAE
        result = mean_absolute_error(np.expm1(y_test), np.expm1(model.predict(X_test[:,i_cols_list])))
        mae.append(result)
        print(name + " %s" % result)

    comb.append(algo )


# since we know the outcome, we can skip the algorithm and append the result
if (len(est_list)==0):
    mae.append(1168)
    comb.append("MLP" + " baseline" )



print("mae--> %s" % mae)
print("comb--> %s" % comb)
# Set figure size
plt.rc("figure", figsize=(25, 10))

# Plot the MAE of every recorded combination, in the order they were appended
fig, ax = plt.subplots()
plt.plot(mae)
# Label each tick with the name of its combination
ax.set_xticks(range(len(comb)))
ax.set_xticklabels(comb,rotation='vertical')
plt.show()

#Best estimated performance is MLP=1168

In [None]:
'''
Since XGBRegressor is showing the best performance, we can select it as our best model. Therefore, we now need to finalize the model with all of the avialable data.
'''


# X_train and X_test both come from the training CSV; stacking them row-wise
# (axis=0) restores the full labelled data set for the final fit.
# NOTE: the split arrays are deleted afterwards, so this cell cannot be run a
# second time without re-running the split first.
X = np.concatenate((X_train,X_test), axis=0)
del X_train
del X_test
Y = np.concatenate((y_train,y_test),axis=0)
del y_train
del y_test

print("I am here 0 - debug")


n_estimators = 1000

#Best model definition
best_model = XGBRegressor(n_estimators=n_estimators,seed=seed)
print("I am here 0.0 - debug")
best_model.fit(X,Y)
print("I am here 0.1 - debug")
del X
del Y
#Read test dataset
dataset_test = pd.read_csv("test.csv")
print("I am here 0.2 - debug")
# Keep the ids for the submission file, then drop them from the features
ID = dataset_test['id']
dataset_test.drop('id',axis=1,inplace=True)

# One hot encode all categorical attributes with the same label lists that
# were used for the training data, so the columns line up.
cats = []
print("I am here 1 - debug")
for i in range(0, split):
    # label encoding: map each value to its integer position among the
    # column's known labels
    label_encoder = LabelEncoder()
    label_encoder.fit(labels[i])
    feature = label_encoder.transform(dataset_test.iloc[:,i])
    # One hot encoding: row k of the identity matrix is the indicator vector
    # for code k, so indexing it with the integer codes yields a dense
    # (n_rows, n_labels) float array. This replaces
    # OneHotEncoder(sparse=False, n_values=...), whose arguments are not
    # accepted by newer scikit-learn releases, with the same output.
    feature = np.eye(len(labels[i]))[feature]
    cats.append(feature)

print("I am here 2 - debug")
# Put the per-column encodings side by side in one 2D array
encoded_cats = np.column_stack(cats)
del cats

# Concatenating encoded attributes with continuous attributes
X_test = np.concatenate((encoded_cats, dataset_test.iloc[:,split:].values), axis=1)
print("I am here 3 - debug")
del encoded_cats
del dataset_test

# Predict with the best model and undo the log1p transform of the target
predictions = np.expm1(best_model.predict(X_test))

with open("submission.csv", "w") as subfile:
    print("I am here 4 - debug")
    # Column headers, written without a space after the comma so the second
    # column is named "loss" rather than " loss"
    subfile.write("id,loss\n")
    # Pair every id with its prediction. The loop previously referenced the
    # misspelled name `predicitions`, which raised NameError.
    for row_id, pred in zip(ID, predictions):
        subfile.write("%s,%s\n" % (row_id, pred))


print("I am here 5 - debug")
