## Deep Neural Network

In [1]:
# Import Files
import pandas as pd
import numpy as np
from numpy import loadtxt
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, recall_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

import tensorflow as tf
import keras
import kerastuner as kt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, BatchNormalization


: 

: 

In [2]:
import pandas as pd

In [12]:
# Load the pre-processed train / validation / test splits from the hackathon data folder.
df_train, df_validation, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "MIT Hackathon/train_final.csv",
        "MIT Hackathon/validation_final.csv",
        "MIT Hackathon/test_final.csv",
    )
)

In [13]:
# Separate the target column from the feature columns for the training and
# validation frames (the test frame is not touched here).
target_column = "Overall_Experience"
x_train, y_train = df_train.drop(columns=target_column), df_train[target_column]
x_validation, y_validation = (
    df_validation.drop(columns=target_column),
    df_validation[target_column],
)
# Last expression of the cell: display the four shapes as a quick sanity check.
x_train.shape, y_train.shape, x_validation.shape, y_validation.shape

((14365, 23), (14365,), (14345, 23), (14345,))

In [7]:
# The DNN fit call can carve out its own hold-out set via validation_split, so the
# train and validation frames are stacked row-wise into one training set.
# Row labels are kept as-is (no index reset), exactly as pd.concat returns them.
train_final = pd.concat((df_train, df_validation), axis=0)
x_train_final = train_final.drop(columns="Overall_Experience")
y_train_final = train_final['Overall_Experience']

In [None]:
# Tuned binary-classification DNN: three Dense -> BatchNormalization -> Dropout
# stages followed by a single sigmoid unit.
# Results recorded from an earlier tuning run:
#   Best Epochs: 25
#   Batch Size = 32
#   Best accuracy = 0.9541880

# Size the input layer from the training matrix rather than hard-coding a width:
# the model must accept exactly as many features as x_train_final provides,
# otherwise fit() fails with a shape mismatch.
n_features = x_train_final.shape[1]

h_model2 = keras.Sequential()

# Only the first layer of a Sequential model defines the input shape, so it is
# declared once here and not repeated on later layers.
h_model2.add(keras.Input(shape=(n_features,)))
h_model2.add(keras.layers.Dense(units=384, activation='relu', name='dense_unique'))
h_model2.add(keras.layers.BatchNormalization())
h_model2.add(keras.layers.Dropout(0.4))
h_model2.add(keras.layers.Dense(units=288, activation='relu', name='dense_unique2'))
h_model2.add(keras.layers.BatchNormalization())
h_model2.add(keras.layers.Dropout(0.4))
h_model2.add(keras.layers.Dense(384, activation='relu'))
h_model2.add(keras.layers.BatchNormalization())
h_model2.add(keras.layers.Dropout(0.3))
# Single sigmoid output paired with binary cross-entropy below.
h_model2.add(keras.layers.Dense(1, activation='sigmoid'))

hp_learning_rate = 1e-4
h_model2.compile(
    loss='binary_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
    metrics=['accuracy'],
)

In [None]:
# Hyperparameter-tune the batch size and number of epochs with a manual grid search.
# scikit-learn's GridSearchCV cannot drive h_model2 directly: it needs the
# scikit-learn estimator API (get_params/set_params), which a plain Keras model
# does not implement. Each candidate is therefore trained with Keras' own fit(),
# scored on a held-out validation_split, and the best run is kept.
callbacks_tuned = [
    keras.callbacks.ModelCheckpoint(
        filepath="parameter.keras",
        save_best_only=True,
        monitor="val_loss")
]

params = {'batch_size': [100, 20, 50, 25, 32],
          'epochs': [25, 50, 100, 200, 300, 400]
          }

# Fraction of the training rows held out by fit() to compute val_loss / val_accuracy.
validation_fraction = 0.2

h_model2_gs = None                      # best fitted model found so far
history_tuned = None                    # Keras History of that best run (read by the plotting cell)
best_params = None                      # {'batch_size': ..., 'epochs': ...} of that run
best_val_accuracy = float("-inf")

for candidate_batch_size in params['batch_size']:
    for candidate_epochs in params['epochs']:
        # clone_model copies the architecture with freshly initialised weights, so
        # candidates never continue training from a previous candidate's state.
        candidate_model = keras.models.clone_model(h_model2)
        candidate_model.compile(
            loss='binary_crossentropy',
            optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
            metrics=['accuracy'],
        )
        candidate_history = candidate_model.fit(
            x_train_final,
            y_train_final,
            validation_split=validation_fraction,
            batch_size=candidate_batch_size,
            epochs=candidate_epochs,
            callbacks=callbacks_tuned,
            verbose=0,
        )
        # Score with the final epoch, because those are the weights the model ends up with.
        candidate_val_accuracy = candidate_history.history['val_accuracy'][-1]
        print(f"batch_size={candidate_batch_size}, epochs={candidate_epochs}: "
              f"val_accuracy={candidate_val_accuracy:.4f}")
        if candidate_val_accuracy > best_val_accuracy:
            best_val_accuracy = candidate_val_accuracy
            best_params = {'batch_size': candidate_batch_size, 'epochs': candidate_epochs}
            h_model2_gs = candidate_model
            history_tuned = candidate_history

print("Best parameters:", best_params, "val_accuracy:", best_val_accuracy)

# Keras returns an (n_samples, 1) array of sigmoid outputs; flatten it to 1-D so it
# can be used directly as a single prediction column.
prediction_dnn = h_model2_gs.predict(df_test).ravel()

In [None]:
# Plot training vs. validation accuracy, then training vs. validation loss, per epoch.
# Expects history_tuned to expose a .history dict with the four keys read below.
curves = history_tuned.history
accuracy, val_accuracy = curves["accuracy"], curves["val_accuracy"]
loss, val_loss = curves["loss"], curves["val_loss"]
epochs = range(1, len(accuracy) + 1)  # 1-based epoch numbers for the x-axis


def _plot_train_vs_validation(train_values, validation_values, train_label, validation_label, title):
    """Draw one training (dots) / validation (line) curve pair on the current figure and show it."""
    plt.plot(epochs, train_values, "bo", label=train_label)
    plt.plot(epochs, validation_values, "b", label=validation_label)
    plt.title(title)
    plt.legend()
    plt.show()


_plot_train_vs_validation(
    accuracy, val_accuracy,
    "Training accuracy", "Validation accuracy",
    "Training and validation accuracy",
)
# Start a fresh figure so the loss curves are not drawn over the accuracy plot.
plt.figure()
_plot_train_vs_validation(
    loss, val_loss,
    "Training loss", "Validation loss",
    "Training and validation loss",
)

In [None]:
# Final Implementation: pair every test ID with its DNN prediction, round the
# prediction to an integer label and write the submission file.
ID_data = pd.read_csv("MIT Hackathon/Traveldata_test_(2).csv")
submission_df = pd.DataFrame({'ID': ID_data['ID']})
submission_df['Overall_Experience'] = prediction_dnn
# Round to the nearest integer, then cast to int.
rounded_labels = submission_df['Overall_Experience'].round(0).astype(int)
submission_df['Overall_Experience'] = rounded_labels
submission_df.to_csv("submission_data_DNN.csv", index=False)

## Support Vector Machine

In [8]:
from sklearn.svm import SVC

In [None]:
# Build a linear-kernel support vector classifier and train it on the combined
# train + validation data (fit() returns the fitted estimator itself).
clf = SVC(kernel='linear').fit(x_train_final, y_train_final)

In [None]:
# Model prediction: label the test frame with the fitted linear SVC
predictions = clf.predict(df_test)

In [None]:
# Hyperparameter tuning of the SVC with an exhaustive, cross-validated grid search.
from sklearn.model_selection import GridSearchCV

c_values = [0.1, 1, 10, 100, 1000]
gamma_values = [1, 0.1, 0.01, 0.001, 0.0001]

# The linear kernel ignores gamma, so crossing it with every gamma value would only
# refit the same model five times per C. The search space is split into two grids:
# the linear kernel is tuned over C alone, the other kernels over C x gamma.
param_grid = [
    {'kernel': ["linear"], 'C': c_values},
    {'kernel': ["poly", "sigmoid", "rbf"], 'C': c_values, 'gamma': gamma_values},
]

# refit=True retrains the best candidate on the full data so grid.predict() works;
# verbose=3 prints progress for every fit.
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)

grid.fit(x_train_final, y_train_final)

In [None]:
# Show the parameter combination that scored best in the SVC grid search
print(grid.best_params_)

In [None]:
# Show the estimator that the grid search refit with the best parameters
print(grid.best_estimator_)

In [None]:
# Predict the test frame with the tuned SVC (GridSearchCV delegates to its refit best estimator)
grid_predictions = grid.predict(df_test)

In [None]:
# Final Implementation: pair every test ID with the tuned SVM prediction and write
# the submission file.
# NOTE(review): the DNN submission cell reads this same file from "MIT Hackathon/";
# here it is read from the working directory — confirm which location is correct.
ID_data = pd.read_csv("Traveldata_test_(2).csv")
submission_df = pd.DataFrame({'ID': ID_data['ID']})
submission_df['Overall_Experience'] = grid_predictions
# Round to the nearest integer, then cast to int.
rounded_labels = submission_df['Overall_Experience'].round(0).astype(int)
submission_df['Overall_Experience'] = rounded_labels
submission_df.to_csv("submission_data_SVM.csv", index=False)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer,mean_squared_error, r2_score, mean_absolute_error

In [None]:
# Search space for the randomized hyperparameter search over the random forest.

# Number of trees: 10 evenly spaced values from 200 to 2000.
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Features considered at each split. 1.0 means "all features", which is what the
# former 'auto' option did for a regressor; 'auto' itself was removed in
# scikit-learn 1.3 and is rejected there as an invalid parameter.
max_features = [1.0, 'sqrt']

# Maximum tree depth: 10, 20, ..., 110, plus None for unlimited depth.
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split an internal node.
min_samples_split = [2, 5, 10]

# Minimum number of samples required at a leaf node.
min_samples_leaf = [1, 2, 4]

# Whether each tree is trained on a bootstrap sample or on the whole dataset.
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
# Randomized search over random_grid for the random forest regressor.
# RandomizedSearchCV comes from sklearn.model_selection (imported in the notebook's
# main import cell); without that import this cell fails with a NameError.
rf = RandomForestRegressor()

# 100 sampled parameter settings, 3-fold cross-validation each, run on all cores.
# random_state fixes which settings are sampled so the search is repeatable.
# The search fits clones of `rf`; `rf` itself stays unfitted.
rf_random = RandomizedSearchCV(
    estimator = rf,
    param_distributions = random_grid,
    n_iter = 100,
    cv = 3,
    verbose = 2,
    random_state = 42,
    n_jobs = -1,
)
# Fit the random search model
rf_random.fit(x_train_final, y_train_final)

In [None]:
# Display the best parameter setting found by the randomized search
rf_random.best_params_

In [None]:
# Predict the test frame with the tuned random forest. predict() needs the feature
# matrix as its argument; df_test is the same test frame the other models predict on.
prediction_tuned = rf_random.predict(df_test)

In [None]:
# Check feature importance
# The randomized search fits clones of `rf`, so `rf` itself is never fitted and has
# no feature_importances_. Read them from the search's refit best estimator instead.
best_rf = rf_random.best_estimator_
importances = best_rf.feature_importances_

# Feature names in the column order the search was fitted on.
features = list(x_train_final.columns)

# Ascending order, so the most important feature ends up at the top of the bar chart.
indices = np.argsort(importances)

plt.figure(figsize = (10, 10))

plt.title('Feature Importances')

plt.barh(range(len(indices)), importances[indices], color = 'violet', align = 'center')

plt.yticks(range(len(indices)), [features[i] for i in indices])

plt.xlabel('Relative Importance')

plt.show()

In [None]:
# Final Implementation: pair every test ID with the tuned random forest prediction,
# round the regressor output to an integer label and write the submission file.
# NOTE(review): the DNN submission cell reads this same file from "MIT Hackathon/";
# here it is read from the working directory — confirm which location is correct.
ID_data = pd.read_csv("Traveldata_test_(2).csv")
submission_df = pd.DataFrame({'ID': ID_data['ID']})
submission_df['Overall_Experience'] = prediction_tuned
# Round to the nearest integer, then cast to int.
rounded_labels = submission_df['Overall_Experience'].round(0).astype(int)
submission_df['Overall_Experience'] = rounded_labels
submission_df.to_csv("submission_data_RF.csv", index=False)

## Ensemble Learning

In [None]:
# Reload the three per-model submission files written by the earlier sections.
submission_RF, submission_SVM, submission_DNN = (
    pd.read_csv(submission_path)
    for submission_path in (
        "submission_data_RF.csv",
        "submission_data_SVM.csv",
        "submission_data_DNN.csv",
    )
)

In [None]:
# Give each model's prediction column its own name so the three frames can sit
# side by side without clashing.
submission_RF = submission_RF.rename(columns={'Overall_Experience': 'RF_Model'})
submission_SVM = submission_SVM.rename(columns={'Overall_Experience': 'SVM_Model'})
submission_DNN = submission_DNN.rename(columns={'Overall_Experience': 'DNN_Model'})

In [None]:
# Drop the ID column from each frame, leaving only the model's prediction column.
submission_RF = submission_RF.drop(columns="ID")
submission_SVM = submission_SVM.drop(columns="ID")
submission_DNN = submission_DNN.drop(columns="ID")

In [None]:
# Place the three models' prediction columns side by side, aligned on the row index.
model_frames = [submission_RF, submission_SVM, submission_DNN]
comparison_table = pd.concat(model_frames, axis="columns")

In [None]:
# Count, per row, how many of the three models predicted 1.
# The model columns are selected by name: a positional "last five columns" slice
# asks for more columns than the table has and, when this cell is re-run, would
# also add the existing Final_Pred column into the sum.
model_columns = ["RF_Model", "SVM_Model", "DNN_Model"]
comparison_table['Final_Pred'] = comparison_table[model_columns].sum(axis=1)

In [None]:
def _majority_vote(vote_count):
    """Return 1 when at least two votes are positive, else 0."""
    if vote_count >= 2:
        return 1
    return 0


# Final ensemble label: 1 when the summed votes in Final_Pred reach 2, otherwise 0.
comparison_table['Overall_Experience'] = comparison_table['Final_Pred'].map(_majority_vote)

In [None]:
# Remove the three per-model prediction columns now that the ensemble vote is computed.
per_model_columns = ["RF_Model", "SVM_Model", "DNN_Model"]
comparison_table = comparison_table.drop(columns=per_model_columns)