In [1]:
import numpy as np
import pandas as pd
import random

In [2]:
df = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
df_test_id = df_test["PassengerId"]

# Convert Unique Non-Numerical Entries (to Numerical)

In [3]:
"""
Replaces identified values, in the provided dataframe, with given replacement values
dataframe : pandas.Dataframe
vals_to_replace : list
replacements : list
"""
def replace_vals(dataframe, vals_to_replace, replacements):
    dataframe = dataframe.replace(to_replace = vals_to_replace, value = replacements)
    return dataframe

In [4]:
df = df.drop(["Name"], axis=1)
df_test = df_test.drop(["Name"], axis=1)

df = replace_vals(df, [True, False], [1.0, 0.0])
df_test = replace_vals(df_test, [True, False], [1.0, 0.0])

homes = ["Europa", "Earth", "Mars"]
destinations = ["TRAPPIST-1e", "PSO J318.5-22", "55 Cancri e"]

df = replace_vals(df, homes, [0.0, 1.0, 2.0])
df = replace_vals(df, destinations, [0.0, 1.0, 2.0])

df_test = replace_vals(df_test, homes, [0.0, 1.0, 2.0])
df_test = replace_vals(df_test, destinations, [0.0, 1.0, 2.0])

In [5]:
df[['Group','People']] = df['PassengerId'].str.split('_',expand=True)
df = df.drop(['PassengerId'], axis=1)
df[['Deck','Num', 'Side']] = df['Cabin'].str.split('/',expand=True)
df = df.drop(['Cabin'], axis=1)

df_test[['Group','People']] = df_test['PassengerId'].str.split('_',expand=True)
df_test = df_test.drop(['PassengerId'], axis=1)
df_test[['Deck','Num', 'Side']] = df_test['Cabin'].str.split('/',expand=True)
df_test = df_test.drop(['Cabin'], axis=1)

In [6]:
decks = ['B', 'F', 'A', 'G', 'E', 'D', 'C', 'T']
sides = ['P', 'S']

df = replace_vals(df, decks, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])
df_test = replace_vals(df_test, decks, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])

df = replace_vals(df, sides, [1.0, 2.0])
df_test = replace_vals(df_test, sides, [1.0, 2.0])

# Filling in Missing Data (to Avoid Information Loss)

In [7]:
headers = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

"""
Helper functions to pseudo-randomly fill missing values for continuous data
"""
codex = {}
for i in headers:
    vals = []
    vals.append(int(df[i].mean()))
    vals.append(df[i].median())
    vals.append(int(df[i].max()))
    codex[i] = vals
    
def crude_random(target, num):
    if num == 0.0:
        return codex[target][0]
    elif num == 1.0:
        return codex[target][1]
    else:
        return random.choice(range(0,codex[target][2]))
    
codex_test = {}
for i in headers:
    vals = []
    vals.append(int(df_test[i].mean()))
    vals.append(df_test[i].median())
    vals.append(int(df_test[i].max()))
    codex_test[i] = vals
    
def crude_random_test(target, num):
    if num == 0.0:
        return codex_test[target][0]
    elif num == 1.0:
        return codex_test[target][1]
    else:
        return random.choice(range(0,codex_test[target][2]))

In [8]:
df_headers = df[headers]
df = df.drop(headers, axis=1)

for i in headers:
    df_temp = df_headers[i].replace(to_replace=np.nan, value=crude_random(i, random.choice([0.0, 1.0, 2.0])))
    df = pd.concat([df, df_temp], axis=1)
    
df_test_headers = df_test[headers]
df_test = df_test.drop(headers, axis=1)

for i in headers:
    df_temp = df_test_headers[i].replace(to_replace=np.nan, value=crude_random_test(i, random.choice([0.0, 1.0, 2.0])))
    df_test = pd.concat([df_test, df_temp], axis=1)

In [9]:
df['Group'] = df['Group'].astype(float)
df['Num'] = df['Num'].astype(float)
df['People'] = df['People'].astype(float)

df_test['Group'] = df_test['Group'].astype(float)
df_test['Num'] = df_test['Num'].astype(float)
df_test['People'] = df_test['People'].astype(float)

In [10]:
"""
Fills missing non-continuous data points under specified columns in the given dataframe 
dataframe : pandas.Dataframe
column_names : list
replacements : list
"""
def fill_nans(dataframe, column_names, replacements):
    temp = dataframe[column_names]
    dataframe = dataframe.drop(column_names, axis=1)
    temp = temp.replace(to_replace = np.nan, value=random.choice(replacements))
    dataframe = pd.concat([dataframe, temp], axis=1)
    return dataframe

In [11]:
two_values = ["CryoSleep", "VIP"]
three_values = ["HomePlanet", "Destination"]
no_cabins = ["Deck", "Num", "Side"]

df = fill_nans(df, two_values, [0.0, 1.0])
df = fill_nans(df, three_values, [0.0, 1.0, 2.0])
df = fill_nans(df, no_cabins, [0.0])

df_test = fill_nans(df_test, two_values, [0.0, 1.0])
df_test = fill_nans(df_test, three_values, [0.0, 1.0, 2.0])
df_test = fill_nans(df_test, no_cabins, [0.0])

# Check All Information Preserved

In [12]:
df.describe()

Unnamed: 0,Transported,Group,People,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,CryoSleep,VIP,HomePlanet,Destination,Deck,Num,Side
count,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0
mean,0.503624,4633.389624,1.517773,28.810882,220.009318,938.868975,407.242149,304.588865,622.03359,0.349362,0.022892,0.934085,0.505694,3.438169,586.624065,1.470378
std,0.500016,2671.028856,1.054241,14.339536,660.51905,3645.965586,1606.720763,1125.562559,2415.778883,0.476796,0.149568,0.682874,0.814966,1.809941,513.880084,0.543084
min,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2319.0,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,152.0,1.0
50%,1.0,4630.0,1.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,407.0,1.0
75%,1.0,6883.0,2.0,37.0,41.0,118.0,45.0,53.0,71.0,1.0,0.0,1.0,1.0,4.0,983.0,2.0
max,1.0,9280.0,8.0,79.0,14327.0,29813.0,23492.0,22408.0,24133.0,1.0,1.0,2.0,2.0,8.0,1894.0,2.0


In [13]:
df_test.describe()

Unnamed: 0,Group,People,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,CryoSleep,VIP,HomePlanet,Destination,Deck,Num,Side
count,4277.0,4277.0,4277.0,4277.0,4277.0,4277.0,4277.0,4277.0,4277.0,4277.0,4277.0,4277.0,4277.0,4277.0,4277.0
mean,4639.296469,1.498714,29.431377,215.062427,439.472294,177.288754,445.990881,594.447042,0.382745,0.039046,0.961655,0.483984,3.426233,595.912322,1.465981
std,2716.197368,1.018221,14.97587,601.914503,1508.609203,554.357253,1436.519802,2398.014471,0.486114,0.193727,0.685223,0.801973,1.811638,517.198106,0.543759
min,13.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2249.0,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,162.0,1.0
50%,4639.0,1.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,416.0,1.0
75%,7030.0,2.0,38.0,48.0,143.0,51.0,83.0,53.0,1.0,0.0,1.0,1.0,4.0,1012.0,2.0
max,9277.0,8.0,79.0,11567.0,25273.0,8292.0,19844.0,22272.0,1.0,1.0,2.0,2.0,8.0,1890.0,2.0


# Prepare Data to Train Models

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler

train_target = df["Transported"]
train_features = df.drop(["Transported", "Group"], axis=1)
test_features = df_test.drop(["Group"],axis=1)

st_scaler = StandardScaler()
x_train, x_test, y_train, y_test = train_test_split(train_features, train_target, test_size=0.05)
x_train = st_scaler.fit_transform(x_train) 
x_test = st_scaler.transform(x_test)

st_scaler2 = StandardScaler()
fit_tr = st_scaler2.fit_transform(train_features)
to_test = st_scaler2.transform(test_features)

In [15]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

"""
RandomizedSearchCV used to identify the set of parameters, for a given model, that produces the highest f1 score
model : object
distributions : dictionary of lists
"""
def best_params(model, distributions):
    cvFold = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    clf = RandomizedSearchCV(scoring='f1_micro', estimator=model, cv=cvFold, param_distributions=distributions)
    search = clf.fit(x_train, y_train)
    result = search.best_params_
    print(result)
    return result

"""
GridSearchCV used to identify the set of parameters, for a given model, that produces the highest f1 score
model : object
grid : dictionary of lists
"""
def best_params_grid(model, grid):
    cvFold = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    clf = GridSearchCV(scoring='f1_micro', estimator=model, cv=cvFold, param_grid=grid)
    search = clf.fit(x_train, y_train)
    result = search.best_params_
    print(result)
    return result

"""
Provided model is used to predict unlabelled data. Writes the results to a .csv file of given name
model : object
name : string
"""
def create_file(model, name):
    predictions = pd.DataFrame(model.predict(to_test))
    predictions = predictions.replace(to_replace=[1.0, 0.0],
           value=[True, False])
    predictions = predictions.rename({0: "Transported"}, axis=1)
    pd.concat([df_test_id, predictions],axis = 1).to_csv(name+".csv",index = False)

# Training Models with Optimized Parameters

In [16]:
from sklearn.ensemble import GradientBoostingClassifier
distributions = {'n_estimators': range(1, 200),
                'max_depth': range(1, 20)}
params = best_params(GradientBoostingClassifier(), distributions)
gbc = GradientBoostingClassifier(n_estimators = params['n_estimators'], max_depth = params['max_depth'])
gbc.fit(x_train, y_train)

print(f1_score(y_train, gbc.predict(x_train)))
print(f1_score(y_test, gbc.predict(x_test)))

{'n_estimators': 106, 'max_depth': 4}
0.8481415929203541
0.8244897959183672


In [17]:
from sklearn.ensemble import ExtraTreesClassifier
distributions = {'n_estimators': range(1, 100),
                'max_depth': range(1, 20)}
params = best_params(ExtraTreesClassifier(), distributions)
et = ExtraTreesClassifier(n_estimators = params['n_estimators'], max_depth = params['max_depth'])
et.fit(x_train, y_train)

print(f1_score(y_train, et.predict(x_train)))
print(f1_score(y_test, et.predict(x_test)))

{'n_estimators': 37, 'max_depth': 18}
0.9282933567750848
0.7747747747747746


In [18]:
from sklearn.ensemble import RandomForestClassifier
distributions = {'n_estimators': range(1, 100),
                'max_depth': range(1, 20)}
params = best_params(RandomForestClassifier(), distributions)
rf = RandomForestClassifier(n_estimators = params['n_estimators'], max_depth = params['max_depth'])
rf.fit(x_train, y_train)

print(f1_score(y_train, rf.predict(x_train)))
print(f1_score(y_test, rf.predict(x_test)))

{'n_estimators': 69, 'max_depth': 12}
0.904710535778497
0.803347280334728


In [19]:
from sklearn.tree import DecisionTreeClassifier
distributions = {'criterion': ['gini', 'entropy'],
                'max_depth': range(1, 20)}
params = best_params_grid(DecisionTreeClassifier(), distributions)
dtc = DecisionTreeClassifier(criterion = params['criterion'], max_depth = params['max_depth'])
dtc.fit(x_train, y_train)

print(f1_score(y_train, dtc.predict(x_train)))
print(f1_score(y_test, dtc.predict(x_test)))

{'criterion': 'gini', 'max_depth': 6}
0.8054288321167884
0.7857142857142858


In [22]:
from sklearn.neural_network import MLPClassifier
distributions = {'activation': ['logistic', 'tanh', 'relu'],
                'max_iter': [200, 300, 400, 500]}
params = best_params_grid(MLPClassifier(early_stopping=True), distributions)
mlp = MLPClassifier(activation = params['activation'], max_iter = params['max_iter'] )
mlp.fit(x_train, y_train)

print(f1_score(y_train, mlp.predict(x_train)))
print(f1_score(y_test, mlp.predict(x_test)))

{'activation': 'relu', 'max_iter': 300}
0.8453807967066231
0.8118393234672303




In [20]:
from sklearn.ensemble import AdaBoostClassifier
distributions = {'base_estimator': [et, rf, dtc]}
params = best_params_grid(AdaBoostClassifier(), distributions)
ada = AdaBoostClassifier(base_estimator = params['base_estimator'])
ada.fit(x_train, y_train)

print(f1_score(y_train, ada.predict(x_train)))
print(f1_score(y_test, ada.predict(x_test)))

{'base_estimator': RandomForestClassifier(max_depth=12, n_estimators=69)}
0.9998792124652737
0.8193832599118942


In [21]:
from sklearn.ensemble import BaggingClassifier
distributions = {'base_estimator': [gbc, et, rf, dtc]}
params = best_params_grid(BaggingClassifier(), distributions)
bag = BaggingClassifier(base_estimator = params['base_estimator'])
bag.fit(x_train, y_train)

print(f1_score(y_train, bag.predict(x_train)))
print(f1_score(y_test, bag.predict(x_test)))

{'base_estimator': GradientBoostingClassifier(max_depth=4, n_estimators=106)}
0.842067565954399
0.8329896907216495


# Picking Models with Higher Performance

In [23]:
# models with f1 score > 0.8
create_file(gbc, "gbc_106_4")
create_file(rf, "rf_69_12")
create_file(mlp, "mlp_relu300")
create_file(ada, "ada_rf_69_12")
create_file(bag, "bag_gbc_106_4")