In [143]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [144]:
train = pd.read_csv("train.csv")

In [145]:
def process_family(df):
    df['FamilySize'] = df['Parch'] + df['SibSp'] + 1
    df['Single'] = df['FamilySize'].map(lambda s:1 if s == 1 else 0)
    df['SmallFamily'] = df['FamilySize'].map(lambda s:1 if 2 <= s <= 4 else 0)
    df['LargeFamily'] = df['FamilySize'].map(lambda s:1 if s >= 5 else 0)
    return df

In [146]:
train_mod_data = process_family(train)

In [147]:
def process_embarked(df):
    df.Embarked.fillna('S', inplace=True)
    df_dummies = pd.get_dummies(df['Embarked'], prefix='Embarked')
    df = pd.concat([df, df_dummies], axis=1)
    df.drop('Embarked', axis = 1, inplace=True)
    return df

In [148]:
train_mod_data = process_embarked(train_mod_data)

In [149]:
def process_cabin(df):
    df.Cabin.fillna('U', inplace=True)
    print(df['Cabin'])
    df['Cabin'] = df['Cabin'].map(lambda c:c[0])
    df_dummies = pd.get_dummies(df['Cabin'], prefix='Cabin')
    df = pd.concat([df, df_dummies], axis=1)
    df.drop('Cabin', axis=1, inplace=True)
    return df

In [150]:
train_mod_data = process_cabin(train_mod_data)

0         U
1       C85
2         U
3      C123
4         U
       ... 
886       U
887     B42
888       U
889    C148
890       U
Name: Cabin, Length: 891, dtype: object


In [151]:
train_mod_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Embarked_S,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,...,1,0,0,0,0,0,0,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,0,0,0,1,0,0,0,0,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,...,1,0,0,0,0,0,0,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,...,1,0,0,1,0,0,0,0,0,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,...,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,...,1,0,0,0,0,0,0,0,0,1
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,...,1,0,1,0,0,0,0,0,0,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,...,1,0,0,0,0,0,0,0,0,1
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,...,0,0,0,1,0,0,0,0,0,0


In [152]:
titles = set()
for names in train_mod_data['Name']:
    titles.add(names.split(',')[1].split('.')[0].strip())

In [132]:
titles

{'Capt',
 'Col',
 'Don',
 'Dr',
 'Jonkheer',
 'Lady',
 'Major',
 'Master',
 'Miss',
 'Mlle',
 'Mme',
 'Mr',
 'Mrs',
 'Ms',
 'Rev',
 'Sir',
 'the Countess'}

In [153]:
Title_Dict = {
    'Capt':'Officer',
    'Col' : 'Officer',
    'Don' : 'Officer',
    'Dr' : 'Royalty', 
    'Jonkheer': 'Royalty',
    'Lady': 'Royalty',
    'Major' : "Officer",
    'Master' : 'Master',
    'Miss' : 'Miss',
    'Mlle' : 'Miss',
    'Mme' : 'Mrs',
    'Mr' : 'Mr',
    'Mrs' : 'Mrs',
    'Ms' : 'Mrs',
    'Rev' : 'Officer', 
    'Sir' : 'Royalty',
    'the Countess' : 'Royalty'
}

def get_titles(df):
    df['Title'] =  df['Name'].map(lambda name: name.split(',')[1].split('.')[0].strip())
    df['Title'] = df.Title.map(Title_Dict)
    return df

In [154]:
train_mod_data = get_titles(train_mod_data)

In [155]:
group_train = train_mod_data.groupby(['Sex', 'Pclass', 'Title'])
group_median_train = group_train.median()
print(group_median_train.shape)
group_median_train = group_median_train.reset_index()[['Sex', 'Pclass', 'Title', 'Age']]
print(group_median_train.shape)

(17, 22)
(17, 4)


In [156]:
def fill_age(row):
    condition = (
        (group_median_train['Sex'] == row['Sex']) &
        (group_median_train['Title'] == row['Title']) &
        (group_median_train['Pclass'] == row['Pclass'])
    )
    if np.isnan(group_median_train[condition]['Age'].values[0]):
        print('true')
        condition = (
            (group_median_train['Sex'] == row['Sex']) &
            (group_median_train['Pclass'] == row['Pclass'])
        )
    return group_median_train[condition]['Age'].values[0]

def get_age(df):
    df['Age'] = df.apply(lambda row: fill_age(row) if np.isnan(row['Age']) else row['Age'], axis = 1)
    return df

In [157]:
train_mod_data = get_age(train_mod_data)

In [158]:
def process_names(df):
    df.drop('Name', axis =1, inplace=True)

    title_dummies = pd.get_dummies(df['Title'], prefix='Title')
    df = pd.concat([df, title_dummies], axis=1)
    df.drop('Title', axis=1, inplace=True)
    return df

In [159]:
train_mod_data = process_names(train_mod_data)

In [160]:
print(train_mod_data)

     PassengerId  Survived  Pclass     Sex   Age  SibSp  Parch  \
0              1         0       3    male  22.0      1      0   
1              2         1       1  female  38.0      1      0   
2              3         1       3  female  26.0      0      0   
3              4         1       1  female  35.0      1      0   
4              5         0       3    male  35.0      0      0   
..           ...       ...     ...     ...   ...    ...    ...   
886          887         0       2    male  27.0      0      0   
887          888         1       1  female  19.0      0      0   
888          889         0       3  female  18.0      1      2   
889          890         1       1    male  26.0      0      0   
890          891         0       3    male  32.0      0      0   

               Ticket     Fare  FamilySize  ...  Cabin_F  Cabin_G  Cabin_T  \
0           A/5 21171   7.2500           2  ...        0        0        0   
1            PC 17599  71.2833           2  ...    

In [161]:
train_mod_data = train_mod_data.assign(sex_class = train_mod_data['Sex'] + "_" + train_mod_data['Pclass'].astype("str"))
train_mod_data["Sex"] = train_mod_data["Sex"].map({"female":0, "male":1})
train_mod_data["sex_class"] = train_mod_data["sex_class"].map({"female_1":0, "female_2":1, "female_3":2, "male_1":4, "male_2":5, "male_3":6})
# train["fsize"] = train["SibSp"] + train["Parch"] + 1
train_mod_data.drop("Ticket", axis=1, inplace=True)
#train.drop("Cabin", axis=1,inplace=True)
#train["Embarked"] = train["Embarked"].map({"S":1, "Q":2, "C":3, "":0})
#train = train.fillna(0)
# train.drop("Embarked", axis=1, inplace=True)
train_mod_data.drop("PassengerId", axis=1, inplace = True)
survived = train_mod_data["Survived"]
# print(survived)
train_mod_data.drop("Survived", axis = 1, inplace= True)
print(train_mod_data)
labels = survived.to_numpy()
data = train_mod_data.to_numpy()

     Pclass  Sex   Age  SibSp  Parch     Fare  FamilySize  Single  \
0         3    1  22.0      1      0   7.2500           2       0   
1         1    0  38.0      1      0  71.2833           2       0   
2         3    0  26.0      0      0   7.9250           1       1   
3         1    0  35.0      1      0  53.1000           2       0   
4         3    1  35.0      0      0   8.0500           1       1   
..      ...  ...   ...    ...    ...      ...         ...     ...   
886       2    1  27.0      0      0  13.0000           1       1   
887       1    0  19.0      0      0  30.0000           1       1   
888       3    0  18.0      1      2  23.4500           4       0   
889       1    1  26.0      0      0  30.0000           1       1   
890       3    1  32.0      0      0   7.7500           1       1   

     SmallFamily  LargeFamily  ...  Cabin_G  Cabin_T  Cabin_U  Title_Master  \
0              1            0  ...        0        0        1             0   
1            

In [5]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [52]:
train = pd.read_csv("train.csv")
train.drop("Name", axis=1, inplace=True)
train.drop("Age", axis=1, inplace=True)
train = train.assign(sex_class = train['Sex'] + "_" + train['Pclass'].astype("str"))
train["Sex"] = train["Sex"].map({"female":0, "male":1})
train["sex_class"] = train["sex_class"].map({"female_1":0, "female_2":1, "female_3":2, "male_1":4, "male_2":5, "male_3":6})
train["fsize"] = train["SibSp"] + train["Parch"] + 1
train.drop("Ticket", axis=1, inplace=True)
train.drop("Cabin", axis=1,inplace=True)
train["Embarked"] = train["Embarked"].map({"S":1, "Q":2, "C":3, "nan":0})
train= train.fillna(0)
# print(train)
# train.drop("Embarked", axis=1, inplace=True)
train.drop("PassengerId", axis=1, inplace = True)
survived = train["Survived"]
# print(survived)
train.drop("Survived", axis = 1, inplace= True)
labels = survived.to_numpy()
# print(labels)
data = train.to_numpy()
print(data)

[[3. 1. 1. ... 1. 6. 2.]
 [1. 0. 1. ... 3. 0. 2.]
 [3. 0. 0. ... 1. 2. 1.]
 ...
 [3. 0. 1. ... 1. 2. 4.]
 [1. 1. 0. ... 3. 4. 1.]
 [3. 1. 0. ... 2. 6. 1.]]


In [49]:
mat = np.matrix(data)
df = pd.DataFrame(data=mat.astype(float))
df.to_csv('outfile.csv', sep = ',', header=False, float_format='%.2f', index=False)

# file_data = open("data.csv", "w")
# for line in mat:
#     np.savetxt(file_data, line, fmt="%.3f")

In [162]:
######Standardize the data
mean = np.zeros((len(data[0, :])), dtype=np.float)
std = np.zeros((len(data[0, :])), dtype=np.float)
data_std = np.zeros((data.shape), dtype=np.float)
print(mean.shape)
# print(data_mean[9])
for i in range(len(data[0, :])):
    mean[i] = np.mean(data[:, i])
    # print("mean = {}".format(mean[i]))
    std[i] = np.std(data[:, i])
    # print("std = {}".format(std[i]))
    data_std[:, i] = (data[:, i] - mean[i]) / std[i]

print(data_std)

(29,)
[[ 0.82737724  0.73769513 -0.5290818  ... -0.11684125 -0.1118034
   0.96189149]
 [-1.56610693 -1.35557354  0.658135   ... -0.11684125 -0.1118034
  -1.78504864]
 [ 0.82737724 -1.35557354 -0.2322776  ... -0.11684125 -0.1118034
  -0.86940193]
 ...
 [ 0.82737724 -1.35557354 -0.825886   ... -0.11684125 -0.1118034
  -0.86940193]
 [-1.56610693  0.73769513 -0.2322776  ... -0.11684125 -0.1118034
   0.04624478]
 [ 0.82737724  0.73769513  0.2129287  ... -0.11684125 -0.1118034
   0.96189149]]


In [113]:
print(data_std.shape)

(891, 30)


In [55]:
mat = np.matrix(data)
df = pd.DataFrame(data=mat.astype(float))
df.to_csv('outfile_std.csv', sep = ',', header=False, float_format='%.2f', index=False)

In [163]:
##### covariance matrix
cov_mat = np.cov(data_std.T)
print(cov_mat)

[[ 1.00112360e+00  1.32048693e-01 -4.15297883e-01  8.31747127e-02
   1.84633934e-02 -5.50117035e-01  6.60710623e-02  1.35359071e-01
  -2.23802470e-01  1.52536803e-01 -2.43565447e-01  2.21257244e-01
   7.41359906e-02 -2.05164721e-01 -3.69987297e-01 -4.17516314e-01
  -2.79003435e-01 -2.30349837e-01  1.10757793e-02  5.56236463e-02
  -5.25550077e-02  7.26356457e-01  8.21736035e-02 -9.26815019e-03
   1.42857956e-01 -1.54610541e-01 -1.13198515e-01 -1.50938238e-01
   4.98510551e-01]
 [ 1.32048693e-01  1.00112360e+00  1.02956253e-01 -1.14759609e-01
  -2.45764790e-01 -1.82537702e-01 -2.01214274e-01  3.03987361e-01
  -2.61040082e-01 -1.03069395e-01 -8.29465631e-02 -7.41983984e-02
   1.19357709e-01  7.83586495e-02 -1.09812319e-01 -5.87152557e-02
  -7.93371764e-02 -4.70553347e-02 -8.21154476e-03 -9.11336922e-02
   2.47553808e-02  1.40548726e-01  1.60114193e-01 -6.92324952e-01
   8.68308566e-01 -5.53306741e-01  8.62900658e-02  1.86701721e-02
   9.26347992e-01]
 [-4.15297883e-01  1.02956253e-01  1.0

In [164]:
#####Eigen decomposition of the covirance matrix
eigen_values, eigen_vectors = np.linalg.eig(cov_mat)
print("Eigen vals = {}".format(eigen_values))
print("Eigen vactors = {}".format(eigen_vectors))

Eigen vals = [ 5.17167594e+00  3.79149226e+00  2.50564842e+00  1.87523924e+00
  1.61260243e+00  1.19432533e-01  8.41703315e-03  2.43698521e-01
  3.97754230e-01  4.48728359e-01  5.48210808e-01  1.29154021e+00
  1.20771459e+00  7.96252326e-01  8.25335028e-01  1.11882885e+00
  1.08431305e+00  9.48133863e-01  9.68214127e-01  1.04825451e+00
  1.02233459e+00  9.94054185e-01  1.00470917e+00  1.55619838e-15
 -1.27809422e-15  2.24694297e-17  1.42557210e-16 -3.11031488e-15
 -3.70220084e-15]
Eigen vactors = [[-1.58975650e-01 -3.60638613e-01 -2.50886140e-01  2.00411795e-02
  -1.52296295e-01  4.45087530e-02  1.51021249e-01 -7.80744135e-01
  -1.39500752e-01  5.70374599e-02 -1.29871116e-02 -2.39419373e-02
  -3.16814027e-02  7.73062185e-02  1.20461335e-02 -3.29209341e-02
  -6.05896107e-03 -4.04006062e-02 -1.09059935e-01  7.77647511e-03
   2.28081727e-02 -2.81222853e-02 -2.49637175e-02  8.74761158e-02
   2.48151008e-01 -5.87689019e-02 -1.46179679e-02 -1.62366254e-03
  -7.05870638e-03]
 [-3.15318736e-01

In [165]:
#### Calculating the explained variance of each component

variance_explained = []
for i in eigen_values:
     variance_explained.append((i/sum(eigen_values))*100)
        
print(variance_explained)

[17.813350318093885, 13.0594377202867, 8.63046981447976, 6.459084807744857, 5.554457056185954, 0.4113741018084882, 0.028991677323383657, 0.8393965850439148, 1.3700269558466731, 1.5456025381896812, 1.8882604556674312, 4.448588509979161, 4.159859078506043, 2.742616083670104, 2.8427887093558675, 3.853700528185495, 3.734814096895485, 3.2657577223217644, 3.334922299523515, 3.6106138507283623, 3.5213351242578046, 3.4239259429361923, 3.46062602296952, 5.360178630743743e-15, -4.402275065111609e-15, 7.739383255884737e-17, 4.910248716319847e-16, -1.0713186427067504e-14, -1.2751881822471154e-14]


In [168]:
#### Identify features that explains most of the data

commlative_variance_explained = np.cumsum(variance_explained)
print(commlative_variance_explained)

[ 17.81335032  30.87278804  39.50325785  45.96234266  51.51679972
  51.92817382  51.9571655   52.79656208  54.16658904  55.71219158
  57.60045203  62.04904054  66.20889962  68.9515157   71.79430441
  75.64800494  79.38281904  82.64857676  85.98349906  89.59411291
  93.11544803  96.53937398 100.         100.         100.
 100.         100.         100.         100.        ]


In [118]:
# Visualizing the eigenvalues and finding the "elbow" in the graphic
sns.lineplot(x = [1,2,3,4, 5, 6, 7, 8, 9, 10, 11, 12, 13], y=commlative_variance_explained)
plt.xlabel("Number of components")
plt.ylabel("Cumulative explained variance")
plt.title("Explained variance vs Number of components")

ValueError: arrays must all be same length

In [167]:
###  using first 6 components as they explain 100% of the dataset

projection_matrix = (eigen_vectors.T[:][:23]).T
print(projection_matrix)

[[-1.58975650e-01 -3.60638613e-01 -2.50886140e-01  2.00411795e-02
  -1.52296295e-01  4.45087530e-02  1.51021249e-01 -7.80744135e-01
  -1.39500752e-01  5.70374599e-02 -1.29871116e-02 -2.39419373e-02
  -3.16814027e-02  7.73062185e-02  1.20461335e-02 -3.29209341e-02
  -6.05896107e-03 -4.04006062e-02 -1.09059935e-01  7.77647511e-03
   2.28081727e-02 -2.81222853e-02 -2.49637175e-02]
 [-3.15318736e-01 -1.00110849e-01  3.93657570e-01  1.36925994e-01
  -2.98535427e-02 -1.07381032e-02 -5.35164198e-01  1.00088316e-01
   1.53374322e-02 -1.51378507e-02 -4.21425494e-02  6.58901083e-02
   1.02600762e-02  4.86206204e-03  1.47255096e-02 -1.23258373e-02
   3.90565775e-02 -3.62752678e-02  6.36456710e-02 -1.53273069e-02
   1.48715117e-02  6.63875393e-02 -1.49456073e-02]
 [-6.51400055e-02  2.82458035e-01  1.86291328e-01 -1.79562662e-01
  -5.74393213e-03  9.32030645e-03  1.90441525e-03 -2.73345915e-01
   7.07798133e-01 -9.22778324e-02  1.19156631e-04 -3.85402005e-01
   6.59740062e-02  1.41348285e-01 -2.384

In [120]:
#### Getting the product of original std data and projection matrix

data_pca = data_std.dot(projection_matrix)
print(data_pca)

[[ 0.74738634  1.46003659 -0.72511018 ... -0.39071501  0.2636802
   0.03554106]
 [-3.89539371 -3.36047488 -0.63013659 ...  0.86834341 -0.74860486
  -0.96740305]
 [-0.04970347  0.01467653  2.93982598 ...  0.42895151  0.25755798
   0.068796  ]
 ...
 [-2.73919355  1.56780724  1.47541259 ... -1.04485799  0.09610422
  -0.64028178]
 [ 0.49870651 -2.77572297 -1.6744468  ... -0.10118222 -0.47632087
  -1.46310241]
 [ 2.46113535  0.32849012  1.0378158  ... -1.12736719 -0.34666441
  -0.1418607 ]]


In [70]:
mat = np.matrix(data_pca)
df = pd.DataFrame(data=mat.astype(float))
df.to_csv('outfile_pca.csv', sep = ',', header=False, float_format='%.2f', index=False)

In [60]:
import pandas as pd 
import numpy as np
from sklearn.ensemble import RandomForestClassifier

def data_util(data_path):
    df = pd.read_csv(data_path)
    # train = train[:50]
    df = process_family(df)
    df = process_embarked(df)
    df = process_cabin(df)
    df = get_titles(df)
    df = get_age(df)
    df = process_names(df)
    # df = df.assign(sex_class = df['Sex'] + "_" + df['Pclass'].astype("str"))
    # df["Sex"] = df["Sex"].map({"female":0, "male":1})
    # df["sex_class"] = df["sex_class"].map({"female_1":0, "female_2":1, "female_3":2, "male_1":4, "male_2":5, "male_3":6})
    sex_dummies = pd.get_dummies(df['Sex'], prefix='Sex')
    df = pd.concat([df, sex_dummies], axis=1)
    df.drop('Sex', axis=1, inplace=True)
    df.drop("Ticket", axis=1, inplace=True)
    # df.drop("PassengerId", axis=1, inplace = True)
    return df
    # survived = df["Survived"]
    # df.drop("Survived", axis = 1, inplace= True)

    # train = train[:60]
    # print(train.shape)

    # labels = survived.to_numpy()
    # data = train.to_numpy()
    # data = self.get_data_pca(data)
    # data = data[:60]
    # print(data[:5])
    # print(data.shape)


def get_data(data_path, split):
    df = data_util(data_path)
    if split == 'train':
        survived = df["Survived"]
        df.drop("Survived", axis = 1, inplace= True)
        labels = survived.to_numpy()
        print(df.columns)
        # data = df.to_numpy()
        # data = get_data_pca(data)
        data = df[60:]
        labels = survived[60:]
        print(data.shape)
        # print(data[:5])
    elif split == 'val' :
        survived = df["Survived"]
        df.drop("Survived", axis = 1, inplace= True)
        labels = survived.to_numpy()
        # data = df.to_numpy()
        # data = get_data_pca(data)
        data = df[:60]
        labels = survived[:60]
        print(data.shape)
        # print(data[:5])

    elif(split == 'test'): 
        # print(df.isnull().sum())
        # print(df.loc[df['Fare'].isnull()].index)
        df['Fare'].fillna(0, inplace=True)
        df['Cabin_T'] = 0
        # print(df.isnull().sum())
        print(df.columns)
        data = df
        # data = get_data_pca(data)
        labels = np.zeros((len(data[:,0])))
        # data = self.get_data_pca(data)      
    
    # print(len(data[:, 0]))
    # print(data[190], labels[190])
    # print(labels.shape, data.shape)
    return data, labels

def process_family(df):
    df['FamilySize'] = df['Parch'] + df['SibSp'] + 1
    df['Single'] = df['FamilySize'].map(lambda s:1 if s == 1 else 0)
    df['SmallFamily'] = df['FamilySize'].map(lambda s:1 if 2 <= s <= 4 else 0)
    df['LargeFamily'] = df['FamilySize'].map(lambda s:1 if s >= 5 else 0)
    return df

def process_embarked(df):
    df.Embarked.fillna('S', inplace=True)
    # df['Embarked'] = df['Embarked'].map(lambda c:'S' if c == "" else c)
    df_dummies = pd.get_dummies(df['Embarked'], prefix='Embarked')
    df = pd.concat([df, df_dummies], axis=1)
    df.drop('Embarked', axis = 1, inplace=True)
    return df

def process_cabin(df):
    # df.Cabin = df.Cabin.fillna('U')
    df['Cabin'].fillna('U', inplace=True)
    # df['Cabin'] = df['Cabin'].replace(np.nan , 'U')
    # df['Cabin'] = df['Cabin'].map(lambda c:'U' if c == "" else c)
    # print(df['Cabin'])
    df['Cabin'] = df['Cabin'].map(lambda c : c[0])
    df_dummies = pd.get_dummies(df['Cabin'], prefix='Cabin')
    df = pd.concat([df, df_dummies], axis=1)
    df.drop('Cabin', axis=1, inplace=True)
    return df

def get_titles(df):
    Title_Dict = {
        'Capt':'Officer',
        'Col' : 'Officer',
        'Don' : 'Officer',
        'Dr' : 'Royalty', 
        'Jonkheer': 'Royalty',
        'Lady': 'Royalty',
        'Major' : "Officer",
        'Master' : 'Master',
        'Miss' : 'Miss',
        'Mlle' : 'Miss',
        'Mme' : 'Mrs',
        'Mr' : 'Mr',
        'Mrs' : 'Mrs',
        'Ms' : 'Mrs',
        'Rev' : 'Officer', 
        'Sir' : 'Royalty',
        'the Countess' : 'Royalty'
    }
    df['Title'] =  df['Name'].map(lambda name: name.split(',')[1].split('.')[0].strip())
    df['Title'] = df.Title.map(Title_Dict)
    return df

def fill_age(row, group_median_train):
    condition = (
        (group_median_train['Sex'] == row['Sex']) &
        (group_median_train['Title'] == row['Title']) &
        (group_median_train['Pclass'] == row['Pclass'])
    )
    if np.isnan(group_median_train[condition]['Age'].values[0]):
        print('true')
        condition = (
            (group_median_train['Sex'] == row['Sex']) &
            (group_median_train['Pclass'] == row['Pclass'])
        )
    return group_median_train[condition]['Age'].values[0]

def get_age(df):
    group_train = df.groupby(['Sex', 'Pclass', 'Title'])
    group_median_train = group_train.median()
    # print(group_median_train.columns)
    group_median_train = group_median_train.reset_index()[['Sex', 'Pclass', 'Title','Age']]
    # print(group_median_train)
    df['Age'] = df.apply(lambda row: fill_age(row,group_median_train) if np.isnan(row['Age']) else row['Age'], axis = 1)
    return df

def process_names(df):
    df.drop('Name', axis =1, inplace=True)

    title_dummies = pd.get_dummies(df['Title'], prefix='Title')
    df = pd.concat([df, title_dummies], axis=1)
    df.drop('Title', axis=1, inplace=True)
    return df

def get_data_pca(data):
    ######Standardize the data
    mean = np.zeros((len(data[0, :])), dtype=np.float)
    std = np.zeros((len(data[0, :])), dtype=np.float)
    data_std = np.zeros((data.shape), dtype=np.float)
    for i in range(len(data[0, :])):
        mean[i] = np.mean(data[:, i])
        # print("mean = {}".format(mean[i]))
        std[i] = np.std(data[:, i])
        # print("std = {}".format(std[i]))
        data_std[:, i] = (data[:, i] - mean[i]) / std[i]
    # print(data_std)
    
    ##### covariance matrix
    cov_mat = np.cov(data_std.T)
    # print(cov_mat)

    #####Eigen decomposition of the covirance matrix
    eigen_values, eigen_vectors = np.linalg.eig(cov_mat)
    # print("Eigen vals = {}".format(eigen_values))
    # print("Eigen vactors = {}".format(eigen_vectors))

    #### Calculating the explained variance of each component
    variance_explained = []
    for i in eigen_values:
        variance_explained.append((i/sum(eigen_values))*100)
    # print(variance_explained)
    
    ###  using first 23 components as they explain 100% of the dataset
    projection_matrix = (eigen_vectors.T[:][:22]).T
    # print(projection_matrix)

    #### Getting the product of original std data and projection matrix
    data_pca = data_std.dot(projection_matrix)
    # print(data_pca)
    return data_pca



In [61]:
X_train, Y_train = get_data("train.csv", "train")
X_val, Y_val = get_data("train.csv", "val")
print(X_train.shape, Y_train.shape, X_val.shape, Y_val.shape)
model = RandomForestClassifier(n_estimators=180, min_samples_leaf=3, max_features=0.5, n_jobs=-1)


Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize',
       'Single', 'SmallFamily', 'LargeFamily', 'Embarked_C', 'Embarked_Q',
       'Embarked_S', 'Cabin_A', 'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E',
       'Cabin_F', 'Cabin_G', 'Cabin_T', 'Cabin_U', 'Title_Master',
       'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Officer', 'Title_Royalty',
       'Sex_female', 'Sex_male'],
      dtype='object')
(831, 30)
(60, 30)
(831, 30) (831,) (60, 30) (60,)


In [62]:
model.fit(X_train, Y_train)


RandomForestClassifier(max_features=0.5, min_samples_leaf=3, n_estimators=180,
                       n_jobs=-1)

In [65]:
model.score(X_train, Y_train)

0.9205776173285198

In [66]:
y_predict = model.predict(X_val)
from sklearn.metrics import accuracy_score
accuracy_score(Y_val, y_predict)

0.8166666666666667

In [67]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(Y_val, y_predict))

              precision    recall  f1-score   support

           0       0.82      0.85      0.84        33
           1       0.81      0.78      0.79        27

    accuracy                           0.82        60
   macro avg       0.82      0.81      0.81        60
weighted avg       0.82      0.82      0.82        60



In [68]:
print(confusion_matrix(Y_val, y_predict))

[[28  5]
 [ 6 21]]


In [69]:
importance = model.feature_importances_
# summarize feature importance
for i,v in enumerate(importance):
	print('Feature: %0d, Score: %.5f' % (i,v))

Feature: 0, Score: 0.11043
Feature: 1, Score: 0.08323
Feature: 2, Score: 0.09912
Feature: 3, Score: 0.01686
Feature: 4, Score: 0.00784
Feature: 5, Score: 0.13611
Feature: 6, Score: 0.03350
Feature: 7, Score: 0.00331
Feature: 8, Score: 0.01052
Feature: 9, Score: 0.02373
Feature: 10, Score: 0.00652
Feature: 11, Score: 0.00375
Feature: 12, Score: 0.00698
Feature: 13, Score: 0.00091
Feature: 14, Score: 0.00171
Feature: 15, Score: 0.00335
Feature: 16, Score: 0.00180
Feature: 17, Score: 0.00668
Feature: 18, Score: 0.00010
Feature: 19, Score: 0.00011
Feature: 20, Score: 0.00000
Feature: 21, Score: 0.01932
Feature: 22, Score: 0.01286
Feature: 23, Score: 0.00695
Feature: 24, Score: 0.17960
Feature: 25, Score: 0.01027
Feature: 26, Score: 0.00850
Feature: 27, Score: 0.00139
Feature: 28, Score: 0.09682
Feature: 29, Score: 0.10770


In [52]:
X_test, Y_test = get_data("test.csv", "test")
print(X_test.shape)

Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize',
       'Single', 'SmallFamily', 'LargeFamily', 'Embarked_C', 'Embarked_Q',
       'Embarked_S', 'Cabin_A', 'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E',
       'Cabin_F', 'Cabin_G', 'Cabin_U', 'Title_Master', 'Title_Miss',
       'Title_Mr', 'Title_Mrs', 'Title_Officer', 'Title_Royalty', 'Sex_female',
       'Sex_male', 'Cabin_T'],
      dtype='object')
(418, 30)


In [53]:
model.predict(X_test)

array([0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0,