In [1]:
import pandas as pd 
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
df_train = pd.read_csv('C:/Users/Lenovo/Downloads/train.csv').copy()
df_test = pd.read_csv('C:/Users/Lenovo/Downloads/test.csv').copy()

In [3]:
df_train.shape

(891, 12)

In [4]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
def sx(s):
    if s=='male':
        return 1
    else:
        return 0

In [6]:
df_train['Sex'] = df_train['Sex'].apply(sx)
df_test['Sex'] = df_test['Sex'].apply(sx)

In [7]:
#Replacing the embarked Nan values in training data
Embarked_Nan = df_train[(df_train.Ticket.str.startswith('113'))]['Embarked'].mode()[0]
df_train.Embarked.fillna(Embarked_Nan,inplace=True)

In [8]:
#Replacing Nans in age column with the median of group having the same sex and PassengerClass
df_train['Age'] = df_train['Age'].fillna(df_train.groupby(['Sex', 'Pclass'])['Age'].transform('mean'))
df_test['Age'] = df_test['Age'].fillna(df_test.groupby(['Sex', 'Pclass'])['Age'].transform('mean'))

In [9]:
# Replacing null value of fare in test_data
df_test['Fare'] = df_test['Fare'].fillna(df_test.groupby('Pclass')['Fare'].transform('mean'))

In [10]:
#Splitting the embarked column into 3 columns one for each of the ports
def three_ports(df):
    ports  = df.Embarked.unique()
    for i in ports:
        df[i] = (df['Embarked']==i).astype('int64')
    return df

In [11]:
df_train = three_ports(df_train)
df_test = three_ports(df_test)

In [12]:
# Adding a column containing the numerical part of the tickets
df_train.Ticket.replace({'LINE':'0000'},inplace=True)
df_train['n_tic'] = df_train['Ticket'].apply(lambda x : int(x.split()[-1]))
df_test['n_tic'] = df_test['Ticket'].apply(lambda x : int(x.split()[-1]))

In [13]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Q,S,C,n_tic
0,892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,,Q,1,0,0,330911
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0,,S,0,1,0,363272
2,894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,,Q,1,0,0,240276
3,895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,,S,0,1,0,315154
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,,S,0,1,0,3101298


In [32]:
train_df = df_train.select_dtypes(include=np.number).copy()
test_df = df_test.select_dtypes(include=np.number).copy()
y_train = train_df.Survived.values
del train_df['Survived']
train_df

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,S,C,Q,n_tic
0,1,3,1,22.00,1,0,7.2500,1,0,0,21171
1,2,1,0,38.00,1,0,71.2833,0,1,0,17599
2,3,3,0,26.00,0,0,7.9250,1,0,0,3101282
3,4,1,0,35.00,1,0,53.1000,1,0,0,113803
4,5,3,1,35.00,0,0,8.0500,1,0,0,373450
...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,1,27.00,0,0,13.0000,1,0,0,211536
887,888,1,0,19.00,0,0,30.0000,1,0,0,112053
888,889,3,0,21.75,1,2,23.4500,1,0,0,6607
889,890,1,1,26.00,0,0,30.0000,0,1,0,111369


In [24]:
#Adding Quadratic Feaetures
columns = train_df.columns[1:]
n = len(columns)
for i in range(n):
    for j in range(i,n):
        train_df[columns[i]+'*'+columns[j]] = train_df[columns[i]]*train_df[columns[j]]
        test_df[columns[i]+'*'+columns[j]] = test_df[columns[i]]*test_df[columns[j]]

In [25]:
train_df.columns

Index(['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'S',
       'C', 'Q', 'n_tic', 'Pclass*Pclass', 'Pclass*Sex', 'Pclass*Age',
       'Pclass*SibSp', 'Pclass*Parch', 'Pclass*Fare', 'Pclass*S', 'Pclass*C',
       'Pclass*Q', 'Pclass*n_tic', 'Sex*Sex', 'Sex*Age', 'Sex*SibSp',
       'Sex*Parch', 'Sex*Fare', 'Sex*S', 'Sex*C', 'Sex*Q', 'Sex*n_tic',
       'Age*Age', 'Age*SibSp', 'Age*Parch', 'Age*Fare', 'Age*S', 'Age*C',
       'Age*Q', 'Age*n_tic', 'SibSp*SibSp', 'SibSp*Parch', 'SibSp*Fare',
       'SibSp*S', 'SibSp*C', 'SibSp*Q', 'SibSp*n_tic', 'Parch*Parch',
       'Parch*Fare', 'Parch*S', 'Parch*C', 'Parch*Q', 'Parch*n_tic',
       'Fare*Fare', 'Fare*S', 'Fare*C', 'Fare*Q', 'Fare*n_tic', 'S*S', 'S*C',
       'S*Q', 'S*n_tic', 'C*C', 'C*Q', 'C*n_tic', 'Q*Q', 'Q*n_tic',
       'n_tic*n_tic'],
      dtype='object')

In [33]:
x_test = test_df.iloc[:,1:]
x_train = train_df.iloc[:,1:]

In [34]:
x_train.shape,x_test.shape

((891, 10), (418, 10))

In [50]:
# Scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
s_x_train = scaler.fit_transform(x_train)
s_x_test = scaler.transform(x_test)

In [37]:
#Removing features through backward elemination.
clf = GaussianNB()
n = x_train.shape[1]
print(n)
d = []
abc = GridSearchCV(clf,{},n_jobs=3)
abc.fit(s_x_train,y_train)
so = abc.best_score_
for i in range(n):
    
    x_train_without_i = np.delete(s_x_train,i,axis=1)
    
    abc1 = GridSearchCV(clf,{},n_jobs=3)
    abc1.fit(x_train_without_i,y_train)
    s1 = abc1.best_score_
#     print('s1:',s1,'  so:',so)
    if s1-so>=0.001:
        d.append(i)
        print('Removing',i,'th feature     , Increase in score:',s1-so)

10
Removing 7 th feature     , Increase in score: 0.006691356474797572
Removing 8 th feature     , Increase in score: 0.013401544159186596


In [38]:
d

[7, 8]

In [51]:
s_x_train = np.delete(s_x_train,d,1)
s_x_test = np.delete(s_x_test,d,1)

In [52]:
s_x_train.shape

(891, 8)

In [54]:
# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
# grid = {'n_neighbors':[3,5,7,9],'p':[2,1]}
abc = GridSearchCV(clf,{},n_jobs=3,verbose=4)
abc.fit(s_x_train,y_train)
abc.best_score_

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 out of   5 | elapsed:    4.2s remaining:    6.4s
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:    4.3s finished


0.7789467076768564

In [119]:
abc.best_estimator_

KNeighborsClassifier(n_neighbors=7, p=1)

In [42]:
y_pred = abc.predict(s_x_test)

In [43]:
df_pred = pd.DataFrame({'PassengerID':df_test['PassengerId'],'Survived':y_pred})
df_pred.to_csv('pred.csv',index=False)