In [0]:
!kaggle competitions download -c titanic -p /content # Importing titanic dataset from Kaggle

In [0]:
import pandas as pd

# Loading datasets as dataframe
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train.shape

(891, 12)

In [0]:
import numpy as np

In [7]:
# 'Embarked' column has 3 unique values which denotes the destination the passenger intended to reach.
# So, one hot encoding can be applied to extract info from this column

train_mod = pd.get_dummies(train, prefix_sep='_', columns=['Embarked', 'Sex'])
train_mod.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,0,0,1,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,1,0,0,1,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,0,0,1,1,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,0,0,1,1,0
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,0,0,1,0,1


In [8]:
# Other string variables cannot be One-Hot-Encoded, so it is safe to drop them

train_mod.drop(['Ticket', 'Cabin', 'Name'], axis = 1, inplace=True)
train_mod.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
0,1,0,3,22.0,1,0,7.25,0,0,1,0,1
1,2,1,1,38.0,1,0,71.2833,1,0,0,1,0
2,3,1,3,26.0,0,0,7.925,0,0,1,1,0
3,4,1,1,35.0,1,0,53.1,0,0,1,1,0
4,5,0,3,35.0,0,0,8.05,0,0,1,0,1


In [9]:
# Calculating missing values
train_mod.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Age            177
SibSp            0
Parch            0
Fare             0
Embarked_C       0
Embarked_Q       0
Embarked_S       0
Sex_female       0
Sex_male         0
dtype: int64

We can see that 'Age' column has a lot of missing values. We will use SimpleImputer from sklearn to address this issue.

In [0]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')
train_imp = imputer.fit_transform(train_mod) # Imputing the missing values i.e., 'Age' column.

In [0]:
train_imp = pd.DataFrame(train_imp, columns=train_mod.columns) # Imputing returns a numpy array and it can be change to pandas DataFrame with this line of code

In [27]:
train_imp.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked_C     0
Embarked_Q     0
Embarked_S     0
Sex_female     0
Sex_male       0
Name_len       0
dtype: int64

In [28]:
train_imp.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male,Name_len
0,1.0,0.0,3.0,22.0,1.0,0.0,7.25,0.0,0.0,1.0,0.0,1.0,23.0
1,2.0,1.0,1.0,38.0,1.0,0.0,71.2833,1.0,0.0,0.0,1.0,0.0,51.0
2,3.0,1.0,3.0,26.0,0.0,0.0,7.925,0.0,0.0,1.0,1.0,0.0,22.0
3,4.0,1.0,1.0,35.0,1.0,0.0,53.1,0.0,0.0,1.0,1.0,0.0,44.0
4,5.0,0.0,3.0,35.0,0.0,0.0,8.05,0.0,0.0,1.0,0.0,1.0,24.0


In [0]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [0]:
# Segregating the training data as features and targets

y = train_imp['Survived']
X = train_imp.drop(['Survived','PassengerId', 'Name_len'], axis = 1)

In [0]:
# Build a SVM classifier and return accuracy score of the model

def svm(X, y):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
  
  svc = SVC(kernel='rbf', gamma=1/X_train.shape[1], random_state=42)
  clf = svc.fit(X_train, y_train)

  y_preds = clf.predict(X_test)
  
  return(accuracy_score(y_preds, y_test))

In [0]:
# Build a Random Forest Classifier and return accuracy score of the model and feature_importances

def rfc(X, y):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
  rfc = RandomForestClassifier(n_estimators = 50, max_depth = 10, random_state = 0)
  rfc.fit(X_train, y_train)

  y_preds_rfc = rfc.predict(X_test)
  features = rfc.feature_importances_
  
  return(accuracy_score(y_preds_rfc, y_test), features)

In [46]:
svm(X,y)

0.7430167597765364

In [47]:
rfc_clf = rfc(X, y)
print(rfc_clf[0])

0.7988826815642458


In [0]:
imp_features = np.where(rfc_clf[1] > 0.1) # Selecting feature importances greater than 0.1. Higher the value, more the importance of that feature
# Here it rendered 5 features
X_mod = X[X.columns[imp_features]] # Creating a new dataframe only with important features

In [36]:
print(X.shape, X_mod.shape)

(891, 10) (891, 5)


In [23]:
print(rfc(X_mod, y)[0])

0.8100558659217877


In [48]:
svm(X_mod, y)

0.5977653631284916

In [0]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [51]:
svm(X_scaled, y) # Accuracy after scaling

0.8547486033519553

In [52]:
print(rfc(X_scaled, y)[0])

0.8715083798882681


In [69]:
X_mod_scaled = scaler.fit_transform(X_mod)

print("Accuracy of SVM after scaling and feature selection: {:.2f} \nAccuracy of RFC after scaling and feature selection: {:.2f}"
      .format(svm(X_mod_scaled, y), rfc(X_mod_scaled, y)[0]))

Accuracy of SVM after scaling and feature selection: 0.76 
Accuracy of RFC after scaling and feature selection: 0.83


**Summary**:

Accuracy of SVM before scaling and feature selection: 0.74

Accuracy of SVM after feature selection: 0.60

Accuracy of SVM after scaling: 0.85


---


Accuracy of RFC before scaling: 0.80

Accuracy of RFC after scaling: 0.87


Accuracy of RFC after feature selection: 0.81

---

However, the accuracy of both the models dropped when both scaling and feature selection are employed: RFC - 0.83 & SVM - 0.76 

Even though those are decent accuracy scores, mere scaling rendered the highest accuracy of all, in both the models.