<font color = green >

# Home task

</font>

## Titanic - Machine Learning from Disaster

Predict survival on the [Titanic](https://www.kaggle.com/c/titanic)
<hr/>


# Solution:


In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

### Init data

In [2]:
train_df = pd.read_csv('titanic/train.csv')
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
test_df = pd.read_csv('titanic/test.csv')
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


### Clear and fill NaN fields


In [4]:
train_df.isna().any()  # Find NaN fields

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool

In [5]:
test_df.isna().any()  # Find NaN fields

PassengerId    False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare            True
Cabin           True
Embarked       False
dtype: bool

In [6]:
def clear(data):
    data.drop(columns=['Ticket', 'Cabin', 'Name'], inplace=True)

    data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})
    data['Age'] = data['Age'].fillna(value=data['Age'].median())
    data['Age'] = data['Age'].astype(int)
    data['Fare'] = data['Fare'].fillna(data['Fare'].mean())
    data['Embarked'].fillna(value=data['Embarked'].mode()[0], inplace=True)
    data['Embarked'] = data['Embarked'].map({'S': 1, 'Q': 2, 'C': 3})

    return data


train_df = clear(train_df)
test_df = clear(test_df)

In [7]:
train_df.sample(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
430,431,1,1,0,28,0,0,26.55,1
346,347,1,2,1,40,0,0,13.0,1
321,322,0,3,0,27,0,0,7.8958,1
197,198,0,3,0,42,0,1,8.4042,1
648,649,0,3,0,28,0,0,7.55,1
277,278,0,2,0,28,0,0,0.0,1
260,261,0,3,0,28,0,0,7.75,2
761,762,0,3,0,41,0,0,7.125,1
12,13,0,3,0,20,0,0,8.05,1
721,722,0,3,0,17,1,0,7.0542,1


In [8]:
test_df.sample(10)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
305,1197,1,1,64,1,1,26.55,1
223,1115,3,0,21,0,0,7.7958,1
139,1031,3,0,40,1,6,46.9,1
391,1283,1,1,51,0,1,39.4,1
405,1297,2,0,20,0,0,13.8625,3
12,904,1,1,23,1,0,82.2667,1
195,1087,3,0,33,0,0,7.8542,1
360,1252,3,0,14,8,2,69.55,1
341,1233,3,0,32,0,0,7.5792,1
66,958,3,1,18,0,0,7.8792,2


### Try different classifiers

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = train_df[features]
y = train_df['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=47)

In [10]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=47,
                             max_depth=3,
                             ).fit(X_train, y_train)

prediction = clf.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, prediction)}')

Accuracy: 0.820627802690583


In [11]:
from sklearn.svm import SVC

clf_svc = SVC(kernel="linear",
              random_state=47,
              ).fit(X_train, y_train)

pred_svc = clf_svc.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, pred_svc)}')

Accuracy: 0.8116591928251121


In [12]:
from sklearn.ensemble import RandomForestClassifier

clf_rf = RandomForestClassifier(random_state=47,
                                ).fit(X_train, y_train)
pred_rf = clf_rf.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, pred_rf)}')

Accuracy: 0.7937219730941704


In [13]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf_rf_scaler = RandomForestClassifier(random_state=47
                                       ).fit(X_train_scaled, y_train)
pred_rf_scaler = clf_rf_scaler.predict(X_test_scaled)
print(f'Accuracy: {accuracy_score(y_test, pred_rf_scaler)}')

Accuracy: 0.7937219730941704


In [14]:
from xgboost import XGBClassifier

xgb = XGBClassifier(random_state=47
                    ).fit(X_train, y_train)
predicted_xgb = xgb.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, predicted_xgb)}')

Accuracy: 0.7982062780269058


In [15]:
from sklearn.model_selection import GridSearchCV

model = RandomForestClassifier()
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(model,
                           param_grid,
                           cv=5,
                           scoring='accuracy'
                           ).fit(X_train, y_train)

print("Best hyperparameters:", grid_search.best_params_)
print("Best accuracy:", grid_search.best_score_)

Best hyperparameters: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 50}
Best accuracy: 0.8383458646616541


In [16]:
predicted = grid_search.predict(test_df.drop('PassengerId', axis=1))
predicted

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [17]:
data1 = {'PassengerId': test_df.index, 'Survived': predicted}
output_df = pd.DataFrame(data=data1)
output_df.set_index('PassengerId')

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,1
...,...
413,0
414,1
415,0
416,0


In [18]:
output_df.to_csv('Submission.csv', index=False)