# A Famous Classification Task (Hands-On !)

Its time for you to build your first Classification model and run it on Titanic Survival prediction problem.

You have to load train and test sets and see the relevant details of the features yourself using pandas:

Once you have made your model and are ready with your predictions save it to a csv file, upload it on KAGGLE. See what you get.

In [109]:
# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [110]:
# Load the Titanic data set '../data/train.csv' and '../data/test.csv' into separate dataframes and view the head of dataframe
df = pd.read_csv('../Data/train.csv')
df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,3,0,22.0,1,0,7.25,1.0,0
1,1,1,38.0,1,0,71.2833,0.0,1
2,3,1,26.0,0,0,7.925,1.0,1
3,1,1,35.0,1,0,53.1,1.0,1
4,3,0,35.0,0,0,8.05,1.0,0


In [111]:
# printing missing values in dataset
print(df.isnull().sum())

Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
Survived      0
dtype: int64


In [112]:
# Replacing missing values with mean
df.fillna(df.mean(), inplace=True);

In [113]:
# printing missing values in dataset
print(df.isnull().sum())

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
Survived    0
dtype: int64


In [114]:
# Load the features to a variable X
X = df.drop(['Survived'], axis = 1)
# Load the dependent variable to y
y = df['Survived']

In [115]:
# View the head of X and y
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,22.0,1,0,7.25,1.0
1,1,1,38.0,1,0,71.2833,0.0
2,3,1,26.0,0,0,7.925,1.0
3,1,1,35.0,1,0,53.1,1.0
4,3,0,35.0,0,0,8.05,1.0


In [116]:
# Split the dataset into train and test sets where test set size should be 0.3 and random_state=1
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

In [117]:
# IMPORTING VARIOUS CLASSIFIERS
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.linear_model import LogisticRegression

### Import a Classifier of your own choice from the list below !
1. LinearSVC()
2. MLPClassifier()
3. KNeighborsClassifier()
4. SVC()
5. DecisionTreeClassifier()
6. RandomForestClassifier()
7. ExtraTreeClassifier()
8. LogisticRegression()

In [118]:
# Create an instance for the classifier
Classifier = DecisionTreeClassifier()

In [119]:
# Train the model on our X-train dataframe
Classifier.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [120]:
# Get predictions from the model using your testing set from the train.csv
predictions = Classifier.predict(X_test)

In [121]:
# Print the Accuracy score for your model, dont forget to import mertrics from sklearn library
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.75      0.81      0.78       153
          1       0.72      0.64      0.68       115

avg / total       0.74      0.74      0.74       268



In [122]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,predictions)

0.7388059701492538

# Submit results on Kaggle

In [123]:
X_test_Kaggle = pd.read_csv('../Data/test.csv')

In [124]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
114,3,1,17.0,0,0,14.4583,0.0
874,2,1,28.0,1,0,24.0,0.0
76,3,0,29.699118,0,0,7.8958,1.0
876,3,0,20.0,0,0,9.8458,1.0
674,2,0,29.699118,0,0,0.0,1.0


In [None]:
# Kaggle Submissions

In [125]:
X_test_Kaggle.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,34.5,0,0,7.8292,2
1,3,1,47.0,1,0,7.0,1
2,2,0,62.0,0,0,9.6875,2
3,3,0,27.0,0,0,8.6625,1
4,3,1,22.0,1,1,12.2875,1


In [126]:
# printing missing values in dataset
print(X_test_Kaggle.isnull().sum())

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64


In [106]:
X_test_Kaggle.fillna(X_test_Kaggle.mean(), inplace=True);

In [107]:
# printing missing values in dataset
print(X_test_Kaggle.isnull().sum())

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64


In [108]:
kaggle_predictions = Classifier.predict(X_test_Kaggle)

ValueError: Number of features of the model must match the input. Model n_features is 6 and input n_features is 7 

In [42]:
X_test_ids = pd.read_csv('../Data/testOriginal.csv')

In [43]:
results = pd.DataFrame({
    "PassengerId": X_test_ids['PassengerId'],
    "Survived": kaggle_predictions})

In [44]:
results.to_csv('Your_submission.csv', index=False)

## Congratulations on successfully building your first Classification Model !!!