

# Data Mining: Classification. Measuring Accuracy Using Cross-Validation
---

## I. Data preparation
Read the Titanic data. And prepare the data for modeling.

In [33]:
import pandas as pd
import numpy as np
df = pd.read_csv('titanic_train.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


#### 1. We remove unnecessary variables

In [34]:
df.pop('Cabin')
df.pop('Name')
df.pop('Ticket')
df.pop('PassengerId')
df.pop('Fare')

0       7.2500
1      71.2833
2       7.9250
3      53.1000
4       8.0500
        ...   
886    13.0000
887    30.0000
888    23.4500
889    30.0000
890     7.7500
Name: Fare, Length: 891, dtype: float64

#### 2. Changing gender to binary

In [35]:
# map
df['Gender'] = df['Gender'].map( { 'female': 0, 'male': 1} ).astype(int)


In [36]:
print(df.isnull().sum())

Survived      0
Pclass        0
Gender        0
Age         177
SibSp         0
Parch         0
Embarked      2
dtype: int64


#### 3. Converting the Embarked to Dummies

In [37]:
df['Age'] = df['Age'].fillna(value = df['Age'].mean())

In [39]:
print(df.isnull().sum())
df

Survived    0
Pclass      0
Gender      0
Age         0
SibSp       0
Parch       0
Embarked    2
dtype: int64


Unnamed: 0,Survived,Pclass,Gender,Age,SibSp,Parch,Embarked
0,0,3,1,22.000000,1,0,S
1,1,1,0,38.000000,1,0,C
2,1,3,0,26.000000,0,0,S
3,1,1,0,35.000000,1,0,S
4,0,3,1,35.000000,0,0,S
...,...,...,...,...,...,...,...
886,0,2,1,27.000000,0,0,S
887,1,1,0,19.000000,0,0,S
888,0,3,0,29.699118,1,2,S
889,1,1,1,26.000000,0,0,C


In [40]:
 df.dropna(axis=0)

Unnamed: 0,Survived,Pclass,Gender,Age,SibSp,Parch,Embarked
0,0,3,1,22.000000,1,0,S
1,1,1,0,38.000000,1,0,C
2,1,3,0,26.000000,0,0,S
3,1,1,0,35.000000,1,0,S
4,0,3,1,35.000000,0,0,S
...,...,...,...,...,...,...,...
886,0,2,1,27.000000,0,0,S
887,1,1,0,19.000000,0,0,S
888,0,3,0,29.699118,1,2,S
889,1,1,1,26.000000,0,0,C


In [45]:
df = pd.concat([df, pd.get_dummies(df['Embarked'])],axis=1)
df.pop('Embarked')

0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object

#### 4. Working with missing values

In [46]:
# fillna



#### 5. Divide data into X and y (independent and dependent variables)

In [47]:
y = df['Survived']
x=df[['Pclass','Gender','Age','SibSp','Parch','C','Q','S']]

In [48]:
df

Unnamed: 0,Survived,Pclass,Gender,Age,SibSp,Parch,C,Q,S
0,0,3,1,22.000000,1,0,0,0,1
1,1,1,0,38.000000,1,0,1,0,0
2,1,3,0,26.000000,0,0,0,0,1
3,1,1,0,35.000000,1,0,0,0,1
4,0,3,1,35.000000,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
886,0,2,1,27.000000,0,0,0,0,1
887,1,1,0,19.000000,0,0,0,0,1
888,0,3,0,29.699118,1,2,0,0,1
889,1,1,1,26.000000,0,0,1,0,0


In [49]:
x

Unnamed: 0,Pclass,Gender,Age,SibSp,Parch,C,Q,S
0,3,1,22.000000,1,0,0,0,1
1,1,0,38.000000,1,0,1,0,0
2,3,0,26.000000,0,0,0,0,1
3,1,0,35.000000,1,0,0,0,1
4,3,1,35.000000,0,0,0,0,1
...,...,...,...,...,...,...,...,...
886,2,1,27.000000,0,0,0,0,1
887,1,0,19.000000,0,0,0,0,1
888,3,0,29.699118,1,2,0,0,1
889,1,1,26.000000,0,0,1,0,0


#### 6. Divide data to train and test (using the train_test_split).

In [50]:
from sklearn.model_selection import train_test_split

In [51]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

## II. Using the Random Forest Classifier for prediction
Load and train the Random Forest Classifier model on `X_train` and `y_train`.

Show the score on train data.

Show the score on test data.

In [52]:
from sklearn.ensemble import RandomForestClassifier

In [53]:
RandomForest = RandomForestClassifier(n_estimators = 400)

In [54]:
RandomForest.fit(x_train, y_train)

RandomForestClassifier(n_estimators=400)

In [55]:
RandomForest.score(x_train, y_train)

0.9505988023952096

In [56]:
RandomForest.score(x_test, y_test)

0.7668161434977578

## III. Measuring the Accurasy Using Cross Validation

In [57]:
from sklearn.model_selection import KFold

In [58]:
kf = KFold(n_splits=7, shuffle=True)

In [59]:
splits = list(kf.split(x))

In [60]:
splits

[(array([  0,   2,   4,   5,   6,   7,   8,  10,  11,  12,  13,  15,  16,
          18,  20,  21,  22,  23,  25,  27,  28,  29,  30,  31,  33,  34,
          35,  37,  38,  39,  40,  42,  43,  44,  45,  47,  48,  50,  51,
          52,  53,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
          66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
          79,  81,  82,  83,  84,  85,  86,  87,  88,  89,  91,  93,  94,
          95,  96,  97,  99, 100, 101, 103, 104, 106, 107, 108, 109, 110,
         111, 112, 113, 114, 115, 117, 118, 120, 122, 124, 125, 126, 128,
         129, 130, 131, 132, 134, 135, 136, 138, 139, 140, 141, 142, 143,
         144, 145, 146, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158,
         159, 161, 162, 163, 164, 165, 166, 168, 169, 170, 171, 173, 174,
         175, 176, 178, 179, 180, 181, 182, 184, 185, 186, 187, 188, 189,
         190, 191, 192, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204,
         205, 206, 207, 208, 209, 210,

In [61]:
rfc = RandomForestClassifier()

In [62]:
scores = []
n = 1
for train_indices, test_indices in splits:
    x_train = x.values[train_indices]
    x_test = x.values[test_indices]
    y_train = y.values[train_indices]
    y_test = y.values[test_indices]
    
    rfc.fit(x_train, y_train)
    scores.append(rfc.score(x_test, y_test))
    print(f"{n} models Score on test data: {rfc.score(x_test, y_test)}")
    n += 1

1 models Score on test data: 0.8515625
2 models Score on test data: 0.8359375
3 models Score on test data: 0.7480314960629921
4 models Score on test data: 0.8188976377952756
5 models Score on test data: 0.7244094488188977
6 models Score on test data: 0.8110236220472441
7 models Score on test data: 0.7795275590551181


#### Calculate the average score on Test Data

In [63]:
sum=0
for i in scores:
    sum=sum+i

In [64]:
sum/7

0.7956271091113611

#### Calculate the average score automatically

In [65]:
from sklearn.model_selection import cross_val_score

In [66]:
scores = cross_val_score(rfc, x, y, cv=7)

In [67]:
scores.mean()

0.8003110939257593