In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for input_path in (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


On April 15, 1912, during her maiden voyage, the widely considered “unsinkable” RMS Titanic sank after colliding with an iceberg. Unfortunately, there weren’t enough lifeboats for everyone onboard, resulting in the death of 1502 out of 2224 passengers and crew.

While there was some element of luck involved in surviving, it seems some groups of people were more likely to survive than others.

In this notebook I would like to answer the question:
“what sorts of people were more likely to survive?”
using classification models that predict which passengers survived the Titanic shipwreck.

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# **Importing the datasets**

## **Train data**

In [3]:
# Read the labelled training split and preview its first five rows.
train_data = pd.read_csv(filepath_or_buffer='/kaggle/input/titanic/train.csv')
train_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


The train data contain information about 891 passengers (rows) and 12 columns.

In [4]:
train_data.shape

(891, 12)

## **Test data**

In [5]:
# Read the unlabelled test split (no 'Survived' column) and preview its first five rows.
test_data = pd.read_csv(filepath_or_buffer='/kaggle/input/titanic/test.csv')
test_data.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


The test data contain information about 418 passengers (rows) and 11 columns.

In [6]:
test_data.shape

(418, 11)

# **Features Engineering & Data Analysis**

#### For Features Engineering let's combine the train set and the test set.

In [7]:
# Stack the train rows on top of the test rows so features are engineered once for both.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the
# supported equivalent and yields the same frame.
# The original row labels are kept on purpose (no ignore_index): later cells copy columns
# back with positional slices ([:891] / [891:]) and rely on those labels lining up with
# the labels of train_data and test_data.
all_data = pd.concat([train_data, test_data])

In [8]:
all_data.shape

(1309, 12)

#### **The type of each column in the data frame:**

In [9]:
all_data.dtypes

PassengerId      int64
Survived       float64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

* #### **Checking whether there are missing data**

In [10]:
# Names of the columns that contain at least one missing value, in column order.
[column for column in all_data.columns if all_data[column].isna().any()]

['Survived', 'Age', 'Fare', 'Cabin', 'Embarked']

It's obvious why there are missing values in the Survived column...

I can't see how the information about the Cabin can help, so I won't fill up the empty cells.

For the remaining columns with missing values, let's do some manipulations to fill in the empty cells.

* #### **Age**
To impute the age more precisely, let's define a new temporary column, 'Title' (the title of each passenger).

In [11]:
# Extract each passenger's title: the run of letters that directly precedes a '.' in the
# name (e.g. "Braund, Mr. Owen Harris" -> "Mr").
# str.extract is already vectorised over the whole column, so it is called once rather
# than once per passenger. The pattern is a raw string so that '\.' is an escaped dot
# for the regex engine and not an invalid Python string escape.
all_data['Title'] = all_data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

all_data.Title.unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'Countess',
       'Jonkheer', 'Dona'], dtype=object)

In [12]:
# Replacing rare titles with more common ones
mapping = {'Mlle': 'Miss', 'Major': 'Mr', 'Col': 'Mr', 'Sir': 'Mr', 'Don': 'Mr', 'Mme': 'Miss',
          'Jonkheer': 'Mr', 'Lady': 'Mrs', 'Capt': 'Mr', 'Countess': 'Mrs', 'Ms': 'Miss', 'Dona': 'Mrs'}

all_data['Title'] = all_data['Title'].replace(mapping)
titles = ['Dr', 'Master', 'Miss', 'Mr', 'Mrs', 'Rev']

# Median age per title, computed once. Filling gaps with a group's own median does not
# move that median, so hoisting the computation out of the loop gives the same values.
median_age_by_title = all_data.groupby('Title')['Age'].median()

for title in titles:
    # Look the median up by label; the previous positional lookup only worked because
    # `titles` happened to be in the same (sorted) order as the groupby result.
    age_to_impute = median_age_by_title[title]
    all_data.loc[(all_data['Age'].isnull()) & (all_data['Title'] == title), 'Age'] = age_to_impute

# Substituting Age values in train_data and test_data:
# all_data holds the train rows first, followed by the test rows.
n_train = len(train_data)
train_data['Age'] = all_data['Age'].iloc[:n_train]
test_data['Age'] = all_data['Age'].iloc[n_train:]

# Dropping Title feature (it was only needed for the imputation above)
all_data = all_data.drop(columns='Title')

We will create a new column, AgeGroup, that will contain the age groups of the passengers according to the following partition:
- Infant: 0-3
- Kid: 4-12
- Teen: 13-19
- Adult: 20-110

After that, we will encode this column to a new one - AgeGroup_code.

In [13]:
# Left-closed age intervals: [0, 4) Infant, [4, 13) Kid, [13, 20) Teen, [20, 110) Adult.
bins = [0, 4, 13, 20, 110]
labels = ['Infant', 'Kid', 'Teen', 'Adult']

# Bin the ages of ALL passengers. Binning train_data['Age'] here was a bug: the result
# was aligned to all_data by row label, so each test row received the age group of the
# training passenger that shares its label instead of its own.
all_data['AgeGroup'] = pd.cut(all_data['Age'], bins=bins, labels=labels, right=False)

# Encode the group names as integers. The codes shown in the outputs below are
# 0 = Adult, 1 = Infant, 2 = Kid, 3 = Teen, i.e. not ordered by age.
label = LabelEncoder()
all_data['AgeGroup_code'] = label.fit_transform(all_data['AgeGroup'])

# all_data holds the train rows first, followed by the test rows.
n_train = len(train_data)
train_data['AgeGroup_code'] = all_data['AgeGroup_code'].iloc[:n_train]
test_data['AgeGroup_code'] = all_data['AgeGroup_code'].iloc[n_train:]

In [14]:
all_data.AgeGroup.unique()

['Adult', 'Infant', 'Teen', 'Kid']
Categories (4, object): ['Infant' < 'Kid' < 'Teen' < 'Adult']

In [15]:
all_data.AgeGroup_code.unique()

array([0, 1, 3, 2])

- 0 : Adult
- 1 : Infant
- 2 : Kid
- 3 : Teen

In [16]:
all_data[['AgeGroup', 'AgeGroup_code']].head(10)

Unnamed: 0,AgeGroup,AgeGroup_code
0,Adult,0
1,Adult,0
2,Adult,0
3,Adult,0
4,Adult,0
5,Adult,0
6,Adult,0
7,Infant,1
8,Adult,0
9,Teen,3


* #### **Embarked**
Let's fill in the missing values in the Embarked column with the most common value.

In [17]:
# Report the embarkation port that occurs most often among all passengers.
most_common_port = all_data['Embarked'].value_counts().idxmax()
print('the most common value in the Embarked column: ', most_common_port)

the most common value in the Embarked column:  S


In [18]:
# Fill missing ports with the most frequent one, computed from the data instead of being
# hard-coded ('S' for this dataset, per the output above). The column is assigned back
# explicitly: an in-place fillna on a column pulled out of the frame is deprecated and
# does not update the frame under pandas Copy-on-Write.
all_data['Embarked'] = all_data['Embarked'].fillna(all_data['Embarked'].value_counts().idxmax())

* #### **Fare**
Filling up the missing values with the median of the Fare column

In [19]:
# Fill missing fares with the median fare over all passengers. The column is assigned
# back explicitly: an in-place fillna on a column pulled out of the frame is deprecated
# and does not update the frame under pandas Copy-on-Write.
all_data['Fare'] = all_data['Fare'].fillna(all_data['Fare'].median())

Encoding the Fare column into a new column - FareBins_code :

In [20]:
# Split the fares into five quantile-based intervals and show the distinct intervals.
fare_quintiles = pd.qcut(all_data['Fare'], q=5)
all_data['FareBins'] = fare_quintiles
print(all_data['FareBins'].unique())

# Encode each interval as an integer code.
label = LabelEncoder()
fare_codes = label.fit_transform(all_data['FareBins'])
all_data['FareBins_code'] = fare_codes

# Copy the codes back: the first 891 rows are the train split, the rest the test split.
train_data['FareBins_code'] = all_data['FareBins_code'].iloc[:891]
test_data['FareBins_code'] = all_data['FareBins_code'].iloc[891:]

[(-0.001, 7.854], (41.579, 512.329], (7.854, 10.5], (10.5, 21.558], (21.558, 41.579]]
Categories (5, interval[float64, right]): [(-0.001, 7.854] < (7.854, 10.5] < (10.5, 21.558] < (21.558, 41.579] < (41.579, 512.329]]


In [21]:
all_data['FareBins_code'].unique()

array([0, 4, 1, 2, 3])

- 0 : (-0.001, 7.854]
- 1 : (7.854, 10.5]
- 2 : (10.5, 21.558]
- 3 : (21.558, 41.579]
- 4 : (41.579, 512.329]

In [22]:
all_data[['FareBins', 'FareBins_code']].head(10)

Unnamed: 0,FareBins,FareBins_code
0,"(-0.001, 7.854]",0
1,"(41.579, 512.329]",4
2,"(7.854, 10.5]",1
3,"(41.579, 512.329]",4
4,"(7.854, 10.5]",1
5,"(7.854, 10.5]",1
6,"(41.579, 512.329]",4
7,"(10.5, 21.558]",2
8,"(10.5, 21.558]",2
9,"(21.558, 41.579]",3


* #### **Sex**
We will encode the Sex column into a new column- Sex_cat, where 0 = female and 1 = male.

In [23]:
# Convert Sex to a categorical column and keep its integer category codes in Sex_cat.
sex_as_category = all_data["Sex"].astype('category')
all_data["Sex"] = sex_as_category
all_data["Sex_cat"] = sex_as_category.cat.codes

# Copy the codes back: the first 891 rows are the train split, the rest the test split.
train_data['Sex_cat'] = all_data['Sex_cat'].iloc[:891]
test_data['Sex_cat'] = all_data['Sex_cat'].iloc[891:]

* #### **Adding Family_Size**

In [24]:
# Family size = siblings/spouses aboard plus parents/children aboard (the passenger
# themselves is not counted).
all_data['Family_Size'] = all_data['SibSp'].add(all_data['Parch'])

# Copy the values back: the first 891 rows are the train split, the rest the test split.
train_data['Family_Size'] = all_data['Family_Size'].iloc[:891]
test_data['Family_Size'] = all_data['Family_Size'].iloc[891:]

* #### **Adding Family_Survival**
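We group families (passengers with the same last name and fare) and passengers who share a ticket, and record for each passenger whether the other members of their group are known to have survived.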

In [25]:
# The last name is everything before the first comma of the Name field.
all_data['Last_Name'] = [str.split(full_name, ",")[0] for full_name in all_data['Name']]

In [26]:
# Family_Survival summarises what is known about the OTHER members of a passenger's group:
#   1   -> at least one other member is known to have survived
#   0   -> no other member is known to have survived, and at least one is known to have died
#   0.5 -> nothing is known (passenger is alone, or the others have no 'Survived' value)
DEFAULT_SURVIVAL_VALUE = 0.5
all_data['Family_Survival'] = DEFAULT_SURVIVAL_VALUE

# A "family" here is a set of passengers sharing both last name and fare.
for grp, grp_df in all_data[['Survived', 'Last_Name', 'Fare', 'PassengerId']].groupby(['Last_Name', 'Fare']):

    if len(grp_df) != 1: # A Family group is found.

        for ind, row in grp_df.iterrows():
            passID = row['PassengerId']

            # Exclude the current passenger by PassengerId, not by row label: the labels
            # of all_data are not unique (train and test rows share labels), so
            # grp_df.drop(ind) could also remove a different member of the group.
            others_survived = grp_df.loc[grp_df['PassengerId'] != passID, 'Survived']
            # max()/min() skip NaN, so they are NaN (and no branch fires) only when
            # none of the other members has a known outcome.
            surv_max = others_survived.max()
            surv_min = others_survived.min()

            if surv_max == 1.0:
                all_data.loc[all_data.PassengerId == passID, 'Family_Survival'] = 1
            elif surv_min == 0.0:
                all_data.loc[all_data.PassengerId == passID, 'Family_Survival'] = 0

print("Number of passengers with family survival information:", 
      all_data.loc[all_data['Family_Survival'] != DEFAULT_SURVIVAL_VALUE].shape[0])

Number of passengers with family survival information: 420


In [27]:
# Second pass: passengers sharing a ticket are treated as a group as well. Only passengers
# whose Family_Survival is still 0 or the 0.5 default are revisited, so a 1 assigned by
# the last-name/fare pass is never downgraded.
for grp, grp_df in all_data.groupby('Ticket'):
    if len(grp_df) != 1:

        for ind, row in grp_df.iterrows():
            if (row['Family_Survival'] == 0) | (row['Family_Survival'] == 0.5):
                passID = row['PassengerId']

                # Exclude the current passenger by PassengerId, not by row label: the
                # labels of all_data are not unique (train and test rows share labels),
                # so grp_df.drop(ind) could also remove a different member of the group.
                others_survived = grp_df.loc[grp_df['PassengerId'] != passID, 'Survived']
                # max()/min() skip NaN; both are NaN when no other member has a known outcome.
                surv_max = others_survived.max()
                surv_min = others_survived.min()

                if surv_max == 1.0:
                    all_data.loc[all_data.PassengerId == passID, 'Family_Survival'] = 1
                elif surv_min == 0.0:
                    all_data.loc[all_data.PassengerId == passID, 'Family_Survival'] = 0
print("Number of passenger with family/group survival information: " ,
      (all_data[all_data['Family_Survival']!=0.5].shape[0]))

Number of passenger with family/group survival information:  546


In [28]:
# Copy Family_Survival back to the two splits: the first 891 rows of all_data are the
# train split, the remaining rows the test split.
family_survival = all_data['Family_Survival']
train_data['Family_Survival'] = family_survival.iloc[:891]
test_data['Family_Survival'] = family_survival.iloc[891:]

* #### **Cleaning Data**

In [29]:
all_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'AgeGroup',
       'AgeGroup_code', 'FareBins', 'FareBins_code', 'Sex_cat', 'Family_Size',
       'Last_Name', 'Family_Survival'],
      dtype='object')

In [30]:
# Raw columns that have been replaced by engineered features (or are not used at all);
# remove them from both splits in place.
raw_columns = ['PassengerId', 'Name', 'Sex', 'Age', 'SibSp',
               'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

for frame in (train_data, test_data):
    frame.drop(columns=raw_columns, inplace=True)

In [31]:
print("Train data column names: ", train_data.columns)
print("\nTest data column names: ", test_data.columns)

Train data column names:  Index(['Survived', 'Pclass', 'AgeGroup_code', 'FareBins_code', 'Sex_cat',
       'Family_Size', 'Family_Survival'],
      dtype='object')

Test data column names:  Index(['Pclass', 'AgeGroup_code', 'FareBins_code', 'Sex_cat', 'Family_Size',
       'Family_Survival'],
      dtype='object')


### From now on we will continue to work with the training set (train_data) and not with all the data

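Checking for duplicated rows (now that the identifying columns have been dropped, some rows are identical, so the result below is `True`):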

In [32]:
train_data.duplicated().any()

True

### Some statistics information:

In [33]:
print(train_data.AgeGroup_code.unique())


[0 1 3 2]


In [34]:
train_data.describe(include='all')

Unnamed: 0,Survived,Pclass,AgeGroup_code,FareBins_code,Sex_cat,Family_Size,Family_Survival
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.450056,1.98541,0.647587,0.904602,0.519641
std,0.486592,0.836071,0.992402,1.411355,0.47799,1.613459,0.323961
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,1.0,0.0,0.0,0.5
50%,0.0,3.0,0.0,2.0,1.0,0.0,0.5
75%,1.0,3.0,0.0,3.0,1.0,1.0,0.5
max,1.0,3.0,3.0,4.0,1.0,10.0,1.0


## ---ADD HERE SUPPORT PLOTS---

 #### According to the descriptive statistics table above-

**Survived:**
- Most of the passengers didn't survive (as we already know...): the mean = 0.3838 and the median = 0.

**Ticket Class (Pclass):**
- The mean = 2.3 and the median = 3, indicates that most tickets are for third class, and the minority are for first class.

**Age/AgeGroup_code**
- The average AgeGroup_code is 0.45 and the median is 0, with a std of ~0.99. Most of the passengers are adults.

**Fare/FareBins_code**
- The median FareBins_code is 2, i.e. the median passenger paid between 10.5$ and 21.558$ for the trip.

**Sex**
- Most of passengers are males.

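**Family Size:**
- The mean is ~0.9 and the median is 0, so most of the passengers travelled without family members; the largest value is 10.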

**Number of Siblings/Spouses (SibSp)**
- According to the mean and the median, most of the passengers came without their siblings or spouses.

**Number of Parents/Children (Parch)**
- According to the mean and the median, most of the passengers came without their parents or children.



**Cabin**
- The mode of the cabin number is unknown.

**Embarked**
- There are three ports of embarkation: C = Cherbourg, Q = Queenstown, S = Southampton. The most common one is Southampton.

In [35]:
# corr = train_data.corr()
# mask = np.triu(np.ones_like(corr))

# sns.heatmap(corr, mask = mask, annot = True)

We can see a moderately negative correlation between the Sex variable ( = Sex_cat) and the Survived variable.

### Data Visualization
Helps us to understand better how the data are behaving

In [36]:
# plt.figure(figsize = (15,20))

# plt.subplot(3, 3, 1)
# sns.countplot(x = 'Survived', data = train_data)
# plt.title("How Many Survived")

# plt.subplot(3, 3, 2)
# sns.countplot(x = "Sex", data = train_data)
# plt.title("Sex Counting")

# plt.subplot(3, 3, 3)
# sns.histplot(x = "Age", data = train_data, kde = True)
# plt.title("Age Distribution")

# plt.subplot(3, 3, 4)
# sns.countplot(x = 'Pclass', data = train_data)
# plt.title('Ticket Class')

# plt.subplot(3, 3, 5)
# sns.countplot(x = 'SibSp', data = train_data)
# plt.title("# of siblings / spouses\naboard the Titanic")

# plt.subplot(3, 3, 6)
# sns.countplot(x = 'Parch', data = train_data)
# plt.title('# of parents / children\naboard the Titanic')

# plt.subplot(3, 3, 7)
# sns.histplot(x = 'Fare', data = train_data, kde = True)
# plt.title('Passenger fare')

#### We want to evaluate which features are affecting the chance to survive.
#### Accordingly, let's visually examine the connection.

**Survived ~ Sex**

In [37]:
# women = train_data.loc[train_data.Sex == "female"]['Survived']
# rate_women = sum(women) / len(women)

# men = train_data.loc[train_data.Sex == "male"]['Survived']
# rate_men = sum(men) / len(men)

# print("% of men who survived:", rate_men)
# print("% of women who survived:", rate_women)

# sns.countplot(x = 'Sex', data = train_data, hue = 'Survived')

**Survived ~ Ticket Class (Pclass)**

In [38]:
# class1 = train_data.loc[train_data.Pclass == 1]['Survived']
# rate_class1 = sum(class1) / len(class1)

# class2 = train_data.loc[train_data.Pclass == 2]['Survived']
# rate_class2 = sum(class2) / len(class2)

# class3 = train_data.loc[train_data.Pclass == 3]['Survived']
# rate_class3 = sum(class3) / len(class3)

# print("% of class 1 who survived:", rate_class1)
# print("% of class 2 who survived:", rate_class2)
# print("% of class 3 who survived:", rate_class3)

# sns.countplot(x = 'Pclass', data = train_data, hue = 'Survived')

**Survived ~ # of Parents/Children (Parch)**

In [39]:
# for i in train_data.Parch.unique():
#     temp_survived = train_data.loc[train_data.Parch == i]['Survived']
#     rate = sum(temp_survived) / len(temp_survived)
#     print("% of passengers who have ", i, " of parents/children aboard the Titanic that survived:", rate)
    
# sns.countplot(x = 'Parch', data = train_data, hue = 'Survived')
# plt.legend(title = 'Survived', loc='upper right')


**Survived ~ # of Siblings/Spouses (SibSp)**

In [40]:
# for i in train_data.SibSp.unique():
#     temp_survived = train_data.loc[train_data.SibSp == i]['Survived']
#     rate = sum(temp_survived) / len(temp_survived)
#     print("% of passengers who have ", i, " of siblings/spouses aboard the Titanic that survived:", rate)
    

# sns.countplot(x = 'SibSp', data = train_data, hue = 'Survived')
# plt.legend(title = 'Survived', loc='upper right')

**Survived ~ Age**

In [41]:
# for i in train_data.AgeGroup.unique():
#     temp_survived = train_data.loc[train_data.AgeGroup == i]['Survived']
#     rate = sum(temp_survived) / len(temp_survived)
#     print("% of", i + 's', "that survived:", rate)

# sns.countplot(x = 'AgeGroup', data = train_data, hue = 'Survived')

###### The features PassengerId, Name, Ticket number, Cabin, Embarked and Fare do not affect the chance of survival in any way.

# **Model Building**

### Data Preparation
Before using the classification models, let's prepare our data:

In [42]:
# X = train_data.loc[:, ("Sex_cat", "Pclass", "SibSp", "Parch", "Age")]
# y = train_data["Survived"].values

The feature matrix, X:

In [43]:
# X

The target column, y:

In [44]:
# y[:5]

#### Splitting the dataset into the Training set and Test set

In [45]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [46]:
# X_train

#### Feature Scaling

In [47]:
# # feature scaling
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_test = sc.transform(X_test)

## **Feature and Model Selection**

In [48]:
# from sklearn.model_selection import cross_val_score, GridSearchCV, RepeatedStratifiedKFold
# #cross_val_predict, KFold, 
# from sklearn.metrics import confusion_matrix, accuracy_score
# from sklearn.pipeline import Pipeline
# from sklearn.feature_selection import RFECV

In [49]:
# some experience....
# here we use Decision Tree as estimator in the RFECV
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# rfe = RFECV(estimator=LogisticRegression(), cv = 10)
# rfe = rfe.fit(X_train,y_train)
# print("Feature ranking: ", rfe.ranking_)

# # extracting the selected features.  get_support() function helps us to get those features names. 
# mask = rfe.get_support()
# features = X.columns
# best_features = features[mask]
 
# print("All features: ", X.shape[1])
# print(features)

# print("Selected best: ", best_features.shape[0])
# print(features[mask])

Let's run RFECV with different estimators to select features. Additionally, let's run some classification models, to get intuition about which estimator and classification algorithm will give us the best results.

After we chose the estimator and the model, we can check which features are selected and use GridSearchCV to find the best hyperparameters for the chosen model.

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [51]:
# # get a list of models to evaluate
# def get_models(modelc):
#     models = dict()
#     # lr
#     rfe = RFECV(estimator=LogisticRegression())
#     model = modelc
#     models['lr'] = Pipeline(steps=[('s',rfe),('m',model)])
#     # perceptron
#     rfe = RFECV(estimator=Perceptron())
#     model = modelc
#     models['per'] = Pipeline(steps=[('s',rfe),('m',model)])
#     # cart
#     rfe = RFECV(estimator=DecisionTreeClassifier())
#     model = modelc
#     models['cart'] = Pipeline(steps=[('s',rfe),('m',model)])
#     # rf
#     rfe = RFECV(estimator=RandomForestClassifier())
#     model = modelc
#     models['rf'] = Pipeline(steps=[('s',rfe),('m',model)])
#     # gbm
#     rfe = RFECV(estimator=GradientBoostingClassifier())
#     model = modelc
#     models['gbm'] = Pipeline(steps=[('s',rfe),('m',model)])
#     # svm
#     rfe = RFECV(estimator=SVC(kernel='linear'))
#     model = modelc
#     models['svm'] = Pipeline(steps=[('s',rfe),('m',model)])
#     return models

# # evaluate a give model using cross-validation
# def evaluate_model(model, X, y):
#     cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
#     scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
#     return scores

# models_dict = {'LR': LogisticRegression(random_state=0),
#                'SVM': SVC(kernel='linear', random_state=0),
#                'RF': RandomForestClassifier(),
#                'KNN': KNeighborsClassifier(),
#                'NB': GaussianNB()}

# # get the models to evaluate
# for i in models_dict.keys():
#     print('\n'+i+'\n')
#     models = get_models(models_dict[i])
#     # evaluate the models and store results
#     results, names = list(), list()
#     for name, model in models.items():
#         scores = evaluate_model(model, X_train, y_train)
#         results.append(scores)
#         names.append(name)
#         print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))

O.K, let's continue with the two models with the highest accuracy score-
- Random Forest + Random Forest as estimator in the RFECV (accuracy score = 0.8).
- K-NN + Random Forest as estimator in the RFECV (accuracy score = 0.806).


## **Random Forest**
The features that were selected:

In [52]:
# rf_rfe = RFECV(estimator=RandomForestClassifier())
# rf_pipe = Pipeline(steps=[('s',rf_rfe),('m',RandomForestClassifier())])

# X_new = rf_rfe.fit_transform(X_train, y_train)
# print("Feature ranking: ", rf_rfe.ranking_)

# mask = rf_rfe.get_support()
# features = X.columns
# best_features = features[mask]

# # print("All features: ", x.shape[1])
# # print(features)

# print("Selected best: ", best_features.shape[0])
# print(features[mask])

In [53]:
# X_train[:,mask]

Evaluate the model

In [54]:
# # evaluate model
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# n_scores = cross_val_score(rf_pipe, X_new, y_train, scoring='accuracy', cv=cv, error_score='raise')
# # report performance
# print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

Find the best hyperparameters using GridSearchCV

In [55]:
# # Criterion
# criterion = ['gini', 'entropy']
# # Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 5)]
# # Number of features to consider at every split
# max_features = ['auto', 'sqrt']
# # Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# max_depth.append(None)
# # Minimum number of samples required to split a node
# min_samples_split = [2, 5, 10]
# # Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2, 4]
# # Method of selecting samples for training each tree
# bootstrap = [True, False]

# # Create the random grid
# random_grid = {'m__criterion': criterion,
#                'm__n_estimators': n_estimators,
#                'm__max_features': max_features,
#                'm__max_depth': max_depth,
#                'm__min_samples_split': min_samples_split,
#                'm__min_samples_leaf': min_samples_leaf,
#                'm__bootstrap': bootstrap}

In [56]:
'''#run very slow
rf_grid_search = GridSearchCV(rf_pipe,
                           param_grid = random_grid,
                           scoring = 'accuracy',
                           cv = cv)
rf_grid_search.fit(X_new, y_train)
best_accuracy = rf_grid_search.best_score_
best_parameters = rf_grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)'''

'#run very slow\nrf_grid_search = GridSearchCV(rf_pipe,\n                           param_grid = random_grid,\n                           scoring = \'accuracy\',\n                           cv = cv)\nrf_grid_search.fit(X_new, y_train)\nbest_accuracy = rf_grid_search.best_score_\nbest_parameters = rf_grid_search.best_params_\nprint("Best Accuracy: {:.2f} %".format(best_accuracy*100))\nprint("Best Parameters:", best_parameters)'

In [57]:
# rf_clf = RandomForestClassifier(n_estimators=2000, criterion='gini')
# rf_clf.fit(X_new, y_train)
# y_pred = rf_clf.predict(X_test[:,mask])
# cm = confusion_matrix(y_test, y_pred)
# print(cm)
# accuracy_score(y_test, y_pred)

In [58]:
# X_train[:,[0,1,2,4]]

In [59]:
# X_new

Predict the test set

In [60]:
'''
# The next step:
rf_pred = rf_pipe.predict(X_test)
cm = confusion_matrix(y_test, rf_pred)
print(cm)
accuracy_score(y_test, rf_pred)
'''

'\n# The next step:\nrf_pred = rf_pipe.predict(X_test)\ncm = confusion_matrix(y_test, rf_pred)\nprint(cm)\naccuracy_score(y_test, rf_pred)\n'

## **K-NN**

In [61]:
# knn_rfe = RFECV(estimator=RandomForestClassifier())
# knn_pipe = Pipeline(steps=[('s',knn_rfe),('m',KNeighborsClassifier())])

# X_new_knn = knn_rfe.fit_transform(X_train, y_train)
# print("Feature ranking: ", rf_rfe.ranking_)

# mask = knn_rfe.get_support()
# features = X.columns
# best_features = features[mask]

# # print("All features: ", x.shape[1])
# # print(features)

# print("Selected best: ", best_features.shape[0])
# print(features[mask])

In [62]:
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# n_scores = cross_val_score(knn_pipe, X_new_knn, y_train, scoring='accuracy', cv=cv, error_score='raise')
# # report performance
# print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

In [63]:
# # Number of neighbors
# n_neighbors = [21]
# # Weight function used in prediction
# weights = ['uniform']
# # Algorithm used to compute the nearest neighbors
# algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
# # Euclidean distance
# metric = ['minkowski']
# p = [2]

# # Create the knn hyperparameters grid
# knn_grid = {'m__n_neighbors': n_neighbors,
#                'm__weights': weights,
#                'm__algorithm': algorithm,
#                'm__metric': metric,
#                'm__p': p}

In [64]:
# Disabled cell: the code below is kept as a string literal, so none of it runs; the cell
# only echoes the string.
# NOTE(review): if this is re-enabled, `rf_grid_search.fit(X_new_knn, y_train)` looks like a
# copy-paste slip -- `knn_grid_search` is the object created here and queried afterwards,
# so presumably it should be `knn_grid_search.fit(...)`. Confirm before running.
''' # run very slow
knn_grid_search = GridSearchCV(knn_pipe,
                           param_grid = knn_grid,
                           scoring = 'accuracy',
                           cv = cv)
rf_grid_search.fit(X_new_knn, y_train)
best_accuracy = knn_grid_search.best_score_
best_parameters = knn_grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)
'''

' # run very slow\nknn_grid_search = GridSearchCV(knn_pipe,\n                           param_grid = knn_grid,\n                           scoring = \'accuracy\',\n                           cv = cv)\nrf_grid_search.fit(X_new_knn, y_train)\nbest_accuracy = knn_grid_search.best_score_\nbest_parameters = knn_grid_search.best_params_\nprint("Best Accuracy: {:.2f} %".format(best_accuracy*100))\nprint("Best Parameters:", best_parameters)\n'

In [65]:
'''knn_pred = knn_pipe.predict(X_test[mask])
cm = confusion_matrix(y_test, knn_pred)
print(cm)
accuracy_score(y_test, knn_pred)'''

'knn_pred = knn_pipe.predict(X_test[mask])\ncm = confusion_matrix(y_test, knn_pred)\nprint(cm)\naccuracy_score(y_test, knn_pred)'

## **Model Selection**

To find the best model with the best parameters, let's use cross validation and grid search.

In [66]:
# models_accuracy_scores = {}
# best_params = {}

## **Logistic Regression**

Training the Logistic Regression model on the Training set

In [67]:
# logistic_classifier = LogisticRegression(random_state=0)
# logistic_classifier.fit(X_train, y_train)

Making the Confusion Matrix

In [68]:
# y_pred = logistic_classifier.predict(X_test)
# cm = confusion_matrix(y_test, y_pred)
# print(cm)
# accuracy_score(y_test, y_pred)

Feature Selection

In [69]:
# from sklearn.tree import DecisionTreeClassifier
# # create pipeline
# rfe = RFECV(estimator=LogisticRegression())
# model = LogisticRegression()
# pipeline = Pipeline(steps=[('s',rfe),('m',model)])
# # evaluate model
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=1)
# n_scores = cross_val_score(pipeline, X_train, y_train, scoring='accuracy', cv=cv, error_score='raise')
# # report performance
# print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

In [70]:
# rfe.fit(X_train, y_train)
# mask = rfe.get_support()
# features = X.columns
# best_features = features[mask]
 
# print("All features: ", X.shape[1])
# print(features)

# print("Selected best: ", best_features.shape[0])
# print(features[mask])

In [71]:
# model.fit(X_new, y_train)
# pip_pred = pipeline.predict(X_test)
# cm = confusion_matrix(y_test, pip_pred)
# print(cm)
# accuracy_score(y_test, pip_pred)

Applying k-Fold Cross Validation

In [72]:
# accuracies = cross_val_score(estimator = logistic_classifier, X = X_train, y = y_train, cv = 10)
# print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
# print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Applying Grid Search to find the best model and the best parameters

In [73]:
# parameters = [{'C': [0.01, 0.1, 0.15, 0.25, 0.5, 0.75, 1], 'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', 'sag', 'saga']},
#               {'penalty': ['none'], 'solver': ['newton-cg', 'lbfgs', 'sag', 'saga']},
#               {'C': [0.01, 0.1, 0.15,  0.25, 0.5, 0.75, 1], 'penalty': ['l2', 'l1'], 'solver': ['liblinear', 'saga']},
#               {'C': [0.01, 0.1, 0.15, 0.25, 0.5, 0.75, 1], 'penalty': ['elasticnet'], 'solver': ['saga'], 'l1_ratio': [0.25, 0.5, 0.75]}]

# grid_search = GridSearchCV(estimator = logistic_classifier,
#                            param_grid = parameters,
#                            scoring = 'accuracy',
#                            cv = 10)
# grid_search.fit(X_train, y_train)
# best_accuracy = grid_search.best_score_
# best_parameters = grid_search.best_params_
# print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
# print("Best Parameters:", best_parameters)

In [74]:
# grid_pred = grid_search.predict(X_test)
# accuracy_score(y_test, grid_pred)

In [75]:
# grid_search

In [76]:
# models_accuracy_scores["Logistic Regression"] = [logistic_score, logistic_std]

## **K-NN**

In [77]:
'''# search for an optimal value of k for K-NN
from sklearn.neighbors import KNeighborsClassifier
k_range = range(1,31)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k, metric='minkowski', p=2)
    scores = cross_val_score(knn, X_new_knn, y_train, cv = cv, scoring = 'accuracy')
    k_scores.append(scores.mean())
print(k_scores)'''

"# search for an optimal value of k for K-NN\nfrom sklearn.neighbors import KNeighborsClassifier\nk_range = range(1,31)\nk_scores = []\nfor k in k_range:\n    knn = KNeighborsClassifier(n_neighbors=k, metric='minkowski', p=2)\n    scores = cross_val_score(knn, X_new_knn, y_train, cv = cv, scoring = 'accuracy')\n    k_scores.append(scores.mean())\nprint(k_scores)"

In [78]:
'''# plot the value of k for K-NN (x axis) versus the cross-validated accuracy (y axis)
plt.plot(k_range, k_scores)
plt.xlabel('Value of k')
plt.ylabel('Cross-Validated Accuracy')'''

"# plot the value of k for K-NN (x axis) versus the cross-validated accuracy (y axis)\nplt.plot(k_range, k_scores)\nplt.xlabel('Value of k')\nplt.ylabel('Cross-Validated Accuracy')"

In [79]:
'''k_scores.index(max(k_scores))'''

'k_scores.index(max(k_scores))'

The optimal value of k is 27 with 0.786729 accuracy score

In [80]:
# Disabled cell: the code below is kept as a string literal, so none of it runs; the cell
# only echoes the string.
# NOTE(review): the text above reports an optimal k of 27, while this uses n_neighbors=26.
# Presumably 26 is the 0-based position returned by k_scores.index(...) for a k range that
# starts at 1 -- confirm which value is intended before re-enabling.
'''knn_classifier = KNeighborsClassifier(n_neighbors=26, metric='minkowski', p=2)
knn_accuracies = cross_val_score(knn_classifier, X, y, cv = 10, scoring = 'accuracy')
knn_score = knn_accuracies.mean()
knn_std = knn_accuracies.std()
print(knn_score)
models_accuracy_scores["K-NN"] = [knn_score, knn_std]'''

'knn_classifier = KNeighborsClassifier(n_neighbors=26, metric=\'minkowski\', p=2)\nknn_accuracies = cross_val_score(knn_classifier, X, y, cv = 10, scoring = \'accuracy\')\nknn_score = knn_accuracies.mean()\nknn_std = knn_accuracies.std()\nprint(knn_score)\nmodels_accuracy_scores["K-NN"] = [knn_score, knn_std]'

## **SVM**

In [81]:
# from sklearn.svm import SVC
# svm_classifier = SVC(kernel='linear', random_state=0)
# svm_accuracies = cross_val_score(svm_classifier, X, y, cv = 10, scoring = 'accuracy')
# svm_score = svm_accuracies.mean()
# svm_std = svm_accuracies.std()
# models_accuracy_scores["SVM"] = [svm_score, svm_std]

In [82]:
# models_accuracy_scores

## **Naive Bayes**

In [83]:
# from sklearn.naive_bayes import GaussianNB
# nb_classifier = GaussianNB()
# nb_accuracies = cross_val_score(nb_classifier, X, y, cv = 10, scoring = 'accuracy')
# nb_score = nb_accuracies.mean()
# nb_std = nb_accuracies.std()
# models_accuracy_scores["Naive Bayes"] = [nb_score, nb_std]

In [84]:
# nb_score

## **Random Forest**

In [85]:
# from sklearn.ensemble import RandomForestClassifier
# rf_classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
# rf_accuracies = cross_val_score(rf_classifier, X, y, cv = 10, scoring = 'accuracy')
# rf_score = rf_accuracies.mean()
# rf_std = rf_accuracies.std()
# models_accuracy_scores["Random forest"] = [rf_score, rf_std]
# rf_score

In [86]:
# models_accuracy_scores

In [87]:
# from sklearn.ensemble import RandomForestClassifier

# y = train_data["Survived"]

# features = ["Pclass", "Sex", "SibSp", "Parch"]
# X = pd.get_dummies(train_data[features])
# X_test = pd.get_dummies(test_data[features])

# model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
# model.fit(X, y)
# predictions = model.predict(X_test)

# output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
# output.to_csv('submission.csv', index=False)
# print("Your submission was successfully saved!")

In [88]:
# output.head()