In [None]:
import pandas as pd
import numpy as np
from collections import Counter

import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split

# Modelling Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict

In [None]:
# Read the Titanic training set and preview its first ten rows.
titanic_csv_path = "../datasets/titanic_train.csv"
df = pd.read_csv(titanic_csv_path)
df.head(10)

# Part 1
## <i>Some dataset analysis and exploration</i>

### 1.1
<i>How many passengers were male and how many were female?</i>

In [None]:
# Number of passengers for each value of the "Sex" column.
sex_column = df['Sex']
sex_column.value_counts()

### Answer
Male - <u>577</u><br>
Female - <u>314</u> 

### 1.2
<i>How many passengers were able to survive?</i>

In [None]:
# Tally the passengers for each value of the "Survived" column.
survived_column = df['Survived']
survived_series = survived_column.value_counts()
survived_series

From the description of the dataset we can see that:<br>
0 = Didn't survive<br>
1 = Survived

In [None]:
# Share of survivors: count labelled 1 divided by the counts labelled 0 and 1 together.
n_died, n_survived = survived_series[0], survived_series[1]
survival_rate = n_survived / (n_died + n_survived)
survival_rate

In [None]:
# Express the survival rate as a percentage rounded to two decimals.
survival_percent = round(100 * survival_rate, 2)
survival_percent

### Answer
<u>38.38%</u> of the passengers survived

### 1.3
<i>What was the ratio of the passengers from the first class to all passengers?</i>

In [None]:
# Tally the passengers for each value of the "Pclass" column.
pclass_column = df['Pclass']
pclass_series = pclass_column.value_counts()
pclass_series

In [None]:
# First-class share: count for class 1 over the summed counts of classes 1, 2 and 3.
n_first_class = pclass_series[1]
n_all_classes = n_first_class + pclass_series[2] + pclass_series[3]
first_class_rate = n_first_class / n_all_classes
first_class_rate

In [None]:
# Express the first-class share as a percentage rounded to two decimals.
first_class_percent = round(100 * first_class_rate, 2)
first_class_percent

### Answer
<u>24.24%</u> of the passengers travelled in first class

### 1.4
<i>What was the average and median age of the passengers?</i>

In [None]:
# Mean of the "Age" column, rounded to two decimals.
age_series = df["Age"]
mean_age = age_series.mean()
round(mean_age, 2)

In [None]:
# Median of the "Age" column.
median_age = age_series.median()
median_age

### Answer
<u>29,7</u> - average age<br>
<u>28</u> - median age

### 1.5
<i>Is there a correlation between the number of siblings/spouses and the number of parents/children?<br> 
Compute the Pearson correlation between the "SibSp" and "Parch" variables.</i>

<b>Link to the doc on corr() method in pandas:</b><br>
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.corr.html<br>

<b>Link to a theory description:</b><br>
https://en.wikipedia.org/wiki/Pearson_correlation_coefficient

<b>Some description about which correlation value is significant:</b><br>
http://support.minitab.com/en-us/minitab-express/1/help-and-how-to/modeling-statistics/regression/how-to/correlation/interpret-the-results/

Also you can read about Pearson's correlation in book <b>"ThinkStats2"</b> (free pdf book) in chapter <b>"7.5 Pearson's correlation"</b>.<br>
http://greenteapress.com/wp/think-stats-2e/

In [None]:
# Keep only the two family-size columns whose correlation we want to measure.
family_columns = ['SibSp', 'Parch']
correlation_df = df[family_columns]
correlation_df.head(10)

In [None]:
# Pairwise Pearson correlation matrix of the selected columns.
pearson_matrix = correlation_df.corr(method='pearson')
pearson_matrix

### Answer
Correlation value - <u>0.414838</u><br><br>
It means that <u>when the "SibSp" variable is high, the "Parch" variable also tends to be high, and vice versa</u>.<br><br>
The value 0.414838 tells us that <u>there is a correlation between these variables, but it's not very strong</u><br><br> 
<i>(it is common to say that there is a strong correlation between variables if the Pearson correlation is between 0.5 and 1)</i>

### 1.6.
<i>What was the most popular female name on the ship?<br>
Extract the personal names (variable "First Name") from the passengers' full names (variable "Name").</i>

In [None]:
# Select the rows whose "Sex" value is "female".
is_female = df['Sex'] == "female"
women_df = df[is_female]
women_df.head()

Let's see which title prefixes (Mrs., Miss., ...) the women in the dataset have.<br>
This will help us formulate a rule for extracting personal names.

In [None]:
def get_prefix_set(name_str):
    """Return the first space-separated word of ``name_str`` that contains a dot.

    Despite the name, a single word (e.g. ``"Mrs."``) is returned, not a set.
    ``None`` is returned when no word contains a dot.
    """
    dotted_words = (word for word in name_str.split(" ") if "." in word)
    return next(dotted_words, None)

# Distinct dotted words (titles) found in the women's names.
women_prefixes = women_df["Name"].apply(get_prefix_set)
prefix_array = women_prefixes.unique()
prefix_array

Let's now print some examples of every category of prefixes to formulate the rules

In [None]:
# For every prefix, print up to three women's names that contain it.
names_series = women_df["Name"]
for prefix in prefix_array:
    print("Prefix = {}".format(prefix))
    # regex=False makes this a literal substring match, so the dot is not a wildcard
    has_prefix = names_series.str.contains(prefix, regex=False)
    examples = names_series[has_prefix].head(3)
    print(examples)
    print()

#### Let's now formulate our rule based on examples:
- <b>Mrs.</b> - take the first word inside the parentheses<br>
- <b>Miss.</b> or <b>Mme.</b> - take the word that follows Miss or Mme<br>

In [None]:
def get_first_name(name_str, prefix_array):
    """Extract a woman's own first name from a full passenger name.

    The name is split on single spaces and the prefixes are tried in the
    order given by ``prefix_array``; the first prefix that occurs as a whole
    word and belongs to one of the two known groups decides the result:

    * "Mme.", "Miss.", "Ms.", "Mlle.", "Dr." - the word right after the title.
    * "Mrs.", "Lady.", "Countess." - the first word inside the first pair of
      parentheses if the name has one, otherwise the word right after the
      title.

    Parameters
    ----------
    name_str : str
        Full name, e.g. ``"Cumings, Mrs. John Bradley (Florence Briggs Thayer)"``.
    prefix_array : iterable
        Candidate titles (each including its trailing dot). Entries that are
        not words of ``name_str`` (including ``None``) are skipped.

    Returns
    -------
    str or None
        The extracted first name, or ``None`` when no known title matches or
        when the title is the last word of the name.
    """
    # Titles that are directly followed by the woman's own first name.
    own_name_prefixes = ("Mme.", "Miss.", "Ms.", "Mlle.", "Dr.")
    # Titles where the own name, if present, is given in parentheses.
    parenthesized_prefixes = ("Mrs.", "Lady.", "Countess.")

    name_list = name_str.split(" ")

    def word_after(prefix):
        # Return None instead of raising IndexError when the title ends the name.
        position = name_list.index(prefix) + 1
        return name_list[position] if position < len(name_list) else None

    for prefix in prefix_array:
        if prefix not in name_list:
            continue
        if prefix in own_name_prefixes:
            return word_after(prefix)
        if prefix in parenthesized_prefixes:
            opening = name_str.find("(")
            if opening == -1:
                return word_after(prefix)
            inside = name_str[opening + 1:]
            # Cut at the closing parenthesis first, so a one-word name such as
            # "(Karolina) Jr" does not keep the trailing ")".
            closing = inside.find(")")
            if closing != -1:
                inside = inside[:closing]
            return inside.split(" ")[0]

    # No known title matched.
    return None

# Extract one first name per woman and collect the results in a plain list.
first_names = women_df["Name"].apply(lambda full_name: get_first_name(full_name, prefix_array))
names_list = first_names.tolist()
names_list[:10] # show 10 elements

In [None]:
# Ten most frequent extracted first names with their counts.
name_counts = Counter(names_list)
name_counts.most_common(10)

In [None]:
# How many of the most frequent first names to plot.
N_NAMES = 10

top = Counter(names_list).most_common(N_NAMES)
top_names = [name for name, _ in top]
top_values = [count for _, count in top]

# most_common() returns fewer than N_NAMES pairs when there are fewer distinct
# names, so the x positions are derived from the actual result length.
positions = list(range(len(top)))

fig, ax = plt.subplots()
ax.scatter(positions, top_values)
ax.set_xticks(positions)
ax.set_xticklabels(top_names, rotation='vertical')
# Write each count next to its point.
for position, count in zip(positions, top_values):
    ax.annotate(str(count), xy=(position, count))
ax.set_xlabel("First name")
ax.set_ylabel("Number of passengers")
ax.set_title("Most common female first names")
plt.show()

### Answer
<b>Anna</b> was the most popular name on the ship<br>
<i>(of course if we assume Mary and Marie as different names :) )</i>

# Part 2.
## Feature extraction and a basic DecisionTreeClassifier

### 2.1
<i>There are missing values in the data - for example, the age of certain passengers is missing.<br>
Drop all samples which contain nan values in any of these variables - 'Pclass', 'Fare', 'Age', 'Sex'
</i>

In [None]:
# Drop every row with a missing value in any of the four model features.
required_columns = ['Pclass', 'Fare', 'Age', 'Sex']
df = df.dropna(subset=required_columns)

### 2.2
<i>Leave in dataset only 4 variables:<br> 
- a passenger's class (Pclass)
- price of a ticket (Fare)
- a passenger's age (Age)
- a passenger's sex (Sex) 
</i>

In [None]:
# Independent copy of the four feature columns, so later edits do not touch df.
feature_columns = ['Pclass', 'Fare', 'Age', 'Sex']
df2 = df.loc[:, feature_columns].copy()
df2.head()

### 2.3
<i>Convert "Sex" variable from string to integer type</i>

In [None]:
# Encode "Sex" as an integer: 1 for 'male', 0 for anything else.
# The encoding is computed from the string column in df (matched to df2 by
# row label) rather than from df2['Sex'] itself, so re-running this cell gives
# the same result instead of comparing already-encoded integers to 'male'
# and zeroing the whole column.
df2['Sex'] = (df.loc[df2.index, 'Sex'] == 'male').astype(int)
df2.head()

### 2.4
<i>Select the target variable — "Survived"</i>

In [None]:
# The label to predict: the "Survived" column of df.
target_variable = df.loc[:, 'Survived']
target_variable.head()

### 2.5 
<i>As an example, let's train a basic decision tree with random_state=241 and all other arguments left at their defaults.</i>

<b>Link to "Understanding the decision tree structure":</b><br>
http://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html<br>
<b>"Decision Trees" in the scikit-learn docs</b><br>
http://scikit-learn.org/stable/modules/tree.html

In [None]:
# Fit a decision tree with a fixed seed on all four features.
tree_seed = 241
estimator = DecisionTreeClassifier(random_state=tree_seed)
estimator.fit(X=df2, y=target_variable)

In [None]:
# Accuracy measured on the same rows the tree was fitted on.
train_predictions = estimator.predict(df2)
accuracy_score(target_variable, train_predictions)

<b>Accuracy</b> of our model is <u>98%</u> which is very high.<br>
But it's important to understand that we tested our model only on the training data.<br> This score doesn't tell us how our model performs on new data.<br>
<br>
Moreover, there is a high risk that our model is <b>overfitted</b> (or <b>overlearned</b>) which means that it corresponds too closely or exactly to our initial dataset.<br>
<br>
In the next part we will:
- <b>split our training set</b> into separate test and training datasets  
- perform <b>feature selection</b>, which will help us to reduce amount of variables by which we will train our model
- perform <b>cross validation</b>,  which will help us to more accurately count accuracy of trained model

# Part 3
## Applying machine learning

### 3.1
<i>Find two most important features in dataset.</i>

This task is called <b>feature selection</b>.<br>
Documentation on <b>SelectKBest</b>:<br>
http://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection

In [None]:
# Show the feature table before selection.
initial_shape = df2.shape
initial_preview = df2.head(10)
print("Initial df2 shape: {}".format(initial_shape))
print("df2: \n{}".format(initial_preview))

# Fit a selector that keeps the two best-scoring features.
selector = SelectKBest(k=2).fit(df2, target_variable)

# Positional indices of the columns the selector kept.
ids_selected = selector.get_support(indices=True)

# New dataframe holding only the selected columns.
df2_reduced = df2.iloc[:, ids_selected]
reduced_shape = df2_reduced.shape

print("New df2 shape: " + str(reduced_shape))
df2_reduced.head()

In [None]:
# Names of the columns that survived feature selection.
selected_feature_names = df2_reduced.columns
selected_feature_names

### Answer
<b>'Pclass' and 'Sex'</b> are the two most important features according to <b>SelectKBest</b> with its default scoring function <b>f_classif</b> (ANOVA F-test)

### 3.2
<i>Using the two main features, train a model that predicts whether a person survives the sinking of the Titanic.</i>

First we will <b>divide</b> our reduced dataframe df2_reduced (dataframe with only 2 main features) <b>into test and train datasets</b> in proportion, for example 70-30%.

Also let's make an <b>explicit random_state = 241</b> for sake of definite reproduction of results. 

In [None]:
# Seed passed as random_state to the train/test split and the decision tree
# in the following cells, so their results can be reproduced.
my_rand_state = 241

In [None]:
# Hold out 30% of the rows for testing; the seed fixes which rows end up where.
split_parts = train_test_split(
    df2_reduced,
    target_variable,
    test_size=0.3,
    random_state=my_rand_state,
)
X_train, X_test, y_train, y_test = split_parts

Let's train <b>DecisionTreeClassifier</b> with random_state=241 once again, but this time using reduced dataset and testing its accuracy on testing dataset (which wasn't used in training).

In [None]:
# Fit a fresh decision tree, this time only on the training split.
estimator = DecisionTreeClassifier(random_state=my_rand_state)
estimator.fit(X=X_train, y=y_train)

In [None]:
# Accuracy on the held-out test split.
test_predictions = estimator.predict(X_test)
accuracy_score(y_test, test_predictions)

<u>80,4%</u> - not a perfect result, but it's more representative than 98%, which we got using training data as a test data. 

### 3.3
<i>Train a model using «KNearestNeighbors» and «LogisticRegression»</i>

<b>Scikit-learn KNearestNeighbors:</b><br>
http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html<br>
<b>KNearestNeighbors in details:</b><br>
http://scikit-learn.org/stable/modules/neighbors.html<br>
<b>Scikit-learn LogisticRegression:</b><br>
http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

<b>KNearestNeighbours</b> with amount of neighbors = 5

In [None]:
# Train a 5-nearest-neighbours classifier on the training split
neigh = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict labels for the held-out test split
y_pred = neigh.predict(X_test)

# Report per-class metrics, the confusion matrix and the overall accuracy
knn_report = classification_report(y_test, y_pred)
knn_confusion = confusion_matrix(y_test, y_pred)
print(knn_report)
print(knn_confusion)
accuracy_score(y_test, y_pred)

<b>LogisticRegression</b> with default parameters (L2 as a penalty; liblinear as a solver because it is recommended for small datasets)

In [None]:
# Train
# The solver is named explicitly: the text above describes liblinear, but
# LogisticRegression() stopped defaulting to it in scikit-learn 0.22 (the
# default is now lbfgs), so relying on the default would not match the text.
regr = LogisticRegression(solver='liblinear')
regr.fit(X_train, y_train)

# Predict labels for the held-out test split
y_pred = regr.predict(X_test)

# Report per-class metrics, the confusion matrix and the overall accuracy
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
accuracy_score(y_test, y_pred)

### 3.4
Check the accuracy of models with <b>cross validation</b>.<br>

In [None]:
# 5-fold cross-validation of the k-nearest-neighbours model on the reduced features.
scores_neigh = cross_val_score(estimator=neigh, X=df2_reduced, y=target_variable, cv=5)
knn_scores_message = "Cross-validated scores for each step: \n{}".format(scores_neigh)
print(knn_scores_message)

In [None]:
# 5-fold cross-validation of the logistic regression model on the reduced features.
scores_regr = cross_val_score(estimator=regr, X=df2_reduced, y=target_variable, cv=5)
regr_scores_message = "Cross-validated scores for each step: \n{}".format(scores_regr)
print(regr_scores_message)

### 3.5
<i>Compare accuracy of «KNearestNeighbors» and «LogisticRegression» in percents.</i>

<b>«KNearestNeighbors» accuracy</b>

In [None]:
# Mean fold accuracy with a spread of twice the standard deviation of the fold scores.
knn_mean, knn_spread = scores_neigh.mean(), scores_neigh.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (knn_mean, knn_spread))

<b>«LogisticRegression» accuracy</b>

In [None]:
# Mean fold accuracy with a spread of twice the standard deviation of the fold scores.
regr_mean, regr_spread = scores_regr.mean(), scores_regr.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (regr_mean, regr_spread))

### Answer
<b>«KNearestNeighbors» with 5 neighbors</b> achieves 2% higher accuracy under 5-fold cross-validation than <b>«LogisticRegression»</b>.