## Z00. Notes - My Machine Learning Project

#### Resources

[Predicting Titanic Survival (Kaggle)](https://www.kaggle.com/c/titanic/data?#)  
[Full Titanic Dataset](http://web.stanford.edu/class/archive/cs/cs109/cs109.1166/problem12.html)  

#### Modules

In [None]:
import pandas as pd
import numpy as np
import pandas_profiling as pdpf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

%matplotlib inline

#### About

|Variable|	Definition|	       Key |
|--------|------------|------------|
|survival|	Survival	| 0 = No, 1 = Yes |
|pclass|	Ticket class	|1 = 1st, 2 = 2nd, 3 = 3rd |
|sex	|Sex	| |
|Age	|Age in years	| |
|sibsp	|# of siblings / spouses aboard the Titanic	| |
|parch	|# of parents / children aboard the Titanic	| |
|ticket	|Ticket number |	 |
|fare	|Passenger fare	| |
|cabin	|Cabin number	| |
|embarked|	Port of Embarkation	| C = Cherbourg, Q = Queenstown, S = Southampton |

#### Exploration

In [None]:
# Importing the Data

# Read the passenger records from the local CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='./data/titanic.csv')

# Kept as the cell's final expression so the notebook displays the column index.
df.columns

In [None]:
# Exploring the Data

# pdpf.ProfileReport(df)
# attributes = ['Survived', 'Pclass', 'Sex', 'Age', 'Siblings/Spouses Aboard','Parents/Children Aboard', 'Fare']
# pd.plotting.scatter_matrix(df[attributes], figsize=(20, 14), s=4 )

#### ML Data Prep

In [None]:
# Manual transformation code

# Hold out 20% of the rows as the test split.
df_train, df_test = train_test_split(df, test_size=0.2)

# Convert the 'Sex' category of the training rows to integer labels.
encoder = LabelEncoder()
sr_sex = df_train["Sex"]
sr_sex_enc = encoder.fit_transform(sr_sex)

# Binarize (one-hot) each discrete column; the result is one array per column.
# NOTE: `encoder` is rebound here, so the LabelEncoder fitted above is no
# longer reachable by name, and the binarizer is re-fitted for every column.
one_hot_variables = ['Pclass', 'Siblings/Spouses Aboard', 'Parents/Children Aboard']
encoder = LabelBinarizer()
one_hot_outputs = list(
    map(lambda name: encoder.fit_transform(df_train[name]), one_hot_variables)
)

# Standardise the fare: the values are reshaped to a single column before
# being handed to the scaler.
scaler = StandardScaler()
sr_fare = df_train['Fare'].values.reshape(-1, 1)
fare_scaled = scaler.fit_transform(sr_fare)

In [None]:
# Dataframe selector

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks a fixed set of columns out of its input.

    The input is indexed with ``attribute_names`` and the ``.values`` of the
    selection is returned (presumably a pandas DataFrame in, array out).
    """

    def __init__(self, attribute_names):
        # Column labels used to index the input in ``transform``.
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing to learn; return the selector itself."""
        return self

    def transform(self, X):
        """Return the values of the configured columns of ``X``."""
        selected = X[self.attribute_names]
        return selected.values

In [None]:
# Pipeline

# OneHotEncoder is imported in this cell because the notebook's module cell
# only brings in LabelEncoder, LabelBinarizer and StandardScaler.
from sklearn.preprocessing import OneHotEncoder

# Target vector, taken from the training split (`df_train`); the previous
# reference to `train` pointed at a name that is never defined.
train_labels = df_train["Survived"].copy()

# Feature columns. 'Survived' is the target and must not appear here,
# otherwise the label leaks into the features the model is trained on.
num_attribs = ['Age', 'Fare', 'Pclass', 'Siblings/Spouses Aboard', 'Parents/Children Aboard']
cat_attribs = ['Sex']

# Numeric branch: select the numeric columns, then standardise them.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: select the categorical columns, then one-hot encode.
# LabelBinarizer cannot be used as a Pipeline step: Pipeline.fit_transform
# passes (X, y) to every step, while LabelBinarizer.fit_transform accepts a
# single argument. OneHotEncoder follows the transformer signature;
# handle_unknown='ignore' keeps transform() from raising on categories that
# were not present when the encoder was fitted.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Run both branches and concatenate their outputs column-wise.
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, ExtraTreeClassifier, ExtraTreeRegressor
from sklearn.svm import SVC, SVR, NuSVC, NuSVR

my_model = ExtraTreeClassifier()

# Prepare the training features with the full pipeline and fit the model.
titanic_prepared = full_pipeline.fit_transform(df_train)
my_model.fit(titanic_prepared, train_labels)

# Error of the fitted model on the same data it was trained on.
titanic_predictions = my_model.predict(titanic_prepared)
mse = mean_squared_error(y_true=train_labels, y_pred=titanic_predictions)
rmse = np.sqrt(mse)

# 10-fold cross-validation. The scoring is the *negated* MSE, so despite its
# name `rmse_scores` holds negative MSE values; they are sign-flipped before
# the square root to obtain per-fold RMSE in `scores`.
rmse_scores = cross_val_score(
    estimator=my_model,
    X=titanic_prepared,
    y=train_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
scores = np.sqrt(-rmse_scores)

def display_scores(scores):
    """Print the scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (for example a
    NumPy array).
    """
    report = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Standard deviation:", scores.std()),
    )
    for label, value in report:
        print(label, value)
    
# Per-fold cross-validation RMSE values, with their mean and standard deviation.
display_scores(scores)
# RMSE of the model's predictions on the same data it was fitted on.
print(rmse)