# Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Load Data

In [2]:
df = pd.read_csv("train.csv")

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Processing

In [4]:
def titanic_preparation(df, means_to_use=None, scalers_to_use=None, ohe_columns=None):
    ##### BASICS #####
    df_processed = df.drop(columns=['PassengerId'])
    df_processed['Sex'] = df_processed['Sex'].apply(lambda val: 1 if val == 'male' else 0)
    df_processed.drop(columns=["Name"], inplace=True)
    
    ##### SIMPLE OHE #####
    ohe_cols_embarked = pd.get_dummies(df_processed['Embarked'], dummy_na=True, prefix='Embarked')
    df_processed = pd.concat([
        df_processed.drop(columns=['Embarked']),
        ohe_cols_embarked
    ], axis=1)
    
    ##### NAN #####
    if means_to_use is not None:
        fare_mean = means_to_use["Fare"]
        age_mean = means_to_use["Age"]
        new_means_to_use = means_to_use
    else:
        fare_mean = df_processed['Fare'].mean()
        age_mean = df_processed['Age'].mean()
        new_means_to_use = {
            "Fare": fare_mean,
            "Age": age_mean
        }
        
    df_processed['Fare'].fillna(value=fare_mean, inplace=True)
    df_processed['Age'].fillna(value=age_mean, inplace=True)
    
    #### IsChild ####
    df_processed['Is_Child'] = df_processed['Age'].apply(lambda age: 0 if age > 18 else 1)
    
    #### Family ####
    df_processed['Family'] = df_processed['SibSp'] + df_processed['Parch'] + 1
    
    ###### STANDARDIZATION #####
    if scalers_to_use is not None:
        fare_scaler = scalers_to_use["Fare"]
        age_scaler = scalers_to_use["Age"]
        new_scalers_to_use = means_to_use
    else:
        fare_scaler = StandardScaler()
        fare_scaler.fit(df_processed['Fare'].values.reshape(-1,1))
        age_scaler = StandardScaler()
        age_scaler.fit(df_processed['Age'].values.reshape(-1,1))
        new_scalers_to_use = {
            "Fare": fare_scaler,
            "Age": age_scaler
        }
        
    df_processed['Fare'] = fare_scaler.transform(df_processed['Fare'].values.reshape(-1,1)) 
    df_processed['Age'] = fare_scaler.transform(df_processed['Age'].values.reshape(-1,1)) 
    
    ##### HARD OHE #####
    def complete_other(row):
        return 1 if np.sum(row) == 0 else 0 
        
    ohe_ticket = pd.get_dummies(df_processed['Ticket'], prefix="Ticket", dummy_na=True)
    
    if ohe_columns is not None:
        ohe_ticket_columns_to_keep = ohe_columns["Ticket"]
        missing_tickets = [col for col in ohe_ticket_columns_to_keep if col not in ohe_ticket]
        for column in missing_tickets:
            ohe_ticket[column] = 0
    else:
        ohe_ticket_columns_to_keep = ohe_ticket.sum().sort_values(ascending=False)[:100].index
        
    ohe_ticket_final = ohe_ticket[ohe_ticket_columns_to_keep]
    ohe_ticket_final["Ticket_Other"] = ohe_ticket_final.apply(complete_other, axis=1)
    
    df_processed = pd.concat([
        df_processed.drop(columns=['Ticket']),
        ohe_ticket_final
    ], axis=1)
    
    ohe_cabin = pd.get_dummies(df_processed['Cabin'], prefix="Cabin", dummy_na=True)
    if ohe_columns is not None:
        ohe_cabin_columns_to_keep = ohe_columns["Cabin"]
        missing_cabin = [col for col in ohe_cabin_columns_to_keep if col not in ohe_cabin]
        for column in missing_cabin:
            ohe_cabin[column] = 0
    else:
        ohe_cabin_columns_to_keep = ohe_cabin.sum().sort_values(ascending=False)[:100].index
    
    ohe_cabin_final = ohe_ticket[ohe_ticket_columns_to_keep]
    ohe_cabin_final["Cabin_Other"] = ohe_ticket_final.apply(complete_other, axis=1)
    df_processed = pd.concat([
        df_processed.drop(columns=['Cabin']),
        ohe_cabin_final
    ], axis=1)
    
    ohe_columns = {
        'Ticket': ohe_ticket_columns_to_keep,
        'Cabin': ohe_cabin_columns_to_keep
    }
    
    
    
    ##### RETRUN #####
    return df_processed, new_means_to_use, new_scalers_to_use, ohe_columns

In [5]:
train_df, means, scalers, ohe_columns = titanic_preparation(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## Train

In [6]:
X1 = train_df.drop(columns=['Survived'])
Y1 = train_df['Survived']

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [8]:
model = RandomForestClassifier(n_estimators=343, max_depth=11)

In [9]:
model.fit(X1, Y1)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=11, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=343, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [10]:
predicted_train_tmp = model.predict(X1)
accuracy_score(Y1, predicted_train_tmp)

0.9034792368125701

In [11]:
np.mean(cross_val_score(model, X1, Y1, cv=5))

0.8261045603799152

## Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [13]:
X2 = train_df.drop(columns=['Survived'])
Y2 = train_df['Survived']

In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(X2, Y2, test_size=0.3)

In [15]:
param_distributions = { 
    'n_estimators': randint(1,601),
    'max_features': ['auto'],
    'max_depth' : randint(1,51),
    'criterion' :['gini', 'entropy']
}

In [16]:
model_forest = RandomForestClassifier(random_state=0)

In [17]:
model_forest_grid = RandomizedSearchCV(estimator=model_forest, param_distributions=param_distributions,n_iter=100, cv= 5)

In [18]:
model_forest_grid.fit(X_train, Y_train)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=None,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001A0CD5AF908>, 'max_features': ['auto'], 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001A0CD5AFA58>, 'criterion': ['gini', 'entropy']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [19]:
model_forest_grid.best_params_

{'criterion': 'entropy',
 'max_depth': 12,
 'max_features': 'auto',
 'n_estimators': 85}

In [20]:
train_forest_predicted = model_forest_grid.predict(X_train)

In [21]:
test_forest_predicted = model_forest_grid.predict(X_test)

In [22]:
print("Test Accuracy : " + str(accuracy_score(Y_test, test_forest_predicted)) + " %")

Test Accuracy : 0.8395522388059702 %


In [23]:
print("Train Accuracy : " + str(model_forest_grid.best_score_) + " %")

Train Accuracy : 0.8250401284109149 %


In [24]:
confusion_matrix(Y_test, test_forest_predicted)

array([[159,  15],
       [ 28,  66]], dtype=int64)

# Test

In [25]:
raw_test_df = pd.read_csv('test.csv')

In [26]:
test_df, _, _,_ = titanic_preparation(raw_test_df, means, scalers, ohe_columns)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [27]:
test_predicted = model_forest_grid.predict(test_df)

In [28]:
submission_df = pd.concat([
        raw_test_df["PassengerId"], 
        pd.Series(test_predicted, name="Survived")
    ], axis=1)

In [29]:
submission_df.to_csv("Submission_titanic.csv", index=False)

## Meilleur score

{'criterion': 'entropy',
 'max_depth': 12,
 'max_features': 'auto',
 'n_estimators': 85}