In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training CSV from the Kaggle input directory; this frame is kept
# untouched and copied before any edits are made
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
# Preview the first rows
train_data.head()

In [None]:
# Load the test CSV from the Kaggle input directory; its PassengerId column
# is reused later when the submission file is written
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
# Preview the first rows
test_data.head()

In [None]:
train_data.head().T

# Make a copy of the original dataframe
So when we manipulate the copy, we've still got our original data

In [None]:
# Make a copy of the original dataframe to perform edits on.
df_tmp = train_data.copy()

In [None]:
# Check the values of different columns
df_tmp.Ticket.value_counts()

In [None]:
df_tmp.head()

In [None]:
len(df_tmp)

In [None]:
df_tmp.info()

## Manipulate the Data into numbers

In [None]:
df_tmp["Name"].dtype

In [None]:
df_tmp.isna().sum()

## Convert string into categories
One way we can turn all of our data into numbers is by converting them into pandas categories

In [None]:
# Find the columns which contain strings and convert them to ordered categories.
# Object-dtype columns are matched explicitly as well: on pandas >= 2.0,
# is_string_dtype() is stricter for object columns and can return False when a
# column also holds missing values, which would leave that column unconverted.
for label, content in df_tmp.items():
    holds_text = (pd.api.types.is_object_dtype(content)
                  or pd.api.types.is_string_dtype(content))
    if holds_text:
        df_tmp[label] = content.astype("category").cat.as_ordered()

In [None]:
df_tmp.info()

In [None]:
df_tmp.Name.cat.categories

In [None]:
df_tmp.Name.cat.codes

In [None]:
# Check the missing data
df_tmp.isnull().sum()/len(df_tmp)

In [None]:
df_tmp.isna().sum()

## Fill the missing values first
Fill numerical values first

In [None]:
# Show the name of every numeric column in the working copy
numeric_labels = [
    name for name, column in df_tmp.items()
    if pd.api.types.is_numeric_dtype(column)
]
for numeric_label in numeric_labels:
    print(numeric_label)

In [None]:
# List the numeric columns that contain at least one missing value
for label, content in df_tmp.items():
    is_numeric = pd.api.types.is_numeric_dtype(content)
    if is_numeric and content.isna().any():
        print(label)

In [None]:
# Replace missing entries in each numeric column with that column's median
for label in list(df_tmp.columns):
    column = df_tmp[label]
    if not pd.api.types.is_numeric_dtype(column):
        continue
    if column.isna().any():
        # (Left disabled) binary flag marking which rows were originally missing:
        # df_tmp[label+"_is_missing"] = pd.isnull(column)
        df_tmp[label] = column.fillna(column.median())

In [None]:
# Re-check after filling: any numeric column printed here still has nulls
columns_with_nulls = [
    name for name, column in df_tmp.items()
    if pd.api.types.is_numeric_dtype(column) and pd.isnull(column).sum() > 0
]
for name in columns_with_nulls:
    print(name)

In [None]:
df_tmp.isna().sum()

# Filling and turning categorical variables into numbers

In [None]:
# Encode every non-numeric column as integer category codes
non_numeric_labels = [
    name for name, column in df_tmp.items()
    if not pd.api.types.is_numeric_dtype(column)
]
for name in non_numeric_labels:
    # (Left disabled) binary flag marking which rows were originally missing:
    # df_tmp[name+"_is_missing"] = pd.isnull(df_tmp[name])
    # pandas uses code -1 for missing values, so +1 moves them to 0 and
    # makes the real categories start at 1
    df_tmp[name] = pd.Categorical(df_tmp[name]).codes + 1

In [None]:
pd.Categorical(df_tmp["Cabin"]).codes+1

In [None]:
df_tmp.info()

In [None]:
df_tmp.head().T

In [None]:
df_tmp.isna().sum()

In [None]:
# Separate the target column (y) from the feature columns (X)
y = df_tmp["Survived"]
X = df_tmp.drop(columns=["Survived"])

In [None]:
X

In [None]:
y

In [None]:
# Import all the tools we need

import matplotlib.pyplot as plt
import seaborn as sns

# we want our plots to appear inside the notebook
%matplotlib inline 

# Models from Scikit-Learn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Model Evaluations
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
#from sklearn.metrics import confusion_matrix, classification_report
#from sklearn.metrics import precision_score, recall_score, f1_score
#from sklearn.metrics import plot_roc_curve


In [None]:
# Seed NumPy's global random generator; train_test_split is called without a
# random_state, so its shuffle is drawn from this global state
np.random.seed(42)

# Hold out 20% of the rows as a validation set
split_parts = train_test_split(X, y, test_size=0.2)
X_train, X_val, y_train, y_val = split_parts

In [None]:
X_train

In [None]:
y_train, len(y_train)


## Modelling

Now we've got our data into training and validation sets, it's time to build the model.

We'll train it (find the patterns) on the training set.

And we'll validate it (use the patterns) on the validation set.



In [None]:
from sklearn.linear_model import SGDClassifier

# random_state pins SGD's data shuffling so the scores are reproducible on re-run
clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=70, random_state=42)
clf.fit(X, y)

# Accuracy on the rows the model was fitted on is optimistic, so also report
# 5-fold cross-validated accuracy: each fold is scored by a clone of clf that
# was fitted without that fold. clf itself stays fitted on all of X, y.
clf_cv_accuracy = cross_val_score(clf, X, y, cv=5).mean()

{"train_accuracy": clf.score(X, y), "cv_accuracy": clf_cv_accuracy}

## Let's try XGBoost

In [None]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier()
xgb_model.fit(X, y)

# Accuracy on the rows the model was fitted on is optimistic, so also report
# 5-fold cross-validated accuracy: each fold is scored by a clone of the model
# that was fitted without that fold. xgb_model stays fitted on all of X, y.
xgb_cv_accuracy = cross_val_score(xgb_model, X, y, cv=5).mean()

{"train_accuracy": xgb_model.score(X, y), "cv_accuracy": xgb_cv_accuracy}

## Now Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

gnb_model = GaussianNB()
gnb_model.fit(X, y)

# Accuracy on the rows the model was fitted on is optimistic, so also report
# 5-fold cross-validated accuracy: each fold is scored by a clone of the model
# that was fitted without that fold. gnb_model stays fitted on all of X, y.
gnb_cv_accuracy = cross_val_score(gnb_model, X, y, cv=5).mean()

{"train_accuracy": gnb_model.score(X, y), "cv_accuracy": gnb_cv_accuracy}

# Let's convert the test data

In [None]:
# First we'll copy the test data to protect the original data:
# all edits below are made on df_test, while test_data stays as loaded
df_test = test_data.copy()

In [None]:
df_test.head()

In [None]:
df_test.info()

In [None]:
len(df_test)

In [None]:
# Find the columns which contain strings and convert them to ordered categories.
# Object-dtype columns are matched explicitly as well: on pandas >= 2.0,
# is_string_dtype() is stricter for object columns and can return False when a
# column also holds missing values, which would leave that column unconverted.
for label, content in df_test.items():
    holds_text = (pd.api.types.is_object_dtype(content)
                  or pd.api.types.is_string_dtype(content))
    if holds_text:
        df_test[label] = content.astype("category").cat.as_ordered()

In [None]:
df_test.info()

In [None]:
# Check the missing data
df_test.isnull().sum()/len(df_test)

In [None]:
df_test.isna().sum()

In [None]:
# Fill numeric values first: start by listing the numeric columns of the test copy
test_numeric_labels = [
    name for name, column in df_test.items()
    if pd.api.types.is_numeric_dtype(column)
]
for test_numeric_label in test_numeric_labels:
    print(test_numeric_label)

In [None]:
# List the numeric test columns that contain at least one missing value
for label, content in df_test.items():
    is_numeric = pd.api.types.is_numeric_dtype(content)
    if is_numeric and content.isna().any():
        print(label)

In [None]:
# Fill missing numeric values in the test set.
# The medians are taken from the training data (train_data) rather than from
# the test set itself, so the test rows are imputed with the same kind of
# statistic the training copy was filled with. A column that does not exist in
# train_data falls back to its own median.
for label, content in df_test.items():
    if pd.api.types.is_numeric_dtype(content) and content.isna().any():
        reference = train_data[label] if label in train_data.columns else content
        df_test[label] = content.fillna(reference.median())

In [None]:
# Re-check after filling: any numeric test column printed here still has nulls
test_columns_with_nulls = [
    name for name, column in df_test.items()
    if pd.api.types.is_numeric_dtype(column) and pd.isnull(column).sum() > 0
]
for name in test_columns_with_nulls:
    print(name)

In [None]:
df_test.isna().sum()

In [None]:
# Turn the non-numeric test columns into integer codes.
# The codes are built from the categories found in the training data
# (train_data), so a given value (e.g. one particular Ticket) maps to the same
# number in both sets instead of being numbered independently per set.
# pandas gives code -1 to anything outside the categories (missing values and
# values never seen in training); the +1 shifts those to 0.
for label, content in df_test.items():
    if not pd.api.types.is_numeric_dtype(content):
        if label in train_data.columns:
            train_categories = pd.Categorical(train_data[label]).categories
            # astype(object) hands pandas the raw values, whether the column is
            # still object dtype or was already converted to a category
            codes = pd.Categorical(content.astype(object),
                                   categories=train_categories).codes
        else:
            # No training counterpart to align with: encode from the test values
            codes = pd.Categorical(content).codes
        df_test[label] = codes + 1

In [None]:
pd.Categorical(df_test["Cabin"]).codes+1

In [None]:
df_test.info()

In [None]:
df_test.isna().sum()

In [None]:
clf.predict(df_test)

In [None]:
#predictions = clf.predict(df_test)#

#output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
#output.to_csv('925A_submission.csv', index=False)
#print("Your submission was successfully saved!")

In [None]:
xgb_model.predict(df_test)

In [None]:
#predictions = xgb_model.predict(df_test)

#output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
#output.to_csv('xgb_submission.csv', index=False)
#print("Your submission was successfully saved!")

In [None]:
gnb_model.predict(df_test)

In [None]:
#predictions = gnb_model.predict(df_test)

#output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
#output.to_csv('gnb_submission.csv', index=False)
#print("Your submission was successfully saved!")

# Let's reformat the data with relevant information

In [None]:
# Let's look at the orignal format to test the data
# features = ["Pclass", "Sex", "SibSp", "Parch"]
#X = pd.get_dummies(train_data[features])
#X_test = pd.get_dummies(test_data[features])

In [None]:
# What does our data look like now?
X.info()

In [None]:
df_test.info()

# What are the data definitions?
**Data Dictionary**

| Variable | Definition | Key |
|---|---|---|
| survival | Survival | 0 = No, 1 = Yes |
| pclass | Ticket class | 1 = 1st, 2 = 2nd, 3 = 3rd |
| sex | Sex | |
| Age | Age in years | |
| sibsp | # of siblings / spouses aboard the Titanic | |
| parch | # of parents / children aboard the Titanic | |
| ticket | Ticket number | |
| fare | Passenger fare | |
| cabin | Cabin number | |
| embarked | Port of Embarkation | C = Cherbourg, Q = Queenstown, S = Southampton |

### The original approach only used these features: features = ["Pclass", "Sex", "SibSp", "Parch"]
### Let's test our models on a wider set of features, including "Pclass", "Sex", "Parch", "Age", "Ticket" and "Fare"
We'll drop "Name" and "SibSp"; every other column stays in the feature set.

In [None]:
# Reformat: drop the columns we don't want the models to use.
# errors="ignore" keeps this cell safe to re-run: X is rebound here, so on a
# second run the columns are already gone and a plain drop would raise KeyError.
X = X.drop(columns=["Name", "SibSp"], errors="ignore")
#X=  X.drop("Embarked", axis=1)
X.info()

In [None]:
# Reformat df_test: drop the same columns that were dropped from X.
# errors="ignore" keeps this cell safe to re-run: df_test is rebound here, so on
# a second run the columns are already gone and a plain drop would raise KeyError.
df_test = df_test.drop(columns=["Name", "SibSp"], errors="ignore")
#df_test = df_test.drop("Embarked", axis=1)
df_test.info()

In [None]:
# Let's try our models again with the new reformatted data
# random_state pins SGD's data shuffling so the scores are reproducible on re-run
clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=70, random_state=42)
clf.fit(X, y)

# Accuracy on the rows the model was fitted on is optimistic, so also report
# 5-fold cross-validated accuracy: each fold is scored by a clone of clf that
# was fitted without that fold. clf itself stays fitted on all of X, y.
clf_cv_accuracy = cross_val_score(clf, X, y, cv=5).mean()

{"train_accuracy": clf.score(X, y), "cv_accuracy": clf_cv_accuracy}

# 63%

In [None]:
# Next: XGBoost on the reformatted data
xgb_model = XGBClassifier()
xgb_model.fit(X, y)

# Accuracy on the rows the model was fitted on is optimistic, so also report
# 5-fold cross-validated accuracy: each fold is scored by a clone of the model
# that was fitted without that fold. xgb_model stays fitted on all of X, y.
xgb_cv_accuracy = cross_val_score(xgb_model, X, y, cv=5).mean()

{"train_accuracy": xgb_model.score(X, y), "cv_accuracy": xgb_cv_accuracy}

# Tough to tell. This one scored perfect the first time. Overfit, probably.

In [None]:
# Next: Gaussian Naive Bayes on the reformatted data
gnb_model = GaussianNB()
gnb_model.fit(X, y)

# Accuracy on the rows the model was fitted on is optimistic, so also report
# 5-fold cross-validated accuracy: each fold is scored by a clone of the model
# that was fitted without that fold. gnb_model stays fitted on all of X, y.
gnb_cv_accuracy = cross_val_score(gnb_model, X, y, cv=5).mean()

{"train_accuracy": gnb_model.score(X, y), "cv_accuracy": gnb_cv_accuracy}

# 79%


In [None]:
# Let's try this one again with increasing the iterations as recommended.
# random_state pins SGD's data shuffling, so a change in score compared with
# another seeded run reflects max_iter rather than a different random shuffle.
clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=150, random_state=42)
clf.fit(X, y)

# Accuracy on the rows the model was fitted on is optimistic, so also report
# 5-fold cross-validated accuracy: each fold is scored by a clone of clf that
# was fitted without that fold. clf itself stays fitted on all of X, y.
clf_cv_accuracy = cross_val_score(clf, X, y, cv=5).mean()

{"train_accuracy": clf.score(X, y), "cv_accuracy": clf_cv_accuracy}

# Even better. 6 percent up.

In [None]:
# Let's try RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

# random_state=1 makes the forest reproducible; max_depth=5 limits tree depth
rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
rf.fit(X, y)

# Accuracy on the rows the model was fitted on is optimistic, so also report
# 5-fold cross-validated accuracy: each fold is scored by a clone of rf that
# was fitted without that fold. rf itself stays fitted on all of X, y.
rf_cv_accuracy = cross_val_score(rf, X, y, cv=5).mean()

{"train_accuracy": rf.score(X, y), "cv_accuracy": rf_cv_accuracy}

# Yes! That looks good!
## Let's submit a test

In [None]:
# Predict on the prepared test frame and write the submission CSV
predictions = rf.predict(df_test)

# PassengerId comes from the untouched test_data; predictions are attached
# in row order as the Survived column
output = test_data[['PassengerId']].assign(Survived=predictions)
output.to_csv('rf_submission.csv', index=False)
print("Your submission was successfully saved!")

## Let's tune the hyperparameters of the RandomForest model. Best score so far, but still at 77%.

In [None]:
# Create a hyperparameter grid for RandomForestClassifier
#rf_grid = {"n_estimators": np.arange(10, 1000, 50),
#           "max_depth": [None, 3, 5, 10],
#          "min_samples_split": np.arange(2, 20, 2),
#         "min_samples_leaf": np.arange(1, 20, 2)}

In [None]:
# Setup random seed
#np.random.seed(42)

# Setup random hyperparameter search for RandomForestClassifier
#rs_rf = RandomizedSearchCV(RandomForestClassifier(), 
#                           param_distributions=rf_grid,
#                           cv=5,
#                           n_iter=20,
#                           verbose=True)

# Fit random hyperparameter search model for RandomForestClassifier()
#rs_rf.fit(X, y)

In [None]:
# Find the best hyperparameters
#rs_rf.best_params_

In [None]:
#rs_rf.score(X, y)

# Nice! 90%. Let's submit this one.

In [None]:
#predictions = rs_rf.predict(df_test)

#output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
#output.to_csv('rs_rf_submission.csv', index=False)
#print("Your submission was successfully saved!")

## Okay, that one came back @ 77%. Hmmm.
# Let's try Logistic Regression and Hypertune parameters. 

In [None]:
#log_reg_model = LogisticRegression()
#log_reg_model.fit(X, y)

#log_reg_model.score(X, y)

## Not bad. 78% and it stopped. Let's tune it now.

In [None]:
#log_reg_grid = {"C": np.logspace(-4, 4, 20),
#                "solver": ["liblinear"]}

In [None]:
# Now we've set up a grid, let's hypertune!

#np.random.seed(42)

# Setup random hyperparameter search for LogisticRegression
#rs_log_reg = RandomizedSearchCV(LogisticRegression(),
#                                param_distributions=log_reg_grid,
#                                cv=5,
#                                n_iter=20,
#                                verbose=True)

# Fit random hyperparameter search model for LogisticRegression
#rs_log_reg.fit(X, y)

In [None]:
#rs_log_reg.best_params_

In [None]:
#rs_log_reg.score(X, y)

In [None]:
## Let's see if we can improve 80% with GridSearchCV
# Different hyperparameters for our LogisticRegression model
#log_reg_grid = {"C": np.logspace(-4, 4, 30),
#                "solver": ["liblinear"]}

# Setup grid hyperparameter search for LogisticRegression
#gs_log_reg = GridSearchCV(LogisticRegression(),
#                          param_grid=log_reg_grid,
#                          cv=5,
#                          verbose=True)

# Fit grid hyperparameter search model
#gs_log_reg.fit(X, y)

In [None]:
# Check the best hyperparmaters
#gs_log_reg.best_params_

In [None]:
# Evaluate the grid search LogisticRegression model
#gs_log_reg.score(X, y)

In [None]:
## Same, but's let's see what the actual comes back at us
#predictions = gs_log_reg.predict(df_test)

#output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
#output.to_csv('gslogreg_submission.csv', index=False)
#print("Your submission was successfully saved!")

# 75% Let's try Decision Tree

In [None]:
#from sklearn.tree import DecisionTreeClassifier
#dtc_model = DecisionTreeClassifier(random_state=42)

#dtc_model.fit(X, y)

#dtc_model.score(X, y)

In [None]:
# Overfit? Let's submit anyway
#predictions = dtc_model.predict(df_test)

#output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
#output.to_csv('dtc_model_submission.csv', index=False)
#print("Your submission was successfully saved!")