# Titanic - Data Cleaning and Exploration

## Import

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Load the raw competition files and keep them untouched for later reference.
raw_train = pd.read_csv('train.csv')
raw_test = pd.read_csv('test.csv')

# Work on independent copies: a plain assignment would only alias the raw
# frames, so every in-place cleaning step below would also modify them.
train = raw_train.copy()
test = raw_test.copy()

## Inspection

In [None]:
#Display the training dataset (a notebook cell renders the value of its last expression)
train

In [None]:
#Get summary statistics of the training data via DataFrame.describe()
train.describe()

In [None]:
#Get information about the DataFrame (printed by DataFrame.info())
train.info()

## Cleaning

In [None]:
# List the rows flagged by DataFrame.duplicated(); an empty result means no duplicates
train.loc[train.duplicated()]

In [None]:
#Find the number of unique values in each column
train.nunique()

In [None]:
# Count the missing entries in every column of the training data
train.isna().sum()

In [None]:
# Inspect the rows whose Embarked value is missing
train.loc[train['Embarked'].isna()]

In [None]:
# Impute the missing Embarked values with the most frequent value.
# mode() returns a pandas Series (there can be ties), hence the [0] to take the first entry.
mode = train['Embarked'].mode()
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# the selected column: the in-place form operates on an intermediate object,
# is deprecated, and leaves `train` unchanged under pandas copy-on-write.
train['Embarked'] = train['Embarked'].fillna(mode[0])

In [None]:
# Display the rows with PassengerId 62 and 830 to check the filled-in Embarked values
replaced_ids = [62, 830]
train.loc[train['PassengerId'].isin(replaced_ids)]

In [None]:
# Looking at the correlation matrix below, it seems Age is correlated with Pclass.
# numeric_only=True restricts the matrix to numeric columns; on pandas >= 2.0
# corr() raises on text columns instead of silently skipping them.
train.corr(numeric_only=True)

# You could also encode other categorical variables to see them in the matrix

In [None]:
#SibSp and Parch are also correlated with Age.
#We could use Regression with Age as target and Pclass, Parch and SibSp as Features.
#https://towardsdatascience.com/predict-missing-values-in-the-dataset-897912a54b7b

#But for now we will simply assign the mean age of the Pclass to the rows with missing Age

In [None]:
# Per-Pclass mean of every numeric column (the average Age differs between classes).
# numeric_only=True keeps text columns out of the aggregation; pandas >= 2.0
# raises a TypeError when asked to average them.
train.groupby('Pclass').mean(numeric_only=True)

In [None]:
# Replace missing values of Age with the mean Age of the passenger's Pclass.
# The per-class means are computed from the training data and reused for test.
age_mean_by_pclass = train.groupby('Pclass')['Age'].mean()
train['Age'] = train['Age'].fillna(train['Pclass'].map(age_mean_by_pclass))


# For test data: look the mean up through each test row's own Pclass.
# Filling with train's transform('mean') Series would align on the row index,
# so test row i would receive the class mean of the unrelated train row i.
test['Age'] = test['Age'].fillna(test['Pclass'].map(age_mean_by_pclass))


In [None]:
# Count the missing entries in every column of the test data
test.isna().sum()

In [None]:
# Fill missing Fare values in the test data with the mean Fare of the train data
train_fare_mean = train['Fare'].mean()
test['Fare'] = test['Fare'].fillna(train_fare_mean)

In [None]:
# Re-check the missing-value counts of the training data after imputation
train.isna().sum()

In [None]:
# Re-check the missing-value counts of the test data after imputation
test.isna().sum()

In [None]:
# New feature: the first character of the Cabin string
train['Cabin_Class'] = train['Cabin'].str.get(0)

# Same feature for the test data
test['Cabin_Class'] = test['Cabin'].str.get(0)

In [None]:
#Display the training data, now including the Cabin_Class column
train

In [None]:
# Contingency table: Cabin_Class as rows, Pclass as columns, no totals
pd.crosstab(index=train['Cabin_Class'], columns=train['Pclass'], margins=False)

In [None]:
# Averages of the numeric columns (including Fare) per Cabin_Class.
# numeric_only=True keeps text columns out of the aggregation; pandas >= 2.0
# raises a TypeError when asked to average them.
train.groupby(['Cabin_Class']).mean(numeric_only=True)

In [None]:
# Replace the missing values in Cabin_Class with "U" for Unknown.
# The filled column is assigned back: fillna(..., inplace=True) on a selected
# column acts on an intermediate object, is deprecated, and leaves the frame
# unchanged under pandas copy-on-write.
train['Cabin_Class'] = train['Cabin_Class'].fillna('U')


# Same replacement for the test data
test['Cabin_Class'] = test['Cabin_Class'].fillna('U')

### One Hot Encoding Embarked and Cabin_Class

In [None]:
# Separate the target (y) from the features (X) before one-hot encoding, to ensure
# a similar encoding dictionary and the same number of columns for train and test

y_train = train.loc[:, 'Survived']
y_train


In [None]:
# Remove the target column from the feature frame in place, then display the frame
train.drop(labels='Survived', axis='columns', inplace=True)
train

In [None]:
from feature_engine.encoding import OneHotEncoder

# Create the encoder instance.
# drop_last=True returns k-1 dummy columns per variable; False returns k.
ohe = OneHotEncoder(top_categories=None, variables=['Cabin_Class', 'Embarked'], drop_last=True)

# Fit on the training data only. A second fit on `test` would discard what was
# learned from `train` and define the encoding from the test set instead, so
# any category present only in `train` would lose its dummy column.
# The same fitted encoder is then used to transform both train and test.
ohe.fit(train)

In [None]:
# Apply the fitted one-hot encoder to the training data and display the result
train = temp = ohe.transform(train)
train

In [None]:
# Apply the same one-hot encoder to the test data and display the result
test = temp2 = ohe.transform(test)
test

In [None]:
# Missing-value counts of the training data after encoding
train.isna().sum()

In [None]:
# The Cabin feature carries little information: it has many distinct values and
# most of them are missing. It is dropped together with the other
# uninformative columns.

train.drop(labels=['Cabin', 'Ticket', 'Name', 'PassengerId'], axis=1, inplace=True)
train

In [None]:
# Drop the same columns from the test data
test.drop(labels=['Cabin', 'Ticket', 'Name', 'PassengerId'], axis=1, inplace=True)
test

### Check for Multicollinearity

In [None]:
# Absolute pairwise correlations of the numeric features.
# Sex is not yet encoded at this point, so numeric_only=True is needed:
# pandas >= 2.0 raises on text columns instead of silently skipping them.
corr = train.corr(numeric_only=True).abs()

# Flatten the matrix into a Series indexed by (column, column) pairs
s = corr.unstack()
s

In [None]:
# Sort the pairwise correlations in descending order and keep them as a one-column frame
so = s.sort_values(ascending=False, kind="quicksort").to_frame()
# Show the ten strongest pairs, skipping entries equal to 1 (a column with itself)
so.loc[so[0] != 1].head(10)

In [None]:
#The top 4 might be something to worry about, but it makes sense because they are the one-hot encoded features

### Outliers

In [None]:
# Scatter plot of Fare against Age, to check whether older people buy expensive tickets

ages = train['Age']
fares = train['Fare']
plt.scatter(x=ages, y=fares)
plt.xlabel('Age')
plt.ylabel('Fare')
plt.show()

In [None]:
# List the Fare outliers: rows with a Fare above 300
train.loc[train['Fare'].gt(300)]

In [None]:
#We could replace these outliers with the mean if we thought they were erroneous but we ignore them for now.

## Preparing Data for ML

### Encoding Categorical Variables

In [None]:
#Embarked and Cabin_Class already One Hot Encoded

In [None]:
#Sex is binary - so Ordinal Encoding
# Using Feature Engine because it allows us to code multiple variables at a time (sklearn doesn't) and allows us to 
#view the dictionary later (pandas doesn't)

from feature_engine.encoding import OrdinalEncoder

In [None]:
# Ordinal encoder for the Sex column, using the 'arbitrary' encoding method
ordinal_enc = OrdinalEncoder(variables=['Sex'], encoding_method='arbitrary')

# Learn the category-to-number mapping from the training data
ordinal_enc.fit(train)

In [None]:
# In the encoder dict we can observe the numbers
# assigned to each category for all the indicated variables
# (here only 'Sex', the single variable passed to the encoder).

ordinal_enc.encoder_dict_

In [None]:
# This is the list of variables that the encoder will transform

ordinal_enc.variables_

In [None]:
#Transform Train: apply the fitted ordinal encoder (configured for the Sex column)
train = ordinal_enc.transform(train)

train

In [None]:
#Transform Test using the same Encoder object, so test gets the mapping learned from train

test = ordinal_enc.transform(test)

test

## Logistic Regression

In [None]:
# Convert the feature frames and the target to NumPy arrays
nptrain = train.to_numpy()
nptest = test.to_numpy()
y_train = y_train.to_numpy()

In [None]:
#Import the class and create an instance
from sklearn.linear_model import LogisticRegression
# NOTE(review): max_iter is set to 1000, presumably so the solver converges on the unscaled features - confirm
model = LogisticRegression(max_iter = 1000)

In [None]:
# Train the logistic regression on the training features and labels
model.fit(X=nptrain, y=y_train)

In [None]:
# Predict the label of every test row and wrap the result in a DataFrame
predicted_labels = model.predict(nptest)
prediction = pd.DataFrame(predicted_labels)

In [None]:
#Display the predictions for the test data
prediction

### Formatting Results for Submission

In [None]:
#Reload the original test file; its PassengerId column is used below to build the submission
result = pd.read_csv('test.csv')

In [None]:
# Keep only the PassengerId column (as a Series)
result = result.loc[:, 'PassengerId']

In [None]:
# Turn the PassengerId Series back into a one-column DataFrame and display it
result = result.to_frame()
result

In [None]:
# Attach the predicted labels (the single column, 0, of `prediction`) as the Survived column
result['Survived'] = prediction[0]

In [None]:
#Display the final submission table (PassengerId and Survived)
result

In [None]:
from pathlib import Path

# Build the output path portably: the previous literal used Windows-only
# backslash separators, which on POSIX creates a single oddly named file in
# the working directory instead of writing into the csv folder.
submission_path = Path('csv') / '1.submission.csv'
# Create the target folder if needed; to_csv fails when it does not exist.
submission_path.parent.mkdir(parents=True, exist_ok=True)

# Make sure to select index=False when saving, so the row index is not written as a column
result.to_csv(submission_path, index=False)