In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read 'titanic_train.csv' (path is relative to the notebook's working
# directory) into the DataFrame that every later cell works on.
train = pd.read_csv('titanic_train.csv')

In [None]:
# Preview the first rows to check column names and value formats.
train.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
train.shape

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the frame's columns.
train.describe()

# Exploratory Data Analysis

Let's begin with some exploratory data analysis, starting with a look at the missing data.

# Missing Data

We can use seaborn to create a simple heatmap that shows where data is missing.

In [None]:
# One heatmap column per DataFrame column; cells flagged True by isna()
# (i.e. missing) are drawn in the contrasting colour.
sns.heatmap(
    data=train.isna(),
    cmap='viridis',
    cbar=False,
    yticklabels=False,
)

In [None]:
# Passenger counts for each value of Survived.
sns.set_style(style='whitegrid')
sns.countplot(x='Survived', data=train)

In [None]:
# Same survival counts, with each bar split by Sex.
sns.set_style(style='whitegrid')
sns.countplot(
    x='Survived',
    hue='Sex',
    data=train,
    palette='RdBu_r',
)

In [None]:
# Survival counts again, this time split by passenger class.
sns.set_style(style='whitegrid')
sns.countplot(
    x='Survived',
    hue='Pclass',
    data=train,
    palette='rainbow',
)

In [None]:
# Distribution of the known ages (missing values dropped first), drawn as a
# 40-bin histogram with a KDE curve overlaid.
sns.displot(
    data=train['Age'].dropna(),
    bins=40,
    kde=True,
    color='darkred',
)

In [None]:
# The same age distribution using pandas' built-in histogram.
train['Age'].hist(alpha=0.5, color='darkred', bins=40)

In [None]:
# Passenger counts per SibSp value. The frame is passed as data= (as in the
# other countplot cells) because seaborn releases with keyword-only countplot
# arguments reject a positional DataFrame.
sns.countplot(data=train, x='SibSp')

In [None]:
# Distribution of ticket fares on a wide (8x4) figure.
train['Fare'].hist(figsize=(8, 4), color='green', bins=40)

### Cufflinks for plots

Let's take a quick moment to show an example of cufflinks!

In [None]:
# cufflinks is needed for the .iplot() call in the next cell.
# NOTE(review): go_offline() presumably switches plotly to offline rendering --
# confirm against the cufflinks documentation.
import cufflinks as cf
cf.go_offline()

In [None]:
# Interactive version of the fare histogram.
train['Fare'].iplot(
    kind='hist',
    color='green',
    bins=30,
)

# Data Cleaning

In [None]:
# Age distribution within each passenger class.
plt.figure(figsize=(12, 7))
sns.boxplot(data=train, x='Pclass', y='Age', palette='winter')

In [None]:
def impute_age(cols):
    """Return the passenger's age, substituting a per-class default when missing.

    Parameters
    ----------
    cols : sequence or pandas.Series
        Values in the order (Age, Pclass), e.g. one row of
        ``train[['Age', 'Pclass']]`` as handed over by ``DataFrame.apply``
        with ``axis=1``.

    Returns
    -------
    The original age when it is not null; otherwise 37 for class 1,
    29 for class 2 and 24 for any other class.
    """
    # Unpack by iteration instead of ``cols[0]`` / ``cols[1]``: on a Series
    # whose index holds labels ('Age', 'Pclass'), integer keys only work through
    # a deprecated positional fallback that warns (and later fails) in newer
    # pandas. Iteration yields the values in order for Series, lists and tuples.
    age, pclass = tuple(cols)[:2]

    if not pd.isnull(age):
        return age

    # Hard-coded fill-in ages per class.
    # NOTE(review): presumably read off the Age-by-Pclass boxplot -- confirm.
    if pclass == 1:
        return 37
    if pclass == 2:
        return 29
    return 24

In [None]:
# Overwrite Age row by row: impute_age receives each row's (Age, Pclass) pair.
train['Age'] = train[['Age', 'Pclass']].apply(impute_age, axis='columns')

In [None]:
# Re-draw the missing-value map now that Age has been filled in.
sns.heatmap(
    data=train.isna(),
    cmap='viridis',
    cbar=False,
    yticklabels=False,
)

In [None]:
# Remove the Cabin column from the frame in place.
train.drop(columns='Cabin', inplace=True)

In [None]:
# Missing-value map once more, after dropping Cabin.
sns.heatmap(
    data=train.isna(),
    cmap='viridis',
    cbar=False,
    yticklabels=False,
)

# Converting Categorical Features

In [None]:
# Column dtypes and non-null counts -- shows which columns still need encoding.
train.info()

In [None]:
# Preview the indicator columns for Embarked; drop_first=True omits the first
# category so the remaining columns are not perfectly collinear.
(
    pd.get_dummies(data=train['Embarked'], drop_first=True)
    .head()
)

In [None]:
# Indicator-encode the two categorical columns, dropping the first level of each.
# NOTE(review): a row with a missing Embarked value would get all-zero
# indicators here -- confirm whether any such rows remain.
sex = pd.get_dummies(data=train['Sex'], drop_first=True)
embark = pd.get_dummies(data=train['Embarked'], drop_first=True)

In [None]:
# Remove the raw categorical/text columns in place; Sex and Embarked are
# already captured by the `sex` and `embark` indicator frames.
train.drop(
    columns=['Sex', 'Embarked', 'Name', 'Ticket'],
    inplace=True,
)

In [None]:
# Check the remaining columns after the drop.
train.head()

In [None]:
# Preview (without assigning) the frame with the indicator columns appended.
pd.concat(
    [train, sex, embark],
    axis='columns',
)

In [None]:
# Append the indicator columns to the right of the existing ones and rebind `train`.
train = pd.concat(
    [train, sex, embark],
    axis='columns',
)

# Building a Logistic Regression Model

## Train, Test, Split

In [None]:
# Preview the feature matrix: everything except the Survived target.
# drop() returns a new frame here, so `train` itself is unchanged.
train.drop(columns=['Survived'])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns=['Survived']),
    train['Survived'],
    test_size=0.30,
    random_state=101,
)

# Training and Predicting

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# The features are fed in unscaled, which can keep the default solver from
# converging within scikit-learn's default of 100 iterations (it then emits a
# ConvergenceWarning and returns a not-yet-converged fit). Allow more
# iterations so the optimiser can finish.
logmodel = LogisticRegression(max_iter=1000)
logmodel.fit(X_train, y_train)

In [None]:
# Predicted class labels for the held-out rows.
predictions = logmodel.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
accuracy = confusion_matrix(y_test,predictions)

In [None]:
accuracy

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
accuracy