In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read train.csv from the Titanic input folder and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../input/titanic/train.csv')
df.head(n=5)

In [None]:
#Checking for null values

In [None]:
# Count the missing entries in each column (isna is the same check as isnull).
df.isna().sum()

In [None]:
# Draw the boolean missing-value mask as a heatmap; row tick labels are hidden.
sns.heatmap(df.isna(), yticklabels=False)

Roughly 20 percent of the Age data is missing. That proportion is likely small enough to be replaced reasonably with some form of imputation. For the Cabin column, it looks like we are missing too much of the data to do anything useful with it at a basic level. We'll probably drop it later, or turn it into another feature such as "Cabin Known: 1 or 0".

In [None]:
# Give rows without an embarkation port their own explicit category label.
df['Embarked'] = df['Embarked'].fillna(value='Missing_Embarked')

In [None]:
# Median of the Age column (pandas skips missing values by default).
median = df['Age'].median()

In [None]:
# Impute the missing ages with the median computed in the previous cell.
df['Age'] = df.Age.fillna(value=median)

In [None]:
# Re-plot the missing-value mask after the Age and Embarked fills.
sns.heatmap(df.isna(), yticklabels=False)

In [None]:
#As we can see the null values in the Age column have been filled

In [None]:
# Mark unknown cabins with the placeholder string 'Missing'.
df['Cabin'] = df['Cabin'].fillna(value='Missing')

In [None]:
# Re-plot the missing-value mask after the Cabin fill.
sns.heatmap(df.isna(), yticklabels=False)

In [None]:
#All the null values have been handled now

In [None]:
# Preview the first ten rows after the Embarked, Age and Cabin fills.
df.head(10)

In [None]:
#Getting to know the data visually now

In [None]:
sns.set_style('whitegrid')
# Bar chart of how many rows fall in each Survived class.
# The column is named through `x=` and the frame is passed as `data=`:
# seaborn >= 0.12 accepts only `data` positionally, so handing it a bare
# Series no longer produces the per-class counts.
sns.countplot(x='Survived', data=df)

In [None]:
sns.set_style('whitegrid')
# Survived counts split by Sex.
# Columns are referenced by name with `data=df`; seaborn >= 0.12 accepts only
# `data` positionally, so a positional Series is no longer treated as `x`.
sns.countplot(x='Survived', hue='Sex', data=df)

In [None]:
sns.set_style('whitegrid')
# Survived counts split by Pclass.
# Columns are referenced by name with `data=df`; seaborn >= 0.12 accepts only
# `data` positionally, so a positional Series is no longer treated as `x`.
sns.countplot(x='Survived', hue='Pclass', data=df)

In [None]:
# Number of rows for each SibSp value.
# `x=`/`data=` keywords are used because seaborn >= 0.12 accepts only `data`
# positionally.
sns.countplot(x='SibSp', data=df)

In [None]:

# Histogram of the Fare column: 40 bins, green bars, 8x4 figure.
df.Fare.hist(bins=40, color='green', figsize=(8, 4))

In [None]:
#Removing categorical values

In [None]:
#As the Name and Ticket columns do not help in any prediction, we will drop those columns from df

In [None]:
# Continue with a frame that no longer has the Name and Ticket columns.
df = df.drop(columns=['Name', 'Ticket'])

In [None]:
# Collect the names of the columns stored with object dtype (the categorical
# text columns). One pass over the columns is enough; the previous outer loop
# rebuilt the identical list once per column and left `object_list` undefined
# when the frame had no columns.
object_list = [column for column in df.columns if df[column].dtype == 'O']

In [None]:
# Display the object-dtype column names gathered in the previous cell.
object_list

In [None]:
# Print the frequency table of each categorical column between dotted rules.
for column_name in object_list:
    print(
        '.................\n',
        df[column_name].value_counts(),
        '\n.................',
    )

In [None]:
#As we can see, Embarked and Sex have only a few distinct values, so we can apply one-hot encoding here

In [None]:
# One-hot encode Sex and Embarked; drop_first removes the first level of each
# column so the remaining dummies are not redundant.
sex_embarked = pd.get_dummies(df.loc[:, ['Sex', 'Embarked']], drop_first=True)

In [None]:
# Append the Sex/Embarked dummy columns to df, side by side.
df = pd.concat([df, sex_embarked], axis='columns')

In [None]:
# Preview df with the sex_embarked dummy columns appended.
df.head()

In [None]:
# The raw Sex and Embarked columns are now replaced by their dummies; remove them.
df = df.drop(columns=['Sex', 'Embarked'])

In [None]:
# Preview the first rows after the raw Sex/Embarked columns were removed,
# rather than rendering the whole frame.
df.head()

In [None]:
#Handling Cabin. As Cabin has many distinct values, we will first replace each value
#with the first letter of the string.
#After that, we will apply one-hot encoding

In [None]:
# Keep only the first character of each Cabin string (the placeholder
# 'Missing' therefore becomes 'M').
df['Cabin'] = df['Cabin'].str.get(0)

In [None]:
# Preview df after Cabin was reduced to its first character.
df.head()

In [None]:
# One-hot encode the Cabin letter. The 'Cabin' prefix gives the new columns
# self-describing names (Cabin_B, Cabin_C, ...) in line with the prefixed
# Sex_/Embarked_ dummies, instead of bare single letters. drop_first removes
# the first level.
Cabin_one = pd.get_dummies(df['Cabin'], prefix='Cabin', drop_first=True)

In [None]:
# Append the Cabin dummy columns to df, side by side.
df = pd.concat([df, Cabin_one], axis='columns')


In [None]:
# Fare is left out of this model by choice, and the raw Cabin column has been
# replaced by its dummies, so both are removed.
df = df.drop(columns=['Fare', 'Cabin'])

In [None]:
# Preview df after the Fare and Cabin columns were dropped.
df.head()

In [None]:
#Great! Our data is ready for our model!

#Building a Logistic Regression model

In [None]:
#Train test split

In [None]:
# Feature matrix: every column except the target.
# PassengerId is excluded too: it is a row identifier rather than a passenger
# attribute, and its large unscaled values only add noise to the fit.
# errors='ignore' keeps the cell working if the column is already absent.
X = df.drop(columns='Survived').drop(columns='PassengerId', errors='ignore')

In [None]:
# Preview the first rows of the feature matrix.
X.head()

In [None]:
# Target vector: the Survived column, followed by a preview of its first five values.
Y = df.loc[:, 'Survived']
Y.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=101,
)

In [None]:
# Training and Predicting

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit a logistic-regression classifier on the training rows.
# The iteration cap is raised from the default of 100: on unscaled features
# the solver may stop before converging and emit a ConvergenceWarning.
logmodel = LogisticRegression(max_iter=1000)
logmodel.fit(X_train, y_train)

In [None]:
# Predict the class labels for the held-out test rows.
predictions = logmodel.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix of the test predictions (rows: true class, columns:
# predicted class). It is stored under its own name rather than `accuracy`,
# which a later cell binds to the accuracy score.
conf_matrix = confusion_matrix(y_test, predictions)
conf_matrix

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
accuracy

In [None]:
# Show only the first few predicted labels instead of the whole array.
predictions[:10]

In [None]:
#Let's move on to evaluate our model!

#Evaluation

#We can check precision,recall,f1-score using classification report!

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision, recall and f1-score on the test set. print() is needed
# here because the report is a pre-formatted multi-line string.
print(classification_report(y_true=y_test, y_pred=predictions))

In [None]:
#Not so bad! You might want to explore other feature engineering and the other data file (test.csv). Some suggestions for feature engineering:

#Try grabbing the Title (Dr.,Mr.,Mrs,etc..) from the name as a feature
#Is there any info you can get from the ticket?