# Problem Statement

- It is our job to predict if a passenger survived the sinking of the Titanic or not.
- For each passenger in the test set, we must predict a 0 or 1 value for the `Survived` variable.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Required Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Reading Datasets

In [None]:
# Load the two competition CSV files into DataFrames: `train` (which carries the
# `Survived` target used later) and `test` (the rows to predict for the submission).
train=pd.read_csv("/kaggle/input/titanic/train.csv")
test=pd.read_csv("/kaggle/input/titanic/test.csv")

**Short Description of Features**

- Survival    : 1 if the person survived, else 0
- PassengerId : Unique Id of a passenger.
- pclass      : Ticket class
- sex         : Sex
- Age         : Age in years
- sibsp       : # of siblings / spouses aboard the Titanic
- parch       : # of parents / children aboard the Titanic
- ticket      : Ticket number
- fare        : Passenger fare
- cabin       : Cabin number
- embarked    : Port of Embarkation

In [None]:
# Keep the passenger ids aside for future reference: the PassengerId column is
# deleted from both frames further down, and the test ids are needed again when
# the submission file is written.
train_id=train["PassengerId"]
test_id=test["PassengerId"]

## Analysing Both Train and Test Datasets

## Training dataset

In [None]:
train.describe()

In [None]:
train.head()

### Analysing the features that take a small set of discrete values (classes)

In [None]:
# Frequency table (NaN included) for each class-like training column,
# each followed by a separator line.
for feature in ("Pclass", "Embarked", "SibSp", "Parch"):
    print(train[feature].value_counts(dropna=False))
    print("--"*50)

In [None]:
train.isnull().sum().sort_values(ascending = False)

In [None]:
# Correlate the numeric columns only: text columns such as Name or Sex cannot be
# correlated, and DataFrame.corr() raises on them in pandas >= 2.0.
mat = train.select_dtypes(include="number").corr()
fig, ax = plt.subplots(figsize=(10, 10))
# Draw on the axes created above explicitly rather than relying on the
# "current axes" state.
sns.heatmap(mat, annot=True, annot_kws={'size': 12}, ax=ax)

- From the heatmap we can easily see how strongly each feature correlates with `Survived`.

## Testing Dataset

In [None]:
test.describe()

In [None]:
test.head()

In [None]:
# Frequency table (NaN included) for each class-like column of the TEST set.
# The original cell was a copy of the training cell and printed `train` again.
for feature in ("Pclass", "Embarked", "SibSp", "Parch"):
    print(test[feature].value_counts(dropna=False))
    print("--"*50)

In [None]:
test.isnull().sum().sort_values(ascending=False)

# Feature Engineering

# Dealing with missing values

This process includes :
- More than 50% of the values in the `Cabin` column are missing, so we drop that column.
- Filling the missing values of the other columns.
- Dropping the columns which are not required for the predictions.

In [None]:
del train["Cabin"]


In [None]:
# Replace missing ages with the mean age. Assign the result back instead of
# calling fillna(inplace=True) on the selected column: the chained in-place form
# is deprecated and does not update `train` under pandas copy-on-write.
train["Age"] = train["Age"].fillna(train["Age"].mean())

In [None]:
train.describe()

In [None]:
train.isnull().sum().sort_values(ascending=False)

In [None]:
# Drop the rows whose Embarked value is missing (the training set has 2 of them).
# Naming the column keeps dropna from discarding rows because of a gap in some
# other column.
train.dropna(subset=["Embarked"], inplace=True)

## Testing Dataset

In [None]:
del test["Cabin"]

In [None]:
# Replace missing ages with the mean age. Assign the result back instead of
# calling fillna(inplace=True) on the selected column: the chained in-place form
# is deprecated and does not update `test` under pandas copy-on-write.
test["Age"] = test["Age"].fillna(test["Age"].mean())

In [None]:
test.describe()

In [None]:
test.isnull().sum().sort_values(ascending=False)

In [None]:
# Test rows cannot be dropped (every passenger needs a prediction), so the
# missing Fare is filled with the median fare. Only the Fare column is touched;
# a frame-wide fillna would write the fare median into any other column that
# still had a gap.
test["Fare"] = test["Fare"].fillna(test["Fare"].median())
test.isnull().sum().sort_values(ascending=False)

In [None]:
# Age distribution of survivors vs. non-survivors, one panel per sex.
# sns.histplot replaces the deprecated sns.distplot(..., kde=False); the bin
# counts, labels and titles are the ones the original calls used.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
panels = [("Female", "female"), ("Male", "male")]
outcomes = [(1, "survived", 18), (0, "not survived", 40)]  # (Survived value, label, bins)
for ax, (title, sex) in zip(axes, panels):
    passengers = train[train["Sex"] == sex]
    for flag, label, bins in outcomes:
        ages = passengers.loc[passengers["Survived"] == flag, "Age"].dropna()
        # alpha=0.4 keeps the overlapping histograms see-through
        sns.histplot(ages, bins=bins, label=label, alpha=0.4, ax=ax)
    ax.legend()
    ax.set_title(title)

You can see that men have a high probability of survival when they are between 18 and 30 years old, which is also partly true for women. For women the survival chances are higher between 14 and 40. For men the probability of survival is very low between the ages of 5 and 18, but that isn’t true for women. Another thing to note is that infants also have a slightly higher probability of survival. Keep in mind that missing ages were filled with the mean age earlier, so the bin containing the mean age is inflated in every histogram.

## Training dataset

In [None]:
Y=train["Survived"]

In [None]:
del train["PassengerId"]
del test["PassengerId"]

In [None]:
del train["Survived"]

In [None]:
train.head()

In [None]:
train.shape

In [None]:
test.shape

In [None]:
test.head()

In [None]:
# Stack the training rows on top of the test rows so that the encoding and
# scaling below are applied to both sets at once; the two parts are separated
# again by row position further down.
final=pd.concat([train,test],axis =0)

In [None]:
final.shape

In [None]:
final.head()

In [None]:
def One_hot_encoding(columns, df=None):
    """One-hot encode the given categorical columns of a DataFrame.

    Each listed column is replaced by indicator columns named
    ``<column>_<value>``. The first level of every column is dropped
    (``drop_first=True``). The prefix keeps labels unique: without it, Pclass
    and Parch both produce columns called ``2`` and ``3``, and string and
    integer labels end up mixed in one frame.

    Parameters
    ----------
    columns : iterable of str
        Names of the columns to encode.
    df : pandas.DataFrame, optional
        Frame to encode. Defaults to the notebook-level ``final`` frame, so the
        existing call ``One_hot_encoding(columns)`` keeps working.

    Returns
    -------
    pandas.DataFrame
        A new frame: the untouched columns first, then the indicator columns in
        the order given by ``columns``. The input frame is not modified, so the
        cell calling this function can be re-run safely.

    Raises
    ------
    KeyError
        If any name in ``columns`` is not a column of the frame.
    """
    if df is None:
        df = final  # combined train + test frame built earlier in the notebook
    columns = list(columns)
    if not columns:
        # Nothing to encode: return an independent copy rather than the input itself.
        return df.copy()
    return pd.get_dummies(df, columns=columns, drop_first=True)

In [None]:
columns=["Sex","Embarked","Pclass","Parch"]

In [None]:
df_final = One_hot_encoding(columns)

In [None]:
df_final.head()

In [None]:
# Remove the two free-text columns in a single call.
df_final.drop(columns=["Name", "Ticket"], inplace=True)


In [None]:
df_final.head()

## Normalising datasets

In [None]:
from sklearn import preprocessing

# Standardise every column with scikit-learn's StandardScaler and rebuild a
# DataFrame carrying the original column labels.
# NOTE(review): the scaler is fitted on the stacked train + test rows.
scaler = preprocessing.StandardScaler()
names = df_final.columns  # captured before df_final is rebound below
df_final = pd.DataFrame(scaler.fit_transform(df_final), columns=names)

### assigning unique names to the columns

In [None]:
# Relabel the columns 1, 2, ..., n so that every column has a unique name.
cols = list(range(1, len(df_final.columns) + 1))
df_final.columns = cols

In [None]:
df_final.head()

## Splitting the combined dataset back into train and test

In [None]:
# The training rows were stacked first when the two sets were concatenated, so
# the first len(train) rows belong to the training set and the rest to the test
# set. Using len(train) instead of a hard-coded 889 keeps the split correct if
# the number of dropped training rows ever changes.
n_train = len(train)
df_train = df_final.iloc[:n_train, :]
df_test = df_final.iloc[n_train:, :]

In [None]:
X=df_train

In [None]:
df_test.shape

# Modeling

## Applying Random Forest

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the labelled data for validation. A fixed random_state makes
# the split, and therefore the scores printed below, reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fix the seed: the forest uses randomness when it is fitted, so without a
# random_state the fitted model and its scores change from run to run.
model_rforest = RandomForestClassifier(random_state=42)

In [None]:
model_rforest.fit(X_train,Y_train)

In [None]:
# For a classifier, .score() returns the mean accuracy, not R-squared, so the
# labels now say what is actually being reported.
print("Accuracy for Training Set: {:.3f}".format(model_rforest.score(X_train,Y_train)))
print("Accuracy for Test Set: {:.3f}".format(model_rforest.score(X_test,Y_test)))

In [None]:
#predictions_01 = model_rforest.predict(df_test)

In [None]:
#output_01 = pd.DataFrame({'PassengerId': test_id, 'Survived': predictions_01})
#output_01.to_csv('my_submission_05.csv', index=False)
#print("Your submission was successfully saved!")

## Applying Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Fix the seed so the fitted tree, and the submission built from it, is the
# same on every run.
model_dec = DecisionTreeClassifier(random_state=42)
model_dec.fit(X_train, Y_train)

In [None]:
# For a classifier, .score() returns the mean accuracy, not R-squared, so the
# labels now say what is actually being reported.
print("Accuracy for Training Set: {:.3f}".format(model_dec.score(X_train,Y_train)))
print("Accuracy for Test Set: {:.3f}".format(model_dec.score(X_test,Y_test)))

In [None]:
predictions_02 = model_dec.predict(df_test)

# Submission

In [None]:
# Pair each saved test passenger id with its predicted label and write the
# result to the submission CSV (without the index column).
submission_columns = {'PassengerId': test_id, 'Survived': predictions_02}
output_02 = pd.DataFrame(submission_columns)
output_02.to_csv('my_submission_06.csv', index=False)
print("Your submission was successfully saved!")

# Do UPVOTE if you like it :)

## Your Suggestions are Welcome