<a href="https://colab.research.google.com/github/Nithyon/FC24LINEARREGRESSION/blob/main/Titanic_Survival_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing Libraries

---


In [1]:
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

sns.set_style('darkgrid')

In [2]:
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

Importing dataset

In [3]:
df = sns.load_dataset('titanic')

Exploring the data

In [4]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [5]:
df.isna().sum()

Unnamed: 0,0
survived,0
pclass,0
sex,0
age,177
sibsp,0
parch,0
fare,0
embarked,2
class,0
who,0


In [6]:
plt.figure(figsize=(10, 5));
survived = df.survived.value_counts()
survived.index = ['survived', "didn't survive"]
sns.barplot(survived.index, survived);

TypeError: barplot() takes from 0 to 1 positional arguments but 2 were given

<Figure size 1000x500 with 0 Axes>

In [None]:
plt.figure(figsize=(10, 5))
df.embarked.value_counts().plot.barh();
plt.title('Embarked');

In [None]:
plt.figure(figsize=(10, 5))
df['class'].value_counts().plot.barh();
plt.title('Class');

In [None]:
plt.figure(figsize=(10, 5))
sns.countplot(x='alive', data=df, hue='sex');

In [None]:
plt.figure(figsize=(10, 5))
sns.histplot(x='age', data=df, bins=20);
plt.title('Age');

In [None]:
plt.figure(figsize=(10, 5))
df.age.value_counts().nlargest(11).plot.barh();
plt.xlabel('Count of Age')
plt.ylabel('Age');

In [None]:
df.corr().iloc[0, 1]

In [None]:
plt.figure(figsize=(15, 10))
sns.heatmap(df.corr(), annot=True);

In [None]:
df.head()

In [None]:
plt.figure(figsize=(10, 5))
sns.countplot(x='alone', data=df, hue='sex');

Improving the data

In [None]:
df['embarked'].fillna(value=df['embarked'].mode()[0], inplace=True)
df['embark_town'].fillna(value=df['embark_town'].mode()[0], inplace=True)

In [None]:
def get_label_class(val):
    if val == 'First':
        val = 0
    elif val == 'Second':
        val = 1
    elif val == 'Third':
        val = 2
    return val

def get_label_embarked(val):
    if val == 'S':
        val = 0
    elif val == 'C':
        val = 1
    elif val == 'Q':
        val = 2
    return val

def get_label_embarktown(val):
    if val == 'Southampton':
        val = 0
    elif val == 'Cherbourg':
        val = 1
    elif val == 'Queenstown':
        val = 2
    return val

In [None]:
df['sex'] = np.where(df['sex'] == 'male', 1, 0)

df['class'] = df['class'].map(get_label_class)

df.drop(['who'], axis=1, inplace=True)

df['embarked'] = df['embarked'].map(get_label_embarked)

df['adult_male'] = np.where(df['adult_male'] == True, 1, 0)

df['embark_town'] = df['embark_town'].map(get_label_embarktown)

df.drop(['deck'], axis=1, inplace=True)

df.drop(['alive'], axis=1, inplace=True)

df['alone'] = np.where(df['alone'] == True, 1, 0)

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
plt.figure(figsize=(10, 5))
sns.boxplot(x='age', data=df);
plt.title('Age');

In [None]:
upper_value = df['age'].mean() + (3 * df.age.std())
lower_value = df['age'].mean() - (3 * df.age.std())

print(upper_value)
print(lower_value)

In [None]:
df.loc[df.age > upper_value, 'age'] = upper_value

In [None]:
sns.boxplot(x='age', data=df);

In [None]:
plt.figure(figsize=(15, 10))
sns.heatmap(df.corr(), annot=True);

In [None]:
df.drop(['adult_male', 'embarked'], axis=1, inplace=True)

In [None]:
df.head()

In [None]:
plt.figure(figsize=(10, 5))
sns.distplot(df.fare);

In [None]:
plt.figure(figsize=(10, 5))
sns.boxplot(x='fare', data=df);
plt.title('Fare')

In [None]:
IQR = df.fare.quantile(0.75) - df.fare.quantile(0.25)

lower_value = df.fare.quantile(0.25) - (IQR * 1.5)
upper_value =  df.fare.quantile(0.75) + (IQR * 1.5)

# lower_value_extreme = df.fare.quantile
print(lower_value)
print(upper_value)

In [None]:
df.loc[df.fare > upper_value, 'fare'] = upper_value

In [None]:
plt.figure(figsize=(10, 5))
sns.distplot(df.fare);

In [None]:
plt.figure(figsize=(10, 5))
sns.boxplot(x='fare', data=df);

In [None]:
df.age = df.age.fillna(value=df.age.mean())

In [None]:
df.isna().sum()

In [None]:
X = df.loc[:, 'pclass':]
y = df.loc[:, 'survived']

In [None]:
X.head()

In [None]:
y.head()

In [None]:
X = X.values
y = y.values

Using Standard Scalar to normalise our data

In [None]:
sc = StandardScaler()
X = sc.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_train.shape

In [None]:
X_test.shape

Using Keras to train our model, As Keras uses TensorFlow in backend, we have imported Sequential and Dense

In [None]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(units=128, activation='relu', input_dim=9))
model.add(tf.keras.layers.Dropout(0.25))
model.add(tf.keras.layers.Dense(units=64, activation='relu'))
model.add(tf.keras.layers.Dropout(0.25))
model.add(tf.keras.layers.Dense(units=32, activation='relu'))
model.add(tf.keras.layers.Dropout(0.25))
model.add(tf.keras.layers.Dense(units=1, activation='sigmoid')) #adding layers to our model

In [None]:
model.summary()

We have added layers to our model, now its time to compile it


In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
callback = tf.keras.callbacks.EarlyStopping(
    patience=5,
    monitor='val_loss',
)

Now we will train our data, epoch=100 so that the model dosen't over fit


In [None]:
history = model.fit(X_train, y_train, batch_size=32, callbacks=[callback], validation_split=0.2, epochs=100, verbose=1)

In [None]:
plt.figure(figsize=(10, 8));
plt.plot(history.history['loss'], label='train_loss');
plt.plot(history.history['val_loss'], label='val_loss');
plt.title('Loss Vs. Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(['Training Loss', 'Val Loss']);

In [None]:
plt.figure(figsize=(10, 10));
plt.plot(history.history['accuracy'], label='train_acc');
plt.plot(history.history['val_accuracy'], label='val_acc');
plt.title('Accuracy Vs. Epochs')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(['Training Accuracy', 'Val Accuracy']);

Getting predictions from our model

In [None]:
pred = model.predict(X_test)

For the model to not predict values greater than 0.5 as 1

In [None]:
predic = []
for i in pred:
    if i > 0.5:
        predic.append(1)
    else:
        predic.append(0)

In [None]:
predic = np.array(predic)

In [None]:
print("----------------------------------------------------Accuracy------------------------------------------------------")
print(accuracy_score(y_test, predic))
print()

print("---------------------------------------------------Classification Report---------------------------------------------")
print(classification_report(y_test, predic))
print()

print("-------------------------------------------------Confustion Metrics----------------------------------------------------")
plt.figure(figsize=(10, 10));
sns.heatmap(confusion_matrix(y_test, predic), annot=True);

In [None]:
x = pd.DataFrame({'Actual': y_test, 'Predicted': predic}, columns=['Actual', 'Predicted'])

In [None]:
x.head(50)

In [None]:
len(df.columns)