In [None]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Load the raw training set from the working directory.
train_data = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# Preview the first five rows.
train_data.head(n=5)

In [None]:
# (rows, columns) of the loaded frame.
train_data.shape

In [None]:
# Column dtypes as inferred by read_csv.
train_data.dtypes

In [None]:
# Share of missing values per column, expressed as a percentage.
train_data.isna().mean().mul(100)

In [None]:
# Summarise the target: absolute count and percentage per Survived value.
survival_count = train_data["Survived"].value_counts()
# Multiply before dividing so the floats match count * 100 / n exactly.
survival_rate = survival_count.mul(100).div(train_data.shape[0])

survival_data = pd.concat(
    [survival_count, survival_rate],
    axis='columns',
).reset_index()
survival_data.columns = ['Survived', 'Count', 'Percentage']
survival_data

In [None]:
# Bar chart of the percentage of passengers per survival status.
# The tick labels are derived from the 'Survived' column itself rather than
# hard-coded in row order: value_counts() sorts by frequency, so a fixed
# ['Not-Survived', 'Survived'] list would silently mislabel the bars if the
# class order ever flipped.
status_labels = survival_data['Survived'].map({0: 'Not-Survived', 1: 'Survived'})

plt.figure(figsize=(5,3))
sns.barplot(data=survival_data, x=status_labels, y='Percentage',
            width=0.35)
plt.ylabel('Percentage Survived')
plt.xlabel('Survival Status')
plt.title('Percentage Survival of the Titanic Passengers')

In [None]:
def survival_acc_to_grp(df, colname):
    """Draw a grouped bar chart of passenger counts per `colname` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Survived' column and the column named `colname`.
    colname : str
        Grouping column shown on the x axis; bars are split by 'Survived'.

    Returns
    -------
    None
        The chart is drawn on a new matplotlib figure.
    """
    plt.figure(figsize=(5,3))
    # Name the count column explicitly so the plot does not depend on the
    # pandas version (older releases call it 0, newer ones 'count').
    group_counts = (
        df[['Survived', colname]]
        .value_counts()
        .reset_index(name='count')
    )
    sns.barplot(data=group_counts, x=colname, y='count', hue='Survived')
    # Bars count all passengers in each group, survivors and non-survivors.
    plt.ylabel('Number of Passengers')
    plt.title(f'Survival According to {colname}')

In [None]:
# Survival counts by passenger class. `df` was never defined in this
# notebook; the loaded frame is `train_data`.
survival_acc_to_grp(train_data, 'Pclass')

In [None]:
# Survival counts by sex. `df` was never defined in this notebook; the
# loaded frame is `train_data`.
survival_acc_to_grp(train_data, 'Sex')


In [None]:
# Survival counts by port of embarkation. `df` was never defined in this
# notebook; the loaded frame is `train_data`.
survival_acc_to_grp(train_data, 'Embarked')

In [None]:
def vars_acc_to_surv(df, colname):
    """Draw kernel-density curves of `colname`, one per 'Survived' value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Survived' column and the column named `colname`.
    colname : str
        Numeric column whose distribution is plotted on the x axis.

    Returns
    -------
    None
        The chart is drawn on a new matplotlib figure.
    """
    plt.figure(figsize=(5,3))
    sns.kdeplot(data=df, x=colname, hue='Survived')
    # A space separates the column name from the rest of the title.
    plt.title(f'{colname} distribution according to the survival')

In [None]:
# Age distribution split by survival. `df` was never defined in this
# notebook; the loaded frame is `train_data`.
vars_acc_to_surv(train_data, 'Age')

In [None]:
# Fare distribution split by survival. `df` was never defined in this
# notebook; the loaded frame is `train_data`.
vars_acc_to_surv(train_data, 'Fare')

In [None]:
def count_acc_to_surv(df, colname):
    """Draw a grouped bar chart of row counts per (`colname`, 'Survived') pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Survived' column and the column named `colname`.
    colname : str
        Discrete column shown on the x axis; bars are split by 'Survived'.

    Returns
    -------
    None
        The chart is drawn on a new matplotlib figure.
    """
    plt.figure(figsize=(5,3))
    # Group by the requested column. The previous version always grouped by
    # 'SibSp', so every call produced the SibSp chart under a different title.
    # size() counts rows per group and needs no helper column.
    group_counts = (
        df.groupby([colname, 'Survived'])
        .size()
        .reset_index(name='count')
    )
    sns.barplot(data=group_counts, x=colname, y='count', hue='Survived')
    plt.title(f'{colname} according to survival')
    plt.xlabel(f'{colname}')
    plt.ylabel('count')
    

In [None]:
# Passenger counts per SibSp value, split by survival. `df` was never
# defined in this notebook; the loaded frame is `train_data`.
count_acc_to_surv(train_data, 'SibSp')


In [None]:
# Passenger counts per Parch value, split by survival. `df` was never
# defined in this notebook; the loaded frame is `train_data`.
count_acc_to_surv(train_data, 'Parch')

In [None]:
# Reshape the numeric columns to long format (one row per variable/value
# pair) so they can share a single boxen plot. `df` was never defined in
# this notebook; the loaded frame is `train_data`.
cont_vars = pd.melt(train_data, value_vars=['Age', 'Fare', 'SibSp', 'Parch'])

plt.figure(figsize=(5,3))
sns.boxenplot(data=cont_vars, x='variable', y='value')
plt.xlabel('Continuous Variables')
plt.ylabel('Values')
plt.title('Comparison of the continuous variables')


In [None]:
# Remove the columns that are not used as model features.
# Note: this rebinds `train_data`, so re-running the cell on its own raises
# a KeyError because the columns are already gone.
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']
train_data = train_data.drop(columns=columns_to_drop)
train_data.head()

In [None]:
# Discard rows with a missing Fare.
# NOTE(review): only Fare is filtered; if other feature columns (e.g.
# Embarked) still show nulls in the check below, they reach the encoder
# as-is. Confirm this is intended.
train_data = train_data.dropna(subset=['Fare'])

In [None]:
# Impute missing ages with the median age of the whole frame.
age_median = train_data['Age'].median()
train_data['Age'] = train_data['Age'].fillna(age_median)

In [None]:
# Remaining missing values per column after cleaning.
train_data.isna().sum()

In [None]:
# Balance the classes by upsampling the survivors (with replacement) to the
# size of the non-survivor group, then shuffle the combined frame.
# NOTE(review): this resampling runs before the train/test split further
# down, so copies of the same survivor row can land in both splits and
# inflate the test metrics. Consider splitting first and resampling only
# the training part.
is_survivor = train_data['Survived'] == 1
is_non_survivor = train_data['Survived'] == 0
train_data_survived = train_data[is_survivor]
train_data_not_survived = train_data[is_non_survivor]

n_non_survivors = train_data_not_survived.shape[0]
train_data_survived_resampled = train_data_survived.sample(
    n=n_non_survivors,
    replace=True,
    random_state=101,
)
train_data_balanced = pd.concat(
    [train_data_not_survived, train_data_survived_resampled]
).sample(frac=1, random_state=26)
train_data_balanced

In [None]:
# Check the class counts after resampling.
train_data_balanced['Survived'].value_counts()

In [None]:
# Separate the target from the feature columns.
y = train_data_balanced['Survived']
x = train_data_balanced.drop(columns=['Survived'])

In [None]:
# Hold out 20% of the balanced data for evaluation; the seed is fixed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1055,
)

# Report the shape of every partition.
print(f'''
      X_train: {X_train.shape}
      X_test: {X_test.shape}
      y_train: {y_train.shape}
      y_test: {y_test.shape}''')

In [None]:
# Feature groups; each group is routed to its own preprocessing branch.
cont_cols = ['Age', 'Fare']
disc_cols = ['SibSp', 'Parch']
cat_cols = ['Pclass', 'Sex', 'Embarked']

# Continuous features are standardised.
continuous_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# Discrete count features are rescaled with MinMaxScaler.
discrete_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler())])

# Categorical features are one-hot encoded. The keyword is `drop` (the
# original `drops` raised a TypeError); 'first' removes one level per feature
# to avoid redundant, perfectly collinear columns.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='first'))])

# The comma after the 'disc' entry was missing, which made Python try to
# call the tuple and fail with a TypeError.
preprocessor = ColumnTransformer(
    transformers=[
        ('cont', continuous_transformer, cont_cols),
        ('disc', discrete_transformer, disc_cols),
        ('cat', categorical_transformer, cat_cols)])

# Preprocessing and classifier live in one pipeline so the fitted
# transformers are applied automatically at predict time and are saved
# together with the model.
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', LogisticRegression())])
model.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out split using the fitted pipeline.
y_pred = model.predict(X_test)
y_pred

In [None]:
# Text summary of the classification metrics on the held-out split.
cr = classification_report(y_true=y_test, y_pred=y_pred)
print(cr)

In [None]:
# confusion_matrix returns rows = true labels and columns = predicted labels.
# heatmap draws rows along the y axis and columns along the x axis, so the
# x axis is the prediction and the y axis the ground truth (the original
# labels were swapped).
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5,3))
# fmt='d' prints the integer counts as-is instead of the default compact
# float format, which switches to scientific notation for larger counts.
sns.heatmap(cm, annot=True, fmt='d')
plt.title('Confusion matrix of the classification')
plt.xlabel('Predicted Value')
plt.ylabel('True Value')

In [None]:
# Persist the fitted pipeline (preprocessing + classifier) to disk.
with open('./model_logistic.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)