In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import tree
import plotly.express as px #Import plotly express for data visualization
import matplotlib.pyplot as plt     # Import the Matplotlib library for data visualization
import seaborn as sns   # Import the Seaborn library for statistical data visualization
from scipy.stats import loguniform
from sklearn.pipeline import Pipeline #Import pipeline to automate functions
from sklearn.preprocessing import StandardScaler, MinMaxScaler #Import scaling functions
from sklearn.model_selection import train_test_split, GridSearchCV  # Import the train_test_split function for splitting data
from sklearn.metrics import  accuracy_score, confusion_matrix, classification_report, roc_auc_score # Import functions for evaluating models
from sklearn.linear_model import LogisticRegression     # Import the LogisticRegression class for logistic regression
from sklearn.tree import DecisionTreeClassifier     # Import the DecisionTreeClassifier class for decision tree classification
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the Titanic passenger CSV into `df` and preview the first five rows
titanic_csv = "/kaggle/input/titanic-dataset/Titanic-Dataset.csv"
df = pd.read_csv(titanic_csv)
df.head(n=5)

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# Per-column dtype and non-null count
df.info()

In [None]:
# Column labels of the frame
df.columns

In [None]:
# Summary statistics; include="all" also summarises the non-numeric columns
df.describe(include="all")

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
#visualising the data
def count_plot(feature, data=None):
    """Draw a seaborn count plot of one column and show it.

    Args:
        feature: Name of the column whose values are counted on the x axis.
        data: DataFrame to plot from. When omitted, the notebook-level ``df``
            is used, so existing one-argument calls behave as before.
    """
    frame = df if data is None else data
    sns.countplot(x=feature, data=frame)
    plt.show()
    print("\n")  # blank lines to separate consecutive plots in the output

In [None]:
# Count plot for each low-cardinality column
columns = [
    'Survived',
    'Pclass',
    'Sex',
    'SibSp',
    'Embarked',
    'Parch',
]
for column_name in columns:
    count_plot(column_name)

In [None]:
# Histogram of passenger ages
ages = df["Age"]
ages.plot(title="Age", kind='hist')

In [None]:
# Show pie chart of survival rate
# value_counts() orders rows by frequency, not by code, so the slice labels are
# derived from the 'Survived' codes instead of being attached by position.
survived_counts = (
    df['Survived']
    .value_counts()
    .rename_axis('Survived')
    .reset_index(name='Count')
)
survived_counts['Label'] = survived_counts['Survived'].map({0: 'No', 1: 'Yes'})
fig = px.pie(
    survived_counts,
    values='Count',
    names='Label',
    color='Label',
    title='Survived',
    labels={'Count': 'Count', 'Label': 'Survived'},
)
fig.update_traces(textposition='inside',  textinfo='percent+label+value')
fig.update_layout(uniformtext_minsize=14, uniformtext_mode='hide')
fig.show()

In [None]:
# Grouped bar chart: passenger counts per sex, split by survival outcome
fig1 = px.histogram(
    data_frame=df,
    x='Sex',
    color='Survived',
    barmode='group',
    color_discrete_map={0: "red", 1: "blue"},
)
fig1.update_layout(title='Sex: Survived vs Dead')
fig1.show()

In [None]:
# Grouped bar chart: passenger counts per ticket class, split by survival outcome
fig2 = px.histogram(
    data_frame=df,
    x='Pclass',
    color='Survived',
    barmode='group',
    title='Pclass: Survived vs Dead',
    labels={'Pclass': 'Pclass'},
    color_discrete_map={0: 'red', 1: 'blue'},
)
# The layout title set here is the one that ends up displayed
fig2.update_layout(title='PClass: Survived vs Dead')
fig2.show()

In [None]:
# Data preprocessing: preview the frame before any columns are dropped or encoded
df.head()

In [None]:
# Remove the identifier / free-text columns that are not used as features.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second run no longer raises KeyError.
df = df.drop(columns=['PassengerId', 'Name', 'Cabin', 'Ticket'], errors='ignore')

In [None]:
# Preview the frame again after the column drop
df.head()

In [None]:
# Fill missing values in age column by imputing the mean.
# Assign the result back: calling fillna(inplace=True) on `df['Age']` acts on a
# temporary Series and does not update `df` when pandas Copy-on-Write is active.
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [None]:
# Fill missing values in embarked column by imputing the mode.
# Assign the result back: calling fillna(inplace=True) on `df["Embarked"]` acts on
# a temporary Series and does not update `df` when pandas Copy-on-Write is active.
df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])

In [None]:
# Re-check dtypes and non-null counts after the imputation cells
df.info()

In [None]:
# Encode the two string columns as small integers using explicit lookup tables
sex_codes = {'female': 1, 'male': 2}
embarked_codes = {'S': 1, 'C': 2, 'Q': 3}

df['Sex'] = df['Sex'].map(sex_codes).astype(int)
df['Embarked'] = df['Embarked'].map(embarked_codes).astype(int)

In [None]:
# Feature exploration: pairwise relationships between all columns,
# coloured by survival outcome
sns.pairplot(data=df, hue='Survived')

In [None]:
# Annotated heatmap of the pairwise correlations between all columns
plt.figure(figsize=(20, 18))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, linewidths=.5, annot=True)

In [None]:
# Absolute correlation of every column with the target, strongest first
target_corr = df.corr()['Survived'].abs().sort_values(ascending=False)
# Remove the target's self-correlation by label rather than by position
feature_corr = target_corr.drop('Survived')
# Create a bar chart to visualize the correlations
plt.figure(figsize=(10, 6))
sns.barplot(x=feature_corr.index, y=feature_corr.values)
plt.xticks(rotation=45, ha='right')
plt.xlabel('Features')
# The target here is 'Survived'; the values plotted are absolute correlations
plt.ylabel('Absolute correlation with Survived')
plt.title('Correlation between Survived and Features')
plt.tight_layout()
plt.show()

In [None]:
# Separate the target from the predictors, hold out 20% for testing, then scale
y = df["Survived"]
X = df.drop(columns="Survived")

print(f"'X' shape: {X.shape}")
print(f"'y' shape: {y.shape}")

scaling_steps = [
    ('min_max_scaler', MinMaxScaler()),
    ('std_scaler', StandardScaler()),
]
pipeline = Pipeline(scaling_steps)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=1,
)
# The scalers are fitted on the training split only and then applied to the test split
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

In [None]:
#scoring function 
def _report_split(clf, X, y, y_prob, split_name):
    """Print the metrics and draw the confusion matrix for one data split.

    Args:
        clf: Fitted classifier exposing ``predict``.
        X: Feature matrix of the split.
        y: True labels of the split.
        y_prob: Positive-class scores for the split, or ``None`` to skip ROC AUC.
        split_name: Heading prefix, e.g. ``"Train"`` or ``"Test"``.
    """
    pred = clf.predict(X)
    # Computed once and reused for both the printed and the plotted matrix
    cm = confusion_matrix(y, pred)
    clf_report = pd.DataFrame(classification_report(y, pred, output_dict=True, zero_division=0))

    print(f"{split_name} Result:\n")
    print(f"Accuracy Score: {accuracy_score(y, pred) * 100:.2f}%")
    # ROC AUC needs both classes in y; skip it otherwise instead of raising
    if y_prob is not None and np.unique(y).size > 1:
        print(f"ROC AUC Score: {roc_auc_score(y, y_prob):.4f}")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")
    print(f"Confusion Matrix: \n {cm}\n")

    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title('Confusion Matrix')
    plt.show()


def print_score(clf, X_train, y_train, X_test, y_test, y_train_prob, y_test_prob, train=True):
    """Report accuracy, ROC AUC, classification report and confusion matrix.

    Args:
        clf: Fitted classifier exposing ``predict``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.
        y_train_prob, y_test_prob: Positive-class scores for each split, used
            for the ROC AUC line; pass ``None`` to omit that line.
        train: When truthy, report on the training split; otherwise report on
            the test split.
    """
    if train:
        _report_split(clf, X_train, y_train, y_train_prob, "Train")
    else:
        _report_split(clf, X_test, y_test, y_test_prob, "Test")

In [None]:
# Logistic regression: grid-search the solver and the regularisation strength C
param_grid = {
    'solver': ['newton-cg', 'lbfgs', 'liblinear'],
    'penalty': ['l2'],  # other LogisticRegression options: 'l1', 'elasticnet', none
    # random_state pins the sampled C values so the grid is identical on every run
    'C': loguniform.rvs(1e-5, 100, size=10, random_state=1),
}

grid = GridSearchCV(LogisticRegression(), param_grid, refit=True, verbose=1, cv=5)
model = grid.fit(X_train, y_train).best_estimator_

best_params = grid.best_params_
print(f"Best params: {best_params}")

# Probability of the positive class (column 1) for each split
y_train_prob = model.predict_proba(X_train)[:, 1]
y_test_prob = model.predict_proba(X_test)[:, 1]

print_score(model, X_train, y_train, X_test, y_test, y_train_prob, y_test_prob, train=True)
print_score(model, X_train, y_train, X_test, y_test, y_train_prob, y_test_prob, train=False)


In [None]:
# Decision tree: grid-search depth, split criterion and leaf/feature limits
# Bound max_features by the real column count: values above it are either
# rejected by scikit-learn or merely repeat the "all features" setting.
n_features = X_train.shape[1]
param_grid = {"max_depth": [1, 2, 3, None],
              "max_features": list(range(1, n_features + 1)),
              "min_samples_leaf": list(range(1, 10)),
              "criterion": ["gini", "entropy"]}

# random_state makes the per-split feature sub-sampling repeatable across runs
grid = GridSearchCV(DecisionTreeClassifier(random_state=1), param_grid, refit=True, verbose=1, cv=5)
model = grid.fit(X_train, y_train).best_estimator_

best_params = grid.best_params_
print(f"Best params: {best_params}")

# Probability of the positive class (column 1) for each split
y_train_prob = model.predict_proba(X_train)[:, 1]
y_test_prob = model.predict_proba(X_test)[:, 1]

print_score(model, X_train, y_train, X_test, y_test, y_train_prob, y_test_prob, train=True)
print_score(model, X_train, y_train, X_test, y_test, y_train_prob, y_test_prob, train=False)
plt.figure(figsize=(20, 18))
# Take the names from the feature frame itself rather than assuming the target
# is the first column of `df`
tree.plot_tree(model, feature_names=X.columns.tolist(), filled=True, class_names=["Died", "Survived"])
plt.show()