## All libraries we need
[link 1](https://eshitagoel.medium.com/eda-on-titanic-machine-learning-from-disaster-6b518bb97e17)
[link 2](https://medium.com/geekculture/applying-7-classification-algorithms-on-the-titanic-dataset-278ef222b53c)
[link 3](https://www.kaggle.com/code/tarekmuhammed/classification-prj-classify-survived-or-not)

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder,OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 

In [None]:

# Read the Titanic passenger records from the CSV file into a DataFrame.
path_to_data = 'Titanic-Dataset.csv'
df = pd.read_csv(filepath_or_buffer=path_to_data)
# Quick look at which columns the file provides.
print(df.columns)


In [None]:
df.head()  # Show the first 5 rows (the default); pass a count for more, e.g. df.head(10)
# Alternative quick-look calls, kept commented out for reference:
#df.tail() 
#df.shape # rows, columns
#df.describe()
#df['currentEnergyRating'].nunique()  # NOTE(review): this column name looks like a leftover from another dataset - confirm before enabling
#df.iloc[-5:, :]
#df.iloc[2:5,1:6]


In [74]:
# Summarise column dtypes and non-null counts.
# NOTE(review): the output captured below lists 889 rows and the encoded
# female/male columns, so it appears to have been recorded after the
# preprocessing cells ran; re-run the notebook top to bottom to refresh it.
df.info() 

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  889 non-null    int64  
 1   Pclass    889 non-null    int64  
 2   Age       889 non-null    float64
 3   SibSp     889 non-null    int64  
 4   Parch     889 non-null    int64  
 5   Fare      889 non-null    float64
 6   Embarked  889 non-null    int64  
 7   female    889 non-null    int64  
 8   male      889 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 69.5 KB


In [None]:
# Display summary statistics for the frame.
df.describe()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Cabin is the column with the most NaN values; list how often each
# recorded cabin value occurs.
df["Cabin"].value_counts()

In [None]:
# Print the frequency table of every column, one column at a time.
columns = list(df.columns)

for name in columns:
    counts = df[name].value_counts()
    print("\n", counts, "\n")

In [None]:
# Restrict the frequency tables to the object-typed columns.
columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']

for name in columns:
    counts = df[name].value_counts()
    print("\n", counts, "\n")

In [None]:
# Count fully duplicated rows.
df.duplicated().sum()

In [None]:
# Pairwise relationships between the columns of the frame.
# pairplot builds a grid of subplots, so the title has to be attached to the
# figure itself; plt.title() would only label the last subplot drawn.
pair_grid = sns.pairplot(df)
# y slightly above 1 keeps the title clear of the top row of subplots.
pair_grid.figure.suptitle('Pairplot of Data', y=1.02)
plt.show()

In [None]:
custom_palette = ["#388087"]

# Draw a separate box plot for each int/float column so that outliers in
# that column are easy to spot.
numeric_columns = df.select_dtypes(include=['int', 'float']).columns
for col in numeric_columns:
    fig, ax = plt.subplots(figsize=(5, 5))
    # A single box is drawn, so a plain colour is passed instead of a palette.
    bp = sns.boxplot(data=df, y=col, color=custom_palette[0], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    plt.show()
    plt.close(fig)


In [None]:
# Histogram of passenger ages (30 bins) with a KDE curve overlaid.
age_axes = sns.histplot(df, x='Age', bins=30, kde=True, color='green')
age_axes.set_title('Distribution Plot of Age of Passengers')
plt.show()


In [None]:
# Distribution of the Survived flag, split by gender through the hue.
# The hue has two levels, so the palette needs two colours; a one-colour
# list is cycled by seaborn and makes the two violins indistinguishable.
gender_palette = ["#388087", "#66CDAA"]
sns.violinplot(hue='Sex', y='Survived', data=df, palette=gender_palette)
plt.title('Violinplot of a gender')
plt.show()


In [None]:
# Age distribution per gender.
# The hue has two levels, so the palette needs two colours; a one-colour
# list is cycled by seaborn and both violins end up in the same colour.
gender_palette = ["#388087", "#66CDAA"]
sns.violinplot(x='Sex', y='Age', data=df, hue='Sex', palette=gender_palette)
plt.title('Violinplot of Gender')
plt.show()


In [None]:
# Histogram of ticket fares (80 bins) with a KDE curve overlaid.
fare_axes = sns.histplot(df, x='Fare', bins=80, kde=True, color='green')
fare_axes.set_title('Distribution Plot of Fare of Passengers')
plt.show()


In [None]:
# Make a single aquamarine tone the default seaborn colour cycle, then plot
# how many passengers fall under each Embarked value.
custom_palette = ["#66CDAA"]
sns.set_palette(palette=custom_palette)
sns.displot(df, x="Embarked")

In [None]:
# Scatter of gender (x axis) against passenger class (y axis).
sns.scatterplot(data=df, x="Sex",y="Pclass", hue="Sex", palette="viridis",s=250)
# Axis labels follow the x=/y= mapping above; they were previously swapped.
plt.xlabel('Gender')
plt.ylabel('Pclass')
plt.show()

In [None]:
# Scatter of embarkation port (x axis) against passenger class (y axis),
# coloured by gender.
sns.scatterplot(data=df, x="Embarked",y="Pclass", hue="Sex", palette="viridis",s=550)
# Axis labels follow the x=/y= mapping above; they were previously swapped.
plt.xlabel('Embarked')
plt.ylabel('Pclass')
plt.show()

## Preprocessing Operations :

In [None]:
# Remove the Name and PassengerId columns, which are not used as features.
unused_columns = ["Name", "PassengerId"]
df.drop(unused_columns, axis=1, inplace=True)

In [None]:
# Count the missing Cabin entries.
df.Cabin.isna().sum()

In [None]:
# Cabin has too many missing values to be useful, so drop the column.
df.drop("Cabin", axis=1, inplace=True)

In [None]:
# Replace missing ages with the mean age.
# The mean is taken over the whole frame, i.e. before the train/test split.
mean_age = df["Age"].mean()
df["Age"] = df["Age"].fillna(mean_age)

In [None]:
# Only two records still contain missing data at this point, so the rows
# are discarded rather than imputed.
df = df.dropna(axis=0, how="any")

In [None]:
# Re-check dtypes and non-null counts after the cleaning steps above.
df.info()

In [None]:
# List the object-typed columns with their value frequencies to decide
# which columns need encoding and which encoding suits each.
columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']

for name in columns:
    counts = df[name].value_counts()
    print("\n", counts, "\n")

In [None]:
# Ticket has too many distinct values to label-encode sensibly, so the
# column is dropped.
df.drop(columns=["Ticket"], inplace=True)

## Encoding Categorical Data :

In [None]:
# One-hot encode the listed columns: each source column is replaced by one
# integer indicator column per category, appended at the end of the frame.
Get_Dumm = ["Sex"]

for column in Get_Dumm:
    Encoded_with_getdummies = pd.get_dummies(df[column], dtype="int", drop_first=False)
    df = pd.concat([df, Encoded_with_getdummies], axis=1).drop(column, axis=1)

In [None]:
# Label-encode the listed columns in place, using a fresh encoder for each.
Label_categ = ["Embarked"]
for column in Label_categ:
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])

In [None]:
# Display the frame after encoding.
df

## Scaling Numerical data 

In [None]:
# Min-max scale every column of the encoded frame. The scaler is fitted on
# the whole frame, which still includes the Survived column.
scaler = MinMaxScaler()
Scaled_Data = scaler.fit(df).transform(df)

In [None]:
# Inspect the scaler output.
Scaled_Data

In [None]:
# Wrap the scaled values in a DataFrame so that columns can be labelled.
Scaled_Data = pd.DataFrame(data=Scaled_Data)

In [None]:
# Inspect the scaled values now that they are held in a DataFrame.
Scaled_Data

In [None]:
# Restore the column labels on the scaled frame.
# Scaled_Data was built from df, so the labels are taken straight from df
# rather than from a hand-maintained list, which would mislabel columns
# if the preprocessing steps ever change.
Scaled_Data.columns = df.columns.tolist()

In [None]:
# Inspect the scaled frame after its columns were labelled.
Scaled_Data

### Splitting Data into features and Goal Groups

In [None]:
# Survived is the prediction target; every other column is a feature.
Goal = Scaled_Data["Survived"]
Features = Scaled_Data.drop("Survived", axis=1)

### Splitting Groups into train and test groups :

In [None]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split
# reproducible.
split_parts = train_test_split(Features, Goal, test_size=0.1, random_state=0)
Features_train, Features_test, Goal_train, Goal_test = split_parts

## Applying Grid Search Model :

In [None]:
# Candidate hyper-parameter grids for GridSearchCV, one dict per estimator.
# LR_params, DTC_params and RFC_params are redefined by later cells before
# they are used; KNN_params is consumed exactly as defined here.
#
# "No value" entries must be the None object: the string "None" is not an
# accepted value for penalty, max_leaf_nodes or max_features.

# Logistic regression grid.
LR_params={"penalty":['l1', 'l2','elasticnet', None],
           "dual":[True,False],
           "tol":[1e-7,1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e-0,1e+1,1e+2,1e+3,1e+4,1e+5],
           "solver":['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']}

# k-nearest-neighbours grid.
KNN_params={"n_neighbors":[3,4,5,6,7,8,9,10],
            "algorithm":['auto', 'ball_tree', 'kd_tree', 'brute'],
            "weights":["uniform","distance"],
            "leaf_size":[10,20,30,40,50]}

# Decision tree grid. max_leaf_nodes starts at 2 because a tree needs at
# least two leaves; None leaves the number of leaves unlimited.
DTC_params={"max_leaf_nodes":[2,3,4,5,7,8,None],
            "random_state":[0,10,15,30,40,42,44,46,50,60],
            'max_depth' : [3,5,7,10],
            'criterion': ['gini', 'entropy']}

# Random forest grid. max_features=None means every feature is considered
# at each split.
RFC_params={"n_estimators":[100,120,160,200,240],
            "criterion":['gini', 'entropy', 'log_loss'],
            "max_features":["sqrt","log2",None],
            "bootstrap":[True,False]}

## Applying Grid Search on Logistic Regression model :

In [None]:
# Logistic-regression grid limited to penalty/solver pairs that work
# together: both 'saga' and 'liblinear' accept 'l1' as well as 'l2'.
LR_params = dict(
    penalty=['l1', 'l2'],
    solver=['saga', 'liblinear'],
    tol=[0.1, 0.01],
    C=[1, 10, 100],
)

# A generous iteration cap for the solver.
LR = LogisticRegression(max_iter=1000)
# error_score='raise' surfaces any failing parameter combination at once.
GRS = GridSearchCV(estimator=LR, param_grid=LR_params, cv=5, error_score='raise')
GRS.fit(Features_train, Goal_train)

for outcome in (GRS.best_params_, GRS.best_score_):
    print(outcome)

In [None]:
# Fit the logistic-regression model with the chosen settings and predict
# on both partitions.
lr_settings = {"penalty": "l2", "dual": False, "tol": 0.1, "solver": "saga"}
LR = LogisticRegression(**lr_settings)
LR.fit(Features_train, Goal_train)
LR_Pred_Train = LR.predict(Features_train)
LR_Pred = LR.predict(Features_test)

In [None]:
# Build both logistic-regression reports first, then print them with
# their headings.
lr_test_report = classification_report(Goal_test, LR_Pred)
lr_train_report = classification_report(Goal_train, LR_Pred_Train)

print("Classification Report for the test group : \n")
print(lr_test_report, "\n\n")
print("Classification Report for the train group (to check if there is any overfitting): \n")
print(lr_train_report, "\n\n")

In [None]:
# Confusion matrix of the logistic-regression predictions on the test set.
# Stored as CM_LR (matching CM_KNN / CM_DTC) instead of CM_RFC, which is the
# name the random-forest cells use for their own matrix.
CM_LR = confusion_matrix(Goal_test, LR_Pred)
print(CM_LR)
sns.heatmap(CM_LR, annot=True, cmap="viridis")
plt.show()

## Applying Grid Search on KNN model 

In [None]:
# 5-fold grid search over the KNN grid defined earlier.
KNN = KNeighborsClassifier()
GRS = GridSearchCV(estimator=KNN, param_grid=KNN_params, cv=5)
GRS.fit(Features_train, Goal_train)

for outcome in (GRS.best_params_, GRS.best_score_):
    print(outcome)

In [None]:
# Fit the KNN model with the chosen settings and predict on both
# partitions.
knn_settings = {
    "n_neighbors": 8,
    "weights": "uniform",
    "algorithm": "auto",
    "leaf_size": 10,
}
KNN = KNeighborsClassifier(**knn_settings)
KNN.fit(Features_train, Goal_train)
KNN_Pred_Train = KNN.predict(Features_train)
KNN_Pred = KNN.predict(Features_test)

In [None]:
# Build both KNN reports first, then print them with their headings.
knn_test_report = classification_report(Goal_test, KNN_Pred)
knn_train_report = classification_report(Goal_train, KNN_Pred_Train)

print("Classification Report for the test group : \n")
print(knn_test_report, "\n\n")
print("Classification Report for the train group (to check if there is any overfitting): \n")
print(knn_train_report, "\n\n")

In [None]:
# Confusion matrix of the KNN predictions on the test set, printed and
# drawn as an annotated heat map.
CM_KNN = confusion_matrix(y_true=Goal_test, y_pred=KNN_Pred)
print(CM_KNN)
sns.heatmap(data=CM_KNN, cmap="viridis", annot=True)
plt.show()

## Applying Grid Search on Decision Tree model :

In [None]:
# Decision-tree grid: None leaves depth or leaf count unlimited, and the
# single random_state keeps each fit reproducible.
DTC_params = dict(
    criterion=['gini', 'entropy'],
    max_depth=[3, 5, 7, None],
    max_leaf_nodes=[None, 5, 10, 20],
    random_state=[0],
)

DTC = DecisionTreeClassifier()

# error_score='raise' surfaces any failing parameter combination at once.
GRS = GridSearchCV(estimator=DTC, param_grid=DTC_params, cv=5, error_score='raise')
GRS.fit(Features_train, Goal_train)

# Report the winning combination and its cross-validated score.
for outcome in (GRS.best_params_, GRS.best_score_):
    print(outcome)

In [None]:
# Fit the decision tree with the chosen settings and predict on both
# partitions.
dtc_settings = {
    "criterion": 'entropy',
    "max_depth": 5,
    "min_samples_leaf": 4,
    "min_samples_split": 2,
}
DTC = DecisionTreeClassifier(**dtc_settings)
DTC.fit(Features_train, Goal_train)
DTC_Pred_Train = DTC.predict(Features_train)
DTC_Pred = DTC.predict(Features_test)

In [None]:
# Build both decision-tree reports first, then print them with their
# headings.
dtc_test_report = classification_report(Goal_test, DTC_Pred)
dtc_train_report = classification_report(Goal_train, DTC_Pred_Train)

print("Classification Report for the test group : \n")
print(dtc_test_report, "\n\n")
print("Classification Report for the train group (to check if there is any overfitting): \n")
print(dtc_train_report, "\n\n")

In [None]:
# Confusion matrix of the decision-tree predictions on the test set,
# printed and drawn as an annotated heat map.
CM_DTC = confusion_matrix(y_true=Goal_test, y_pred=DTC_Pred)
print(CM_DTC)
sns.heatmap(data=CM_DTC, cmap="viridis", annot=True)
plt.show()

## Applying Grid Search on Random Forest model :

In [None]:
# Random-forest grid: forest size, tree depth, features considered per
# split and the minimum samples needed to split a node. The single
# random_state keeps each fit reproducible.
RFC_params = dict(
    n_estimators=[100, 200],
    max_depth=[5, 10, 20, None],
    max_features=['sqrt', 'log2', None],
    min_samples_split=[2, 5, 10],
    random_state=[0],
)

RFC = RandomForestClassifier()

# error_score='raise' surfaces any failing parameter combination at once.
GRS = GridSearchCV(estimator=RFC, param_grid=RFC_params, cv=5, error_score='raise')
GRS.fit(Features_train, Goal_train)

# Report the winning combination and its cross-validated score.
for outcome in (GRS.best_params_, GRS.best_score_):
    print(outcome)

In [None]:
# Fit the random forest on the full feature set with the chosen settings
# and predict on both partitions.
rfc_settings = {
    "n_estimators": 120,
    "criterion": "log_loss",
    "max_features": "log2",
    "bootstrap": True,
    "class_weight": 'balanced_subsample',
}
RFC = RandomForestClassifier(**rfc_settings)
RFC.fit(Features_train, Goal_train)
RFC_Pred = RFC.predict(Features_test)
RFC_Pred_Train = RFC.predict(Features_train)

In [None]:
# Build both random-forest reports first, then print them with their
# headings.
rfc_test_report = classification_report(Goal_test, RFC_Pred)
rfc_train_report = classification_report(Goal_train, RFC_Pred_Train)

print("Classification Report for the test group : \n")
print(rfc_test_report, "\n\n")
print("Classification Report for the train group (to check if there is any overfitting): \n")
print(rfc_train_report, "\n\n")

In [None]:
# Confusion matrix of the random-forest predictions on the test set,
# printed and drawn as an annotated heat map.
CM_RFC = confusion_matrix(y_true=Goal_test, y_pred=RFC_Pred)
print(CM_RFC)
sns.heatmap(data=CM_RFC, cmap="viridis", annot=True)
plt.show()

In [None]:
# Rank the features by the importance the fitted random forest assigns to
# them, most important first, and show the ranking as a bar chart.
importances = pd.Series(RFC.feature_importances_, index=Features_train.columns)
feature_scores = importances.sort_values(ascending=False)

importance_axes = sns.barplot(x=feature_scores, y=feature_scores.index)
importance_axes.set_xlabel("Feature Importances")
importance_axes.set_ylabel("Features")
plt.show()

In [None]:
# Training features for the reduced model: Embarked and Parch are left out.
Ftrain_RFC = Features_train.drop(["Embarked", "Parch"], axis=1)

# Fresh estimator for the reduced-feature search.
RFC_ii = RandomForestClassifier()

# Updated search space; max_features uses the None object, not a string.
RFC_Params = dict(
    n_estimators=[50, 100, 200, 300, 400, 500, 600, 700, 800],
    criterion=["gini", "entropy", "log_loss"],
    bootstrap=[True, False],
    class_weight=["balanced", "balanced_subsample"],
    max_features=["sqrt", "log2", None],
)

# error_score='raise' surfaces any failing parameter combination at once.
GRS = GridSearchCV(estimator=RFC_ii, param_grid=RFC_Params, cv=5, error_score='raise')
GRS.fit(Ftrain_RFC, Goal_train)

# Print the best parameters and the best cross-validated score.
print("Лучшие параметры:", GRS.best_params_, "\n")
print("Лучшая оценка:", GRS.best_score_)


In [None]:
# Give the test features the same reduced column set as the training
# features.
Ftest_RFC = Features_test.drop(["Embarked", "Parch"], axis=1)

# Build and train the reduced-feature random forest.
rfc_ii_settings = {"bootstrap": True, "class_weight": "balanced", "criterion": "gini"}
RFC_ii = RandomForestClassifier(**rfc_ii_settings)
RFC_ii.fit(Ftrain_RFC, Goal_train)

# Predict on both partitions. This rebinds RFC_Pred / RFC_Pred_Train, which
# previously held the full-feature model's predictions.
RFC_Pred = RFC_ii.predict(Ftest_RFC)
RFC_Pred_Train = RFC_ii.predict(Ftrain_RFC)

In [None]:
# Build both reports from the current RFC_Pred / RFC_Pred_Train values
# first, then print them with their headings.
reduced_test_report = classification_report(Goal_test, RFC_Pred)
reduced_train_report = classification_report(Goal_train, RFC_Pred_Train)

print("Classification Report for the test group : \n")
print(reduced_test_report, "\n\n")
print("Classification Report for the train group (to check if there is any overfitting): \n")
print(reduced_train_report, "\n\n")

In [None]:
# Confusion matrix built from the current RFC_Pred values on the test set,
# printed and drawn as an annotated heat map.
CM_RFC = confusion_matrix(y_true=Goal_test, y_pred=RFC_Pred)
print(CM_RFC)
sns.heatmap(data=CM_RFC, cmap="viridis", annot=True)
plt.show()

## Choose the final Model :

"""Based on the evaluation of various models, the Random Forest model, trained on the entire feature set, emerges as the 
most effective. It demonstrates a remarkable accuracy of 98% in predicting outcomes within the training dataset. 
Accompanying this high accuracy is an F1 score of 98%, reflecting the model's precision and recall performance metrics. Upon 
application to the testing dataset, the Random Forest model maintains a commendable accuracy rate of 78%, indicating its 
robustness in generalizing to unseen data. A meticulous examination of the confusion matrix further underscores the model's efficacy, as evidenced by the notable proportion of correctly predicted instances.
Specifically, the Random Forest model correctly predicts 69 out of 89 test instances, which corresponds to the 78% test accuracy quoted above. 
Conversely, only 20 instances are misclassified, reaffirming the model's overall proficiency in classification tasks."""

### Saving Encoded And Scaled Data as CSV file 

In [None]:
# Write the encoded frame and the scaled frame to their own CSV files.
csv_exports = (
    (df, "Encoded_titanic_data.csv"),
    (Scaled_Data, "Scaled_titanic_data.csv"),
)
for frame, csv_name in csv_exports:
    frame.to_csv(csv_name)

### Saving the model :

In [None]:
# joblib is not imported by the notebook's import cell, so it is imported
# here; without it this cell fails with a NameError.
import joblib

# Persist the fitted random forest held in RFC to disk.
joblib.dump(RFC,"Titanic Classifier Model (IEE Task).sav")