In [None]:
# import libraries...
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.cluster import KMeans


In [3]:
# load datasets...

df = pd.read_csv("titanic.csv")
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [21]:
# first 5 rows..
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [4]:
# drop unwanted columns   
df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

In [None]:
# Fill missing values using mean or median or mode..
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
df['Fare'].fillna(df['Fare'].median(), inplace=True)

df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Fare'].fillna(df['Fare'].median(), inplace=True)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.000000,1,0,7.2500,S
1,1,1,female,38.000000,1,0,71.2833,C
2,1,3,female,26.000000,0,0,7.9250,S
3,1,1,female,35.000000,1,0,53.1000,S
4,0,3,male,35.000000,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.000000,0,0,13.0000,S
887,1,1,female,19.000000,0,0,30.0000,S
888,0,3,female,29.699118,1,2,23.4500,S
889,1,1,male,26.000000,0,0,30.0000,C


In [22]:
# find the no.of cols where having the null values...
df.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [7]:
# Encode categorical variables using Label Encoder...

label_encoder = LabelEncoder()
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

for column in categorical_cols:
    df[column] = label_encoder.fit_transform(df[column])

In [10]:
# Features and target
x = df.drop('Survived', axis=1)
y = df['Survived']

x,y

(     Pclass  Sex        Age  SibSp  Parch     Fare  Embarked
 0         3    1  22.000000      1      0   7.2500         2
 1         1    0  38.000000      1      0  71.2833         0
 2         3    0  26.000000      0      0   7.9250         2
 3         1    0  35.000000      1      0  53.1000         2
 4         3    1  35.000000      0      0   8.0500         2
 ..      ...  ...        ...    ...    ...      ...       ...
 886       2    1  27.000000      0      0  13.0000         2
 887       1    0  19.000000      0      0  30.0000         2
 888       3    0  29.699118      1      2  23.4500         2
 889       1    1  26.000000      0      0  30.0000         0
 890       3    1  32.000000      0      0   7.7500         1
 
 [891 rows x 7 columns],
 0      0
 1      1
 2      1
 3      1
 4      0
       ..
 886    0
 887    1
 888    0
 889    1
 890    0
 Name: Survived, Length: 891, dtype: int64)

In [11]:
# Feature scaling
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [13]:
# split the data sets into 80% training and 20% testing..

x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.2, random_state=42)

In [None]:
# 1. Multiple Linear Regression

mlr_model = LinearRegression()
mlr_model.fit(X_train, y_train)
mlr_pred = mlr_model.predict(X_test)

mlr_pred_class = np.round(mlr_pred).astype(int)
mlr_acc = accuracy_score(y_test, mlr_pred_class)

print("\nMultiple Linear Regression Classification Report:")
print(classification_report(y_test, mlr_pred_class))


Multiple Linear Regression Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.85      0.83       105
           1       0.77      0.72      0.74        74

    accuracy                           0.79       179
   macro avg       0.79      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179



In [15]:
# 2. Decision Tree

dt_model = DecisionTreeClassifier()
dt_model.fit(x_train, y_train)
dt_pred = dt_model.predict(x_test)
dt_acc = accuracy_score(y_test, dt_pred)

print("\nDecision Tree Classification Report:")
print(classification_report(y_test, dt_pred))


Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.83      0.83       105
           1       0.76      0.76      0.76        74

    accuracy                           0.80       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179



In [16]:
# 3. K-Nearest Neighbors
knn_model = KNeighborsClassifier()
knn_model.fit(x_train, y_train)
knn_pred = knn_model.predict(x_test)
knn_acc = accuracy_score(y_test, knn_pred)

print("\nK-Nearest Neighbors Classification Report:")
print(classification_report(y_test, knn_pred))


K-Nearest Neighbors Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.86      0.83       105
           1       0.78      0.72      0.75        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179



In [17]:
# 4. Naive Bayes
nb_model = GaussianNB()
nb_model.fit(x_train, y_train)
nb_pred = nb_model.predict(x_test)
nb_acc = accuracy_score(y_test, nb_pred)

print("\nNaive Bayes Classification Report:")
print(classification_report(y_test, nb_pred))


Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.78      0.80       105
           1       0.71      0.77      0.74        74

    accuracy                           0.78       179
   macro avg       0.77      0.78      0.77       179
weighted avg       0.78      0.78      0.78       179



In [18]:
# 5. Support Vector Machine
svm_model = SVC()
svm_model.fit(x_train, y_train)
svm_pred = svm_model.predict(x_test)
svm_acc = accuracy_score(y_test, svm_pred)

print("\nSupport Vector Machine Classification Report:")
print(classification_report(y_test, svm_pred))


Support Vector Machine Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.89      0.85       105
           1       0.82      0.72      0.76        74

    accuracy                           0.82       179
   macro avg       0.82      0.80      0.81       179
weighted avg       0.82      0.82      0.81       179



In [19]:
# 6. Clustering (unsupervised)
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(x_scaled)
cluster_labels = kmeans.labels_

from sklearn.metrics import adjusted_rand_score
clustering_score = adjusted_rand_score(y, cluster_labels)

print(f"\nKMeans Clustering (Adjusted Rand Index): {clustering_score:.4f}")



KMeans Clustering (Adjusted Rand Index): 0.1171


In [20]:
# Print model accuracy scores...

print("\nModel Accuracy Scores:")
print(f"Multiple Linear Regression (rounded): {mlr_acc:.4f}")
print(f"Decision Tree: {dt_acc:.4f}")
print(f"K-Nearest Neighbors: {knn_acc:.4f}")
print(f"Naive Bayes: {nb_acc:.4f}")
print(f"Support Vector Machine: {svm_acc:.4f}")



Model Accuracy Scores:
Multiple Linear Regression (rounded): 0.7933
Decision Tree: 0.7989
K-Nearest Neighbors: 0.7989
Naive Bayes: 0.7765
Support Vector Machine: 0.8156
