In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the passenger table into a DataFrame.
# NOTE(review): the path is absolute and hard-coded (presumably a Colab runtime
# location) — adjust it when running elsewhere.
data = pd.read_csv(filepath_or_buffer='/content/Titanic-Dataset.csv')

In [None]:
# Replace missing ages with the column mean.
# NOTE(review): the mean is computed over every row, before the train/test
# splits made in later cells, so held-out rows contribute to the imputed value.
imputer_age = SimpleImputer(strategy='mean')
data['Age'] = imputer_age.fit_transform(data[['Age']])

# Remove the Cabin column. errors='ignore' keeps this cell re-runnable: without
# it, a second execution raises KeyError because `data` was already rebound to
# the frame that no longer has the column.
data = data.drop(columns=['Cabin'], errors='ignore')

# Fill missing embarkation values with the most frequent value of the column.
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

# SibSp + Parch plus one (presumably the extra one counts the passenger).
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

In [None]:
def _extract_title(name):
    """Return the text between the first comma and the next period, stripped.

    Example: 'Braund, Mr. Owen Harris' -> 'Mr'.
    Raises IndexError when `name` contains no comma.
    """
    return name.split(',')[1].split('.')[0].strip()


# Groups raw honorifics into a handful of coarse categories.
# 'Ms' and 'the Countess' are listed explicitly: the parser above yields
# 'the Countess' (not 'Countess') for a name written "..., the Countess. of ...",
# and neither spelling had a key before, so such rows were mapped to NaN.
title_mapping = {
    'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
    'Dr': 'Officer', 'Rev': 'Officer', 'Col': 'Officer', 'Major': 'Officer',
    'Mlle': 'Miss', 'Mme': 'Mrs', 'Ms': 'Miss',
    'Don': 'Royalty', 'Dona': 'Royalty', 'Lady': 'Royalty',
    'Countess': 'Royalty', 'the Countess': 'Royalty',
    'Sir': 'Royalty', 'Jonkheer': 'Royalty', 'Capt': 'Officer'
}

# Series.map returns NaN for any key missing from the dict; send those rows to
# an explicit 'Other' bucket so no missing value reaches the encoder or models.
data['Title'] = (
    data['Name']
    .apply(_extract_title)
    .map(title_mapping)
    .fillna('Other')
)

In [None]:
# Integer-encode the categorical columns. A single encoder instance is refitted
# for each column in turn, so once this cell finishes `label_encoder` only holds
# the classes of the last column ('Title').
label_encoder = LabelEncoder()
for categorical_col in ('Sex', 'Embarked', 'Title'):
    data[categorical_col] = label_encoder.fit_transform(data[categorical_col])

# Cap fares: any value above 300 is replaced by 300, everything else is kept.
data['Fare'] = data['Fare'].mask(data['Fare'] > 300, 300)
print(data.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name  Sex   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris    1  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    0  38.0      1      0   
2                             Heikkinen, Miss. Laina    0  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    0  35.0      1      0   
4                           Allen, Mr. William Henry    1  35.0      0      0   

             Ticket     Fare  Embarked  FamilySize  Title  
0         A/5 21171   7.2500         2           2      2  
1          PC 17599  71.2833         0           2      3  
2  STON/O2. 3101282   7.9250         2           1      1  
3            113803  53.1000         2           2      

In [None]:
# Logistic regression baseline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Features are every column except the target and the three columns dropped
# here (Name, PassengerId, Ticket).
X = data.drop(columns=['Survived', 'Name', 'PassengerId', 'Ticket'])
y = data['Survived']

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)

# All metrics below must read y_pred_logreg. This cell never assigns `y_pred`
# (only the later random-forest cell does), so using it here raised NameError on
# a fresh kernel or silently scored another model's predictions on a stale one.

# Accuracy score
accuracy = accuracy_score(y_test, y_pred_logreg)
print(f'Accuracy: {accuracy:.4f}')

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_logreg)
print('Confusion Matrix:')
print(conf_matrix)

# Classification report
class_report = classification_report(y_test, y_pred_logreg)
print('Classification Report:')
print(class_report)


Accuracy: 0.8101
Confusion Matrix:
[[90 15]
 [19 55]]
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       105
           1       0.79      0.74      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179



In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold

In [None]:
# Stratified splitter with 5 folds; shuffling is seeded so folds are repeatable.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold accuracy of the `logreg` estimator over the full X, y.
cv_results = cross_val_score(
    estimator=logreg,
    X=X,
    y=y,
    scoring='accuracy',
    cv=kfold,
)

print(f'Cross-Validation Accuracy Scores: {cv_results}')

Cross-Validation Accuracy Scores: [0.7877095  0.79213483 0.79213483 0.78651685 0.80898876]


Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Rebuild the feature matrix and target from the prepared frame.
y = data['Survived']
X = data.drop(columns=['Survived', 'Name', 'PassengerId', 'Ticket'])

# Hold out 20% of the rows; the seed fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest and predict on the held-out rows.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# Compute the three evaluation artefacts first, then print them in order.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

Accuracy: 0.8380
Confusion Matrix:
[[91 14]
 [15 59]]
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.87      0.86       105
           1       0.81      0.80      0.80        74

    accuracy                           0.84       179
   macro avg       0.83      0.83      0.83       179
weighted avg       0.84      0.84      0.84       179



Hyperparameter Tuning

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Feature matrix and target rebuilt from the prepared frame.
y = data['Survived']
X = data.drop(columns=['Survived', 'Name', 'PassengerId', 'Ticket'])

# Unfitted base estimator; the search clones and fits it per candidate.
rf_model = RandomForestClassifier(random_state=42)

# Candidate values sampled by the randomized search.
param_grid = dict(
    n_estimators=[100, 200],        # number of trees
    max_depth=[10, 20, None],       # tree depth limit (None = no limit)
    min_samples_split=[2, 5],       # minimum samples needed to split a node
    min_samples_leaf=[1, 2],        # minimum samples required in a leaf
    bootstrap=[True, False],        # whether trees are built on bootstrap samples
)

# 10 sampled candidates, each scored by 3-fold cross-validated accuracy.
rf_random = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_grid,
    n_iter=10,
    cv=3,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1,
)

# NOTE(review): the search is fitted on the full X, y, so no rows are held out
# from tuning; the score printed below is the cross-validated one.
rf_random.fit(X, y)

print(f'Best Parameters: {rf_random.best_params_}')
print(f'Best Accuracy: {rf_random.best_score_:.4f}')


Best Parameters: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 10, 'bootstrap': True}
Best Accuracy: 0.8316


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Feature matrix and target rebuilt from the prepared frame.
y = data['Survived']
X = data.drop(columns=['Survived', 'Name', 'PassengerId', 'Ticket'])

# Hold out 20% of the rows; tuning below only sees the training part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Exhaustive grid over three decision-tree settings.
param_grid = dict(
    max_depth=[None, 5, 10, 15, 20],    # None: expand until all leaves are pure
    min_samples_split=[2, 5, 10],       # minimum samples to split an internal node
    min_samples_leaf=[1, 2, 4, 6],      # minimum samples at a leaf node
)

dt_model = DecisionTreeClassifier(random_state=42)

# Every grid combination is scored by 5-fold cross-validated accuracy.
grid_search = GridSearchCV(
    estimator=dt_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best Accuracy: {grid_search.best_score_:.4f}')

# Evaluate the best estimator found by the search on the held-out rows.
best_dt_model = grid_search.best_estimator_
y_pred_dt = best_dt_model.predict(X_test)

dt_accuracy = accuracy_score(y_test, y_pred_dt)
dt_conf_matrix = confusion_matrix(y_test, y_pred_dt)
dt_class_report = classification_report(y_test, y_pred_dt)

print(f'Accuracy on Test Set: {dt_accuracy:.4f}')
print('Confusion Matrix:')
print(dt_conf_matrix)
print('Classification Report:')
print(dt_class_report)


Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best Parameters: {'max_depth': 5, 'min_samples_leaf': 6, 'min_samples_split': 2}
Best Accuracy: 0.8188
Accuracy on Test Set: 0.8045
Confusion Matrix:
[[94 11]
 [24 50]]
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.90      0.84       105
           1       0.82      0.68      0.74        74

    accuracy                           0.80       179
   macro avg       0.81      0.79      0.79       179
weighted avg       0.81      0.80      0.80       179

