In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score



In [2]:
# Load the dataset
data = pd.read_csv(r"D:\sonia\codsoft internship 2024\ML\datasets\titanic.csv")



In [14]:
print(data)

     Survived  Pclass  Sex   Age     Fare  Embarked  Title  FamilySize  \
0           0       3    1  22.0   7.2500         2      2           1   
1           1       1    0  38.0  71.2833         0      3           1   
2           1       3    0  26.0   7.9250         2      1           0   
3           1       1    0  35.0  53.1000         2      3           1   
4           0       3    1  35.0   8.0500         2      2           0   
..        ...     ...  ...   ...      ...       ...    ...         ...   
886         0       2    1  27.0  13.0000         2      4           0   
887         1       1    0  19.0  30.0000         2      1           0   
888         0       3    0  28.0  23.4500         2      1           3   
889         1       1    1  26.0  30.0000         0      2           0   
890         0       3    1  32.0   7.7500         1      2           0   

     IsAlone  
0          0  
1          0  
2          1  
3          0  
4          1  
..       ...  
886   

In [3]:
# Feature engineering before dropping any columns
data['Title'] = data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
data['Title'] = data['Title'].replace(['Lady', 'Countess', 'Capt', 'Col',
                                        'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
data['Title'] = data['Title'].replace('Mlle', 'Miss')
data['Title'] = data['Title'].replace('Ms', 'Miss')
data['Title'] = data['Title'].replace('Mme', 'Mrs')



In [4]:
# Drop irrelevant features
data = data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)



In [5]:
# Fill missing values
data['Age'].fillna(data['Age'].median(), inplace=True)
data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Age'].fillna(data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)


In [6]:
# Feature engineering
data['FamilySize'] = data['SibSp'] + data['Parch']
data['IsAlone'] = (data['FamilySize'] == 0).astype(int)



In [7]:
# Dropping the original SibSp and Parch columns
data = data.drop(['SibSp', 'Parch'], axis=1)



In [8]:
# Encode categorical features
categorical_features = ['Sex', 'Embarked', 'Title']
for feature in categorical_features:
    le = LabelEncoder()
    data[feature] = le.fit_transform(data[feature])



In [9]:
# Split data into features and target
X = data.drop(['Survived'], axis=1)
y = data['Survived']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [10]:
# Initialize and train individual models
rf_model = RandomForestClassifier(random_state=42)
gb_model = GradientBoostingClassifier(random_state=42)

# Ensemble model using VotingClassifier
ensemble_model = VotingClassifier(estimators=[
    ('rf', rf_model), 
    ('gb', gb_model)
], voting='soft')


In [11]:

# Perform hyperparameter tuning for the ensemble model
param_grid_ensemble = {
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [None, 10, 20],
    'gb__n_estimators': [100, 200],
    'gb__learning_rate': [0.05, 0.1]
}

grid_search_ensemble = GridSearchCV(estimator=ensemble_model, param_grid=param_grid_ensemble, cv=3, n_jobs=-1, verbose=2)
grid_search_ensemble.fit(X_train, y_train)



Fitting 3 folds for each of 24 candidates, totalling 72 fits


In [12]:
# Best parameters from grid search
best_params_ensemble = grid_search_ensemble.best_params_
print(f"Best parameters for ensemble model: {best_params_ensemble}")

# Train the ensemble model with the best parameters
ensemble_model.set_params(**best_params_ensemble)
ensemble_model.fit(X_train, y_train)

# Predict on the test set with the tuned ensemble model
y_pred_ensemble = ensemble_model.predict(X_test)



Best parameters for ensemble model: {'gb__learning_rate': 0.1, 'gb__n_estimators': 100, 'rf__max_depth': 10, 'rf__n_estimators': 100}


In [13]:
# Calculate accuracy with the tuned ensemble model
accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)
print(f'Tuned Ensemble Model Accuracy: {accuracy_ensemble:.3f}')

Tuned Ensemble Model Accuracy: 0.827
