In [22]:

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [30]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

train_df['Age'].fillna(train_df['Age'].mode()[0], inplace=True)
train_df['Fare'].fillna(train_df['Fare'].median(), inplace=True)
train_df['Embarked'].fillna(train_df['Embarked'].mode()[0], inplace=True)

# Feature Engineering
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1
test_df['FamilySize'] = test_df['SibSp'] + test_df['Parch'] + 1

# train_df['Title'] = train_df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
# test_df['Title'] = test_df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

train_df.head()
train_df.drop(['Cabin', 'Ticket'], axis=1, inplace=True)


# Encoding categorical variables
label_encoder = LabelEncoder()
train_df['Sex'] = label_encoder.fit_transform(train_df['Sex'])       # Male=1, Female=0
# train_df['Title'] = label_encoder.fit_transform(train_df['Title'])
# test_df['Title'] = label_encoder.transform(test_df['Title'])

# One-Hot Encode 'Embarked'
train_df = pd.get_dummies(train_df, columns=['Embarked'], drop_first=True)

# Dropping non-informative columns
train_df.drop(['PassengerId', 'Name'], axis=1, inplace=True)

# Prepare data for training
X_train = train_df.drop('Survived', axis=1)
y_train = train_df['Survived']


# Feature Scaling
scaler = StandardScaler()
X_train[['Fare', 'Age']] = scaler.fit_transform(X_train[['Fare', 'Age']])


test_df['Age'].fillna(test_df['Age'].median(), inplace=True)
test_df['Fare'].fillna(test_df['Fare'].median(), inplace=True)
test_df['Embarked'].fillna(test_df['Embarked'].mode()[0], inplace=True)

# Encoding categorical variables in test set
test_df['Sex'] = label_encoder.transform(test_df['Sex'])  # Male=1, Female=0
test_df = pd.get_dummies(test_df, columns=['Embarked'], drop_first=True)


test_df.drop(['PassengerId', 'Name', 'Cabin', 'Ticket'], axis=1, inplace=True)

In [39]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Combine datasets for consistent preprocessing
combine = [train_df, test_df]



# Fill missing 'Age' and 'Fare' values ; 'Embarked' values
for dataset in combine:
    dataset['Age'].fillna(dataset['Age'].mode()[0], inplace=True)
    dataset['Fare'].fillna(dataset['Fare'].mode()[0], inplace=True)
    dataset['Embarked'].fillna(dataset['Embarked'].mode()[0], inplace=True)


# Extract Titles from Names
for dataset in combine:
    dataset['Title'] = dataset['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)


# Simplify titles
title_mapping = {
    "Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Col": 7, "Major": 7, "Mlle": 8, 
    "Countess": 9, "Ms": 2, "Lady": 9, "Jonkheer": 10, "Don": 11, "Dona": 11, "Mme": 3, "Capt": 7, "Sir": 11}
for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping).fillna(0)


# Feature engineering ; additional DOMAIN SPECIFIC features and 
for dataset in combine:
    dataset['FamilySize']     = dataset['SibSp'] + dataset['Parch'] + 1
    dataset['FamilySize_and_Age'] = dataset['FamilySize'] * dataset['Age']
    dataset['Pclass_Fare'] = dataset['Pclass'] * dataset['Fare']
    dataset['Pclass_Age'] = dataset['Pclass'] * dataset['Age']
    dataset['Fare_per_Person'] = dataset['Fare'] / dataset['FamilySize']
    dataset['is_Alone'] = np.where((dataset['SibSp'] == 0) & (dataset['Parch'] == 0), 1, 0)

# Drop unnecessary columns
train_df.drop(['PassengerId', 'Cabin', 'Ticket', 'Name'], axis=1, inplace=True)
test_df.drop (['PassengerId', 'Cabin', 'Ticket', 'Name'], axis=1, inplace=True)



# Convert categorical features to numeric
label_encoder = LabelEncoder()


for dataset in combine:
    dataset['Sex'] = label_encoder.fit_transform(dataset['Sex'])
    
train_df = pd.get_dummies(train_df, columns=['Embarked'], drop_first=True)
test_df  = pd.get_dummies(test_df, columns=['Embarked'], drop_first=True)



# Feature scaling
scaler = StandardScaler()
train_df[['Fare', 'Age']] = scaler.fit_transform(train_df[['Fare', 'Age']])
test_df[['Fare', 'Age']] = scaler.transform(test_df[['Fare', 'Age']])

# Split data for training
X_train = train_df.drop('Survived', axis=1)
y_train = train_df['Survived']

## Hyperparameter Tuning and Training - Random Forest

In [40]:
# Random Forest Classifier with GridSearchCV
rf = RandomForestClassifier(random_state=42)

# Best parameters for Random Forest: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300} => 81.64
# Best parameters for Random Forest: {'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 15, 'n_estimators': 500} => 81.56
# Best parameters for Random Forest: {'max_depth': 3, 'min_samples_leaf': 6, 'min_samples_split': 15, 'n_estimators': 700} =>

# Define hyperparameters to tune
rf_params = {
    'n_estimators': [100, 300, 500, 700, 1000],
    'max_depth': [3, 5, 7],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [2, 4, 6]
}

# Perform GridSearchCV
grid_search_rf = GridSearchCV(estimator=rf, param_grid=rf_params, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train, y_train)

# Best parameters
print("Best parameters for Random Forest:", grid_search_rf.best_params_)

# Final model with best parameters
best_rf = grid_search_rf.best_estimator_

Best parameters for Random Forest: {'max_depth': 3, 'min_samples_leaf': 6, 'min_samples_split': 15, 'n_estimators': 700}


## Hyperparameter Tuning and Training - XGBoost

In [41]:
# XGBoost Classifier with GridSearchCV
xgb_model = XGBClassifier(random_state=42)

# Define hyperparameters to tune
xgb_params = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'subsample': [0.5, 0.6, 0.8]
}
# Best parameters for XGBoost: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.6} => 81.64
# Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}

# Perform GridSearchCV
grid_search_xgb = GridSearchCV(estimator=xgb_model, param_grid=xgb_params, cv=5, scoring='accuracy')
grid_search_xgb.fit(X_train, y_train)

# Best parameters
print("Best parameters for XGBoost:", grid_search_xgb.best_params_)

# Final model with best parameters
best_xgb = grid_search_xgb.best_estimator_

Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}


In [45]:
# Print the best scores
print(f"Random Forest Best Score: {grid_search_rf.best_score_}")
print(f"XGBoost Best Score: {grid_search_xgb.best_score_}")

Random Forest Best Score: 0.828612232837585
XGBoost Best Score: 0.832847434255885


In [46]:
# Split data into training and testing sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Define base models
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
xgboost = XGBClassifier(random_state=42)

# Define a meta model (final classifier)
meta_model = LogisticRegression(C=1, random_state=42)


# Create the stacking classifier
stacking_clf = StackingClassifier(
    estimators=[
        ('rf', random_forest),
        ('xgb', xgboost)
    ],
    final_estimator=meta_model,
    cv=5
)

# Fit the model
stacking_clf.fit(X_train, y_train)

# Make predictions
y_pred = stacking_clf.predict(X_val)

# Evaluate the accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

# Make predictions on test set
test_predictions = stacking_clf.predict(test_df)

# Save predictions for submission
submission = pd.DataFrame({
    'PassengerId': pd.read_csv('test.csv')['PassengerId'], 
    'Survived': test_predictions
})
submission.to_csv('StackingMethode_submission_Is_ALone_Feature.csv', index=False)

Validation Accuracy: 0.6842


In [47]:
# Predict on test data using Random Forest, XGBoost
rf_predictions = best_rf.predict(test_df)
xgb_predictions = best_xgb.predict(test_df)

submission_rf = pd.DataFrame({
    'PassengerId': pd.read_csv('test.csv')['PassengerId'],
    'Survived': rf_predictions
})

submission_xgb = pd.DataFrame({
    'PassengerId': pd.read_csv('test.csv')['PassengerId'],
    'Survived': xgb_predictions
})

submission_rf.to_csv('submission_Random_Forst_TItle_5.csv', index=False)
submission_xgb.to_csv('submission_XGBoost_TItle_5.csv', index=False)

## Decision Tree

In [7]:
# Decision Tree with GridSearchCV
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 7, 10, 20, 30, 50 ],  # Explore deeper trees
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2, 4 ],
    'max_features': ['sqrt', 'log2', None],  # Trying different max_features for random selection
}

# Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 2}


# Gave 79.88% accuracy
# params = {
#     'max_depth': [3, 5, 10, None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 5],
#     'criterion': ['gini', 'entropy']
# }

clf = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(clf, param_grid=params, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters from GridSearchCV
print("Best Parameters:", grid_search.best_params_)


# Training model with the best parameters
best_model = grid_search.best_estimator_

# Predict on test set
X_test = test_df
y_pred = best_model.predict(X_test)

X_test[['Fare', 'Age']] = scaler.transform(X_test[['Fare', 'Age']])

submission = pd.DataFrame({
    'PassengerId': pd.read_csv('test.csv')['PassengerId'],
    'Survived': y_pred
})

submission.to_csv('./submissions/submission_4_feature_engineering.csv', index=False)

Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 2}


In [3]:
# Initialize the Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

# Generate submission.csv
submission = pd.DataFrame({
    'PassengerId': pd.read_csv('test.csv')['PassengerId'],
    'Survived': y_pred
})

submission.to_csv('Random_Forest_submission_2.csv', index=False)

In [10]:
xgb_model = XGBClassifier(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)

xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

# Generate submission.csv
submission = pd.DataFrame({
    'PassengerId': pd.read_csv('test.csv')['PassengerId'],
    'Survived': y_pred
})

submission.to_csv('XG_Boost_submission_3.csv', index=False)