In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer


In [13]:
# Load datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Display basic info about the dataset
print(train_df.info())
print(test_df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pcl

In [14]:
# Combine datasets for easier preprocessing
combined_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

# Handle missing values
combined_df['Age'].fillna(combined_df['Age'].median(), inplace=True)
combined_df['Fare'].fillna(combined_df['Fare'].median(), inplace=True)
combined_df['Embarked'].fillna(combined_df['Embarked'].mode()[0], inplace=True)

# Feature engineering: Extract titles from names
combined_df['Title'] = combined_df['Name'].apply(lambda name: name.split(',')[1].split('.')[0].strip())

# Mapping titles to categories
title_mapping = {
    'Mr': 'Mr',
    'Miss': 'Miss',
    'Mrs': 'Mrs',
    'Master': 'Master',
    'Dr': 'Other',
    'Rev': 'Other',
    'Col': 'Other',
    'Major': 'Other',
    'Mlle': 'Miss',
    'Countess': 'Other',
    'Ms': 'Miss',
    'Lady': 'Other',
    'Jonkheer': 'Other',
    'Don': 'Other',
    'Mme': 'Mrs',
    'Capt': 'Other',
    'Sir': 'Other'
}
combined_df['Title'] = combined_df['Title'].map(title_mapping)

# Encoding categorical variables
label_encoders = {}
for feature in ['Sex', 'Embarked', 'Title']:
    label_encoders[feature] = LabelEncoder()
    combined_df[feature] = label_encoders[feature].fit_transform(combined_df[feature])

# Drop unnecessary columns
combined_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

# Split back into train and test datasets
train_processed = combined_df.iloc[:len(train_df)]
test_processed = combined_df.iloc[len(train_df):].drop('Survived', axis=1)


In [15]:
X = train_processed.drop('Survived', axis=1)
y = train_processed['Survived']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Initialize RandomForestClassifier
model = RandomForestClassifier(random_state=42)

# Define parameters for GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Perform GridSearchCV to find best parameters
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Initialize model with best parameters
model = grid_search.best_estimator_

# Train the model
model.fit(X_train, y_train)

# Predict on validation set
y_pred = model.predict(X_val)

# Evaluate model performance
print(classification_report(y_val, y_pred))
print("Accuracy:", accuracy_score(y_val, y_pred))


Fitting 5 folds for each of 324 candidates, totalling 1620 fits


In [None]:
# Predict on test set
test_predictions = model.predict(test_processed)

# Create submission DataFrame
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': test_predictions
})

# Save submission to CSV
submission.to_csv('titanic_submission.csv', index=False)
