In [2]:
from google.colab import files

# Upload the train.csv file
train_file = files.upload()
if 'train.csv' in train_file:
    print("train.csv uploaded successfully.")
else:
    print("Failed to upload train.csv.")

# Upload the test.csv file
test_file = files.upload()
if 'test.csv' in test_file:
    print("test.csv uploaded successfully.")
else:
    print("Failed to upload test.csv.")

# Upload the gender_submission.csv file
submission_file = files.upload()
if 'gender_submission.csv' in submission_file:
    print("gender_submission.csv uploaded successfully.")
else:
    print("Failed to upload gender_submission.csv.")

Saving train.csv to train (1).csv
Failed to upload train.csv.


Saving test.csv to test.csv
test.csv uploaded successfully.


Saving gender_submission.csv to gender_submission.csv
gender_submission.csv uploaded successfully.


In [65]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

# Read the data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Feature Engineering
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1
test_df['FamilySize'] = test_df['SibSp'] + test_df['Parch'] + 1

train_df['IsAlone'] = (train_df['FamilySize'] == 1).astype(int)
test_df['IsAlone'] = (test_df['FamilySize'] == 1).astype(int)

train_df['LogFare'] = np.log1p(train_df['Fare'])
test_df['LogFare'] = np.log1p(test_df['Fare'])

train_df['Title'] = train_df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
test_df['Title'] = test_df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

# Simplify titles
title_mapping = {
    "Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6,
    "Col": 7, "Major": 8, "Mlle": 2, "Countess": 3, "Ms": 2,
    "Lady": 3, "Jonkheer": 3, "Don": 1, "Dona": 3, "Sir": 1,
    "Capt": 7, "Mme": 3
}
train_df['Title'] = train_df['Title'].map(title_mapping)
test_df['Title'] = test_df['Title'].map(title_mapping)

# Fill missing values in categorical columns
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])
test_df['Embarked'] = test_df['Embarked'].fillna(test_df['Embarked'].mode()[0])

# Fill missing values in numerical columns
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].median())

# Drop unnecessary columns
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Fare']
train_df_cleaned = train_df.drop(columns=columns_to_drop, axis=1)
test_df_cleaned = test_df.drop(columns=columns_to_drop, axis=1)

# One-hot encode categorical features
train_df_cleaned = pd.get_dummies(train_df_cleaned, columns=['Embarked', 'Sex'], drop_first=True)
test_df_cleaned = pd.get_dummies(test_df_cleaned, columns=['Embarked', 'Sex'], drop_first=True)

# Ensure columns match between train and test datasets
missing_cols = set(train_df_cleaned.columns) - set(test_df_cleaned.columns)
for col in missing_cols:
    test_df_cleaned[col] = 0
test_df_cleaned = test_df_cleaned[train_df_cleaned.columns.drop('Survived')]

# Separate features and target
X = train_df_cleaned.drop('Survived', axis=1)
y = train_df_cleaned['Survived']

# Handle missing values in numerical columns
imputer = SimpleImputer(strategy='median')
X = imputer.fit_transform(X)
test_df_cleaned = imputer.transform(test_df_cleaned)

# Apply SMOTE to balance classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Scale numerical features
scaler = StandardScaler()
X_resampled = scaler.fit_transform(X_resampled)
X_test = scaler.transform(test_df_cleaned)

# Train Gradient Boosting Classifier
gb_model = GradientBoostingClassifier(
    n_estimators=200, learning_rate=0.05, max_depth=3, random_state=42
)

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gb_scores = []
for train_idx, val_idx in kfold.split(X_resampled, y_resampled):
    X_train, X_val = X_resampled[train_idx], X_resampled[val_idx]
    y_train, y_val = y_resampled[train_idx], y_resampled[val_idx]
    gb_model.fit(X_train, y_train)
    gb_scores.append(accuracy_score(y_val, gb_model.predict(X_val)))

print("Gradient Boosting Average Accuracy:", np.mean(gb_scores))

# Train on full data and make predictions
gb_model.fit(X_resampled, y_resampled)
test_predictions = gb_model.predict(X_test)

# Create submission file
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': test_predictions
})
submission.to_csv('submission.csv', index=False)

# Download submission file
from google.colab import files
files.download('submission.csv')


Gradient Boosting Average Accuracy: 0.8442382731423829


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>