In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score



### Loading The Data

In [2]:
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")


### Exploring Patterns in data

In [3]:
women = train_data.loc[train_data.Sex == 'female']["Survived"]
rate_women = sum(women)/len(women)
print("% of women who survived:", rate_women)

men = train_data.loc[train_data.Sex == 'male']["Survived"]
rate_men = sum(men)/len(men)
print("% of men who survived:", rate_men)


% of women who survived: 0.7420382165605095
% of men who survived: 0.18890814558058924


### Data Processing 

In [4]:
# 🧹 Data Cleaning and Feature Engineering
train_df = train_data.copy()
test_df = test_data.copy()
combine = [train_df, test_df]

# Encode 'Sex' column
for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map({'male': 0, 'female': 1})

# Fill and encode 'Embarked'
for dataset in combine:
    dataset['Embarked'].fillna('S', inplace=True)
embarked_encoder = LabelEncoder()
train_df['Embarked'] = embarked_encoder.fit_transform(train_df['Embarked'])
test_df['Embarked'] = embarked_encoder.transform(test_df['Embarked'])

# Fill missing 'Age' and 'Fare'
for dataset in combine:
    dataset['Age'].fillna(dataset['Age'].median(), inplace=True)
test_df['Fare'].fillna(test_df['Fare'].median(), inplace=True)

# Create new features
for dataset in combine:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset['Embarked'].fillna('S', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset['Age'].fillna(dataset['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are s

### Extract and Encode “Title” from Name

In [5]:
# Extract Title from Name
for dataset in [train_df, test_df]:
    dataset['Title'] = dataset['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

# Replace rare and similar titles
for dataset in [train_df, test_df]:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',
                                                  'Don', 'Dr', 'Major', 'Rev', 'Sir',
                                                  'Jonkheer', 'Dona'], 'Rare')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

# Encode title
title_encoder = LabelEncoder()
train_df['Title'] = title_encoder.fit_transform(train_df['Title'])
test_df['Title'] = title_encoder.transform(test_df['Title'])


### Define Features & Labels

In [6]:
'''features = ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "IsAlone", "Title"]
X = train_df[features]
y = train_df["Survived"]
X_test = test_df[features]'''


# Add new features first (if not already present)
train_df["FarePerPerson"] = train_df["Fare"] / (train_df["FamilySize"] + 1)
test_df["FarePerPerson"] = test_df["Fare"] / (test_df["FamilySize"] + 1)

train_df["Age*Class"] = train_df["Age"] * train_df["Pclass"]
test_df["Age*Class"] = test_df["Age"] * test_df["Pclass"]

# Final feature list
features = [
    "Pclass", "Sex", "Age", "Fare", "Embarked",
    "FamilySize", "IsAlone", "Title", "SibSp", "Parch",
    "FarePerPerson", "Age*Class"
]

X = train_df[features]
y = train_df["Survived"]
X_test = test_df[features]




### 6. 🤖 Train Model with Cross Validation

In [7]:
model = XGBClassifier(n_estimators=300, learning_rate=0.5, max_depth=5, use_label_encoder=False, eval_metric='logloss')
scores = cross_val_score(model, X, y, cv=5)
print(f"Cross-validation accuracy: {scores.mean():.4f}")


Cross-validation accuracy: 0.8227


### 7. 🧠 Train on Full Data and Predict

In [8]:
model.fit(X, y)
predictions = model.predict(X_test)


## 🔹  8. 💾 Create Submission File

In [9]:
output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")


Your submission was successfully saved!
