In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the read-only input
# directory, then print the paths one per line in traversal order.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


# 🚢 Titanic Survival Prediction – Improved Model

## 📌 Introduction
This project is part of the [Titanic: Machine Learning from Disaster](https://www.kaggle.com/c/titanic) competition.  
The goal is to predict whether a passenger survived or not, based on features like age, gender, ticket class, fare, and embarkation port.  

I started with Kaggle’s beginner tutorial model and improved it by:
- Using additional features
- Handling missing values more effectively
- Tuning a Random Forest classifier by hand (more trees and a capped tree depth) for better accuracy

---

## 📂 Data Loading
The dataset contains:
- **train.csv** → 891 passengers (with survival labels)
- **test.csv** → 418 passengers (without survival labels)
- **gender_submission.csv** → Example file showing required submission format

Columns include:
- `Pclass` → Passenger class (1 = First, 2 = Second, 3 = Third)
- `Sex` → Male/Female
- `Age` → Passenger age in years
- `SibSp` → Number of siblings/spouses aboard
- `Parch` → Number of parents/children aboard
- `Fare` → Ticket price
- `Embarked` → Port of embarkation (C, Q, S)

---

## 📊 Exploratory Data Analysis (EDA)
Quick observations:
- Women had a much higher survival rate (~74%) compared to men (~19%).
- Passengers in a higher ticket class (first class, `Pclass = 1`) had higher survival rates than those in lower classes.
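- Among the features used by the model, missing values are present in `Age`, `Fare`, and `Embarked` (`Cabin` also has missing values, but it is not used as a feature).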

---

## 🛠️ Feature Engineering
- Filled missing `Age` with median.
- Filled missing `Fare` with median.
- Filled missing `Embarked` with mode (most common value).
- Converted categorical variables (`Sex`, `Embarked`) into numeric format using one-hot encoding.

---

## 🤖 Model Training
- Algorithm: **RandomForestClassifier**  
- Parameters:
  - `n_estimators = 200`
  - `max_depth = 7`
  - `random_state = 42`
- Features used: `Pclass`, `Sex`, `SibSp`, `Parch`, `Age`, `Fare`, `Embarked`

---

## 📈 Results & Submission
- The model outputs survival predictions for passengers in **test.csv**.
- Submission file: `submission.csv` with columns:
  - `PassengerId`
  - `Survived` (1 = Survived, 0 = Did not survive)
- **Public Leaderboard Score**: 0.78

---


In [2]:
# Read the labelled training passengers into a DataFrame and preview the
# first rows as the cell output.
TRAIN_CSV_PATH = "/kaggle/input/titanic/train.csv"
train_data = pd.read_csv(TRAIN_CSV_PATH)
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Read the unlabelled passengers to predict for into a DataFrame and preview
# the first rows as the cell output.
TEST_CSV_PATH = "/kaggle/input/titanic/test.csv"
test_data = pd.read_csv(TEST_CSV_PATH)
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
# Survival outcomes (0/1) of the female passengers in the training set.
# Rows and column are selected in a single .loc call instead of chaining
# .loc[...] with a second [...] lookup.
women = train_data.loc[train_data["Sex"] == "female", "Survived"]

# The mean of a 0/1 column is the share of ones, i.e. the survival rate.
# Series.mean() is vectorised, unlike Python's built-in sum() over the Series.
rate_women = women.mean()

# Note: the printed value is a fraction between 0 and 1 (0.74 means 74%),
# even though the label starts with "%".
print("% of women who survived:", rate_women)

% of women who survived: 0.7420382165605095


In [5]:
# Survival outcomes (0/1) of the male passengers in the training set.
# Rows and column are selected in a single .loc call instead of chaining
# .loc[...] with a second [...] lookup.
men = train_data.loc[train_data["Sex"] == "male", "Survived"]

# The mean of a 0/1 column is the share of ones, i.e. the survival rate.
# Series.mean() is vectorised, unlike Python's built-in sum() over the Series.
rate_men = men.mean()

# Note: the printed value is a fraction between 0 and 1 (0.19 means 19%),
# even though the label starts with "%".
print("% of men who survived:", rate_men)

% of men who survived: 0.18890814558058924


In [6]:
from sklearn.ensemble import RandomForestClassifier

# Target variable: 1 = survived, 0 = did not survive.
y = train_data["Survived"]

# Feature set
features = ["Pclass", "Sex", "SibSp", "Parch", "Age", "Fare", "Embarked"]

# Learn every imputation value from the training set only and apply the same
# value to both frames, so the model sees test rows preprocessed exactly like
# the rows it was trained on (median for numeric columns, most common value
# for the embarkation port).
fill_values = {
    "Age": train_data["Age"].median(),
    "Fare": train_data["Fare"].median(),
    "Embarked": train_data["Embarked"].mode()[0],
}

# The frames are filled in place (as before) so that any later cell reading
# train_data / test_data still sees the imputed columns. Re-running this cell
# is harmless: once filled, there is nothing left to fill.
for column, fill_value in fill_values.items():
    train_data[column] = train_data[column].fillna(fill_value)
    test_data[column] = test_data[column].fillna(fill_value)

# One-hot encode categorical variables (Sex, Embarked); numeric columns pass
# through unchanged.
X = pd.get_dummies(train_data[features])

# Encoding the two frames separately only yields identical columns when every
# category occurs in both. Reindex the test matrix to the training columns:
# a category missing from the test set becomes an all-zero column, a category
# unseen during training is dropped, and the column order matches what the
# model was fitted on.
X_test = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)

# Train Random Forest with more depth & estimators; random_state is fixed so
# repeated runs produce the same model.
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=7,
    random_state=42
)
model.fit(X, y)

# Predictions for the unlabelled passengers, in test_data row order.
predictions = model.predict(X_test)

# Save to CSV for submission: one row per test passenger with its predicted
# survival flag, without the DataFrame index.
output = pd.DataFrame({
    'PassengerId': test_data["PassengerId"],
    'Survived': predictions
})
output.to_csv('submission.csv', index=False)
print("Your improved submission was successfully saved!")


Your improved submission was successfully saved!
