<a href="https://colab.research.google.com/github/Shadabur-Rahaman/30-days-ml-projects/blob/main/Day3_Titanic_Logistic_Regression_Cleaned.ipynb/notebooks/Day3_Titanic_Logistic_Regression_Cleaned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Day 3: Titanic Survival Prediction with Logistic Regression

### 🧠 Goal:
Predict survival on the Titanic using logistic regression (Kaggle dataset).

In [None]:
# 📚 Import Libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# 📂 Load Datasets
# Read the training and test CSV files; the relative paths are resolved
# against the notebook's current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# 👀 Data Overview
# First rows, column dtypes / non-null counts, summary statistics and
# per-column missing-value counts for the training frame.
print(train.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
train.info()
print(train.describe())
print(train.isnull().sum())

In [None]:
# 🧹 Data Cleaning & Imputation
# Fill values are computed from the training frame only and applied to both
# frames, so the test set is imputed with the same statistics as train.
fill_values = {
    'Age': train['Age'].median(),
    'Fare': train['Fare'].median(),
    'Embarked': train['Embarked'].mode()[0],
}

# Results are assigned back instead of using `df[col].fillna(..., inplace=True)`:
# that form mutates a temporary column selection, which pandas 2.x warns about
# and which leaves the frame unchanged under Copy-on-Write.
#
# NOTE(review): every other column referenced in this notebook is capitalised,
# so the cabin column is presumably 'Cabin'; both spellings are dropped and
# whichever is absent is ignored.
train = train.fillna(fill_values).drop(columns=['Cabin', 'cabin'], errors='ignore')
test = test.fillna(fill_values).drop(columns=['Cabin', 'cabin'], errors='ignore')

In [None]:
# 🔧 Feature Engineering
# Integer codes for the two categorical columns.
# NOTE(review): the 0/1/2 coding makes the model treat Embarked as ordinal.
SEX_CODES = {'male': 0, 'female': 1}
EMBARKED_CODES = {'S': 0, 'C': 1, 'Q': 2}

for dataset in [train, test]:
    # Series.map turns every value that is not a key of the dict into NaN, so
    # mapping an already-encoded column (e.g. when this cell is re-run) would
    # wipe it out. Only encode while the column still holds the raw labels.
    if not pd.api.types.is_numeric_dtype(dataset['Sex']):
        dataset['Sex'] = dataset['Sex'].map(SEX_CODES)
    if not pd.api.types.is_numeric_dtype(dataset['Embarked']):
        dataset['Embarked'] = dataset['Embarked'].map(EMBARKED_CODES)
    # Siblings/spouses + parents/children + the passenger themself.
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

In [None]:
# 🎯 Feature Selection
# Columns fed to the model, and the target column to predict.
features = [
    'Pclass',
    'Sex',
    'Age',
    'Fare',
    'Embarked',
    'FamilySize',
]
X, y = train.loc[:, features], train.loc[:, 'Survived']

In [None]:
# 📐 Feature Scaling
# Fit the standardiser on the training features, then apply that same fitted
# transformation to both the training features and the test features.
# NOTE(review): the scaler is fitted on all training rows, including the ones
# that are later held out as the validation split.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(test[features])

In [None]:
# 🧠 Train/Validation Split
# Hold out 20% of the scaled training rows for validation; the fixed
# random_state keeps the split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# 🚂 Model Training
# Fit a default-configured logistic regression on the training split.
# fit() returns the estimator itself, so construction and fitting are chained;
# the bare `model` on the last line keeps the cell's estimator repr output.
model = LogisticRegression().fit(X_train, y_train)
model

In [None]:
# 🧪 Evaluation
# Score the fitted model on the held-out validation split.
y_pred = model.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Classification Report:\n", classification_report(y_val, y_pred))

# confusion_matrix puts true labels on rows and predicted labels on columns;
# the axes are labelled so the figure can be read on its own.
fig, ax = plt.subplots()
sns.heatmap(confusion_matrix(y_val, y_pred), annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set(
    xlabel='Predicted label',
    ylabel='True label',
    title='Confusion matrix (validation split)',
)
# Render the figure explicitly instead of leaking the Axes repr as cell output.
plt.show()

In [None]:
# 🔄 Cross-validation
# Scale inside each fold: the pipeline refits StandardScaler on the training
# part of every split, so held-out rows never influence the scaling statistics
# (cross-validating on the pre-scaled X_scaled would leak them).
# cross_val_score clones the estimator it is given, so the already-fitted
# `model` is used only as a template and is not refitted here.
cv_pipeline = make_pipeline(StandardScaler(), model)
cv_scores = cross_val_score(cv_pipeline, X, y, cv=5)
print("CV Accuracy: {:.2f} ± {:.2f}".format(cv_scores.mean(), cv_scores.std()))

In [None]:
# 📤 Kaggle Submission
# Pair each test PassengerId with the model's prediction on the scaled test
# features and write the two columns to submission.csv without the index.
submission = test[['PassengerId']].assign(Survived=model.predict(X_test_scaled))
submission.to_csv('submission.csv', index=False)

### ✅ Summary:
- Used Logistic Regression for binary classification
- Handled missing values and performed encoding
- Engineered a `FamilySize` feature (`SibSp + Parch + 1`)
- Evaluated with accuracy, a confusion matrix, and 5-fold cross-validation
- Ready for Kaggle submission!