In [None]:
#step 1 eda
import pandas as pd
import numpy as np

# Locations of the Titanic train/test CSV files.
# NOTE(review): these are absolute, machine-specific paths; anyone else running
# the notebook has to point them at their own copy of the dataset.
train_path = "C:/Users/PIYUSH PATEL/Downloads/titanic_disaster_dataset/train.csv"
test_path = "C:/Users/PIYUSH PATEL/Downloads/titanic_disaster_dataset/test.csv"

# Load both splits into DataFrames.
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Quick look at the first rows, then the column dtypes and non-null counts.
print(train_df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only adds a stray "None" line after the report.
train_df.info()

# 📊 Step 1: Exploratory Data Analysis (EDA)
# 🔍 Dataset Overview
# The train dataset has 891 entries with 12 columns.
# The Survived column (0 = No, 1 = Yes) is our target variable.
# The dataset contains both numerical & categorical data.
# 📌 Key Observations
# Missing values:
# Age has 177 missing values.
# Cabin has many missing values (687 out of 891).
# Embarked has 2 missing values.
# Important Columns:
# Pclass, Sex, Age, SibSp, Parch, Fare, and Embarked seem useful for prediction.
# Name, Ticket, and Cabin might not be useful directly.



In [None]:
# ✅ Next Step: Data Cleaning
# We'll handle missing values and drop unnecessary columns. Let's clean the data! 🚀 

# Fill missing ages with the median age of the training data.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# deprecated in pandas 2.x and leaves train_df unchanged under Copy-on-Write.
train_df["Age"] = train_df["Age"].fillna(train_df["Age"].median())

# Fill missing embarkation ports with the most common category.
# mode() returns the most frequent value(s), sorted; [0] picks the first of them.
# (Index 1 would only exist if two categories were tied for most frequent.)
train_df["Embarked"] = train_df["Embarked"].fillna(train_df["Embarked"].mode()[0])

# Drop Cabin because most of its values are missing, and Ticket because it is
# expected to differ for almost every passenger and so add little to the model.
train_df = train_df.drop(columns=["Cabin", "Ticket"])

# Verify that the missing values have been handled.
missing_values = train_df.isnull().sum()

missing_values




In [None]:
# 🔥 Next Step: Feature Engineering
# We need to:

# Convert categorical data (Sex, Embarked) into numerical format.
# Create new useful features (e.g., FamilySize = SibSp + Parch).
# Encode the categorical columns as integers using explicit lookup tables.
sex_codes = {"male": 0, "female": 1}
embarked_codes = {"S": 0, "C": 1, "Q": 2}
for column, codes in (("Sex", sex_codes), ("Embarked", embarked_codes)):
    train_df[column] = train_df[column].map(codes)

# New feature: SibSp (number of siblings/spouses aboard) plus Parch (number of
# parents/children aboard), i.e. the count of family members travelling with
# the passenger.
train_df["Familysize"] = train_df["SibSp"].add(train_df["Parch"])

# Name and PassengerId are not used as model inputs, so remove them.
train_df.drop(["Name", "PassengerId"], axis=1, inplace=True)

train_df.head()

# We have successfully transformed the dataset:
# ✅ Encoded categorical variables:

# Sex: (Male → 0, Female → 1)
# Embarked: (S → 0, C → 1, Q → 2)
# ✅ Created a new feature:

# FamilySize = SibSp + Parch (total number of family members onboard).
# ✅ Dropped unnecessary columns:

# Name, PassengerId (not needed for ML).


In [None]:
# 🔥 Full Summary in One Shot
# 1️⃣ train_test_split → Splits the data into 80% for training and 20% for testing (with test_size=0.2).
# 2️⃣ StandardScaler → Standardizes the feature values (zero mean, unit variance) so that they are all on the same scale.
# 3️⃣ LogisticRegression → The model class we import for binary classification.
# 4️⃣ DecisionTreeClassifier → The tree-based classification model class we import.
# 5️⃣ accuracy_score & classification_report → Used to check the model's performance.


# 💡 Simple Example Analogy
# 👉 Think of it as a cricket match:

# train_test_split → Separates the practice match from the real match.
# StandardScaler → Makes all the players practice at the same level.
# LogisticRegression → Makes a simple decision (Win/Loss) based on stats.
# DecisionTreeClassifier → Makes decisions step by step (using if-else logic).
# accuracy_score → How many predictions were correct, like a batting average.
# classification_report → Gives detailed stats, like strike rate, boundaries, wickets.

In [None]:
# 🚀 Next Step: Model Training
# Now, we will:

# Split the data into training & testing sets.
# Train Logistic Regression & Decision Tree models.
# Evaluate their performance.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Show which columns remain after cleaning and feature engineering.
print(train_df.columns)

# Define features (x) and target (y).
# x holds every column except "Survived"; y is the column to predict
# (1 = survived, 0 = did not survive).
x = train_df.drop(columns=["Survived"])
y = train_df["Survived"]

# Train/validation split: 20% of the rows are held out for validation.
# A fixed random_state makes the split (and therefore the reported scores)
# reproducible from run to run.
X_train, X_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)

# Standardize the features (used only by logistic regression).
# The scaler is fitted on the training split only and then applied to the
# validation split, so no validation statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
x_val_scaled = scaler.transform(X_val)

# Train the logistic regression model on the scaled features.
log_reg = LogisticRegression()
log_reg.fit(X_train_scaled, y_train)

# Train the decision tree model; max_depth=5 limits overfitting.
# The tree is fitted on the UNSCALED features because every later
# dtree.predict(...) call in this notebook passes unscaled data. Fitting on
# X_train_scaled would make the learned split thresholds meaningless for
# those inputs.
dtree = DecisionTreeClassifier(max_depth=5, random_state=42)
dtree.fit(X_train, y_train)

# Predict on the validation split, giving each model the same representation
# it was trained on.
log_reg_preds = log_reg.predict(x_val_scaled)
dtree_preds = dtree.predict(X_val)

# Accuracy: the fraction of validation rows predicted correctly.
log_reg_acc = accuracy_score(y_val, log_reg_preds)
dtree_acc = accuracy_score(y_val, dtree_preds)

# Detailed per-class evaluation reports.
log_reg_report = classification_report(y_val, log_reg_preds)
dtree_report = classification_report(y_val, dtree_preds)

print("Logistic Regression Accuracy:", log_reg_acc)
print("Decision Tree Accuracy:", dtree_acc)
print("Logistic Regression Report:\n", log_reg_report)
print("Decision Tree Report:\n", dtree_report)

# 🔥 Full Summary
# 1️⃣ Feature & Target Select → X = train_df.drop("Survived"), y = train_df["Survived"]
# 2️⃣ Train-Test Split → train_test_split(X, y, test_size=0.2)
# 3️⃣ Scaling (Only for Logistic Regression) → StandardScaler()
# 4️⃣ Logistic Regression Train → log_reg.fit(X_train_scaled, y_train)
# 5️⃣ Decision Tree Train → dtree.fit(X_train, y_train)
# 6️⃣ Predictions → log_reg.predict(), dtree.predict()
# 7️⃣ Accuracy Check → accuracy_score()
# 8️⃣ Detailed Report → classification_report()









In [None]:
# Load the test dataset (re-read here so this cell can be re-run on its own).
test_df = pd.read_csv(test_path)

# Apply the same preprocessing to the test data as to the training data,
# always using statistics computed from the TRAINING data.
# Results are assigned back rather than using fillna(inplace=True) on a column
# selection, which is deprecated in pandas 2.x and is a no-op under
# Copy-on-Write.
test_df["Age"] = test_df["Age"].fillna(train_df["Age"].median())
test_df["Fare"] = test_df["Fare"].fillna(train_df["Fare"].median())

# Convert categorical variables with the same encodings used for training.
test_df["Sex"] = test_df["Sex"].map({"male": 0, "female": 1})
# Embarked is encoded BEFORE filling: train_df["Embarked"] was already mapped
# to integer codes by the feature-engineering cell, so its mode is a code and
# not a letter. Filling the raw string column with that code and mapping
# afterwards would turn every filled row into NaN.
test_df["Embarked"] = (
    test_df["Embarked"]
    .map({"S": 0, "C": 1, "Q": 2})
    .fillna(train_df["Embarked"].mode()[0])
)

# Create the Familysize feature (siblings/spouses + parents/children aboard).
test_df["Familysize"] = test_df["SibSp"] + test_df["Parch"]

# Drop unnecessary columns.
test_ids = test_df["PassengerId"]  # Save PassengerId for final submission
test_df = test_df.drop(columns=["PassengerId", "Name", "Cabin", "Ticket"])

# Put the columns in exactly the order the models were trained on, so the
# scaler and both models line the features up correctly.
test_df = test_df[X_train.columns]

# Standardize test features for Logistic Regression (scaler fitted on train).
test_scaled = scaler.transform(test_df)

# Predict using Logistic Regression (scaled features).
log_reg_preds_test = log_reg.predict(test_scaled)

# Predict using Decision Tree (unscaled features).
dtree_preds_test = dtree.predict(test_df)

# Create submission dataframes.
submission_log_reg = pd.DataFrame({"PassengerId": test_ids, "Survived": log_reg_preds_test})
submission_dtree = pd.DataFrame({"PassengerId": test_ids, "Survived": dtree_preds_test})

# Display the first few rows of the predictions.
submission_log_reg.head(), submission_dtree.head()
