# In [None]:  (notebook cell marker left over from the export)
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

import matplotlib.pyplot as plt
import seaborn as sns


# Load the Titanic train and test datasets.
# NOTE(review): both paths currently point at the same CSV, so `test` is a
# second copy of the labelled training data. If a separate, unlabelled test
# file exists, point TEST_CSV at it -- TODO confirm.
TRAIN_CSV = "/kaggle/input/titanic-dataset/Titanic-Dataset.csv"
TEST_CSV = "/kaggle/input/titanic-dataset/Titanic-Dataset.csv"

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

# .shape is a (number of rows, number of columns) tuple.
for label, frame in (("train", train), ("test", test)):
    print(f"{label} shape:", frame.shape)

# Column names, non-null counts and dtypes of the training frame.
train.info()

# Number of missing values in each column.
missing_per_column = train.isnull().sum()
print("\nMissing values (train):\n", missing_per_column)

# Share of each class in the target column 'Survived'.
# normalize=True returns proportions instead of raw counts, which makes
# class imbalance easy to spot.
survival_share = train['Survived'].value_counts(normalize=True)
print("\nSurvival value counts:\n", survival_share)

#VISUALISE SURVIVAL COUNTS
# Bar chart of how many rows fall into each 'Survived' value.
sns.countplot(x='Survived', data=train)
plt.title('Survived counts')
plt.show()

#VISUALISE SURVIVAL RATE BY GENDER
# One bar per 'Sex', split by 'Survived'. The figure is titled and shown here
# so the next plot starts on fresh axes instead of being drawn over this one.
sns.countplot(x='Sex', hue='Survived', data=train)
plt.title('Survived counts by Sex')
plt.show()

#AGE DISTRIBUTION BY PASSENGER CLASS
# Box plot of 'Age' for each 'Pclass' (was `sns,boxplot`, a comma typo that
# raised NameError).
sns.boxplot(x='Pclass', y='Age', data=train)
plt.title('Age by Pclass')
plt.show()

#DATA PREPROCESSING - FEATURE ENGINEERING
# Step 3 - simple cleaning
# Work on copies so the raw frames stay untouched.
train2 = train.copy()
test2 = test.copy()

# Save PassengerId for final submission
test_passenger_ids = test2['PassengerId']

# Fill common missing values (simple baseline).
# The result is assigned back to the column: calling fillna(inplace=True) on
# a column selection is a chained in-place update, which pandas deprecates and
# which does not modify the parent frame when Copy-on-Write is enabled.
train2['Age'] = train2['Age'].fillna(train2['Age'].median())
test2['Age'] = test2['Age'].fillna(test2['Age'].median())

# Use the most frequent training port for both frames. A missing 'Embarked'
# would otherwise become an all-zero dummy row later, which is
# indistinguishable from the category dropped by drop_first=True.
embarked_mode = train2['Embarked'].mode()[0]
train2['Embarked'] = train2['Embarked'].fillna(embarked_mode)
test2['Embarked'] = test2['Embarked'].fillna(embarked_mode)

test2['Fare'] = test2['Fare'].fillna(test2['Fare'].median())

# Drop Cabin (lots missing). Name/Ticket/PassengerId are still needed below
# and are dropped later, just before the model input is built.
train2 = train2.drop(columns=['Cabin'])
test2 = test2.drop(columns=['Cabin'])

# Stack both frames so every engineered feature is computed identically for
# train and test. The 'dataset' column records where each row came from.
# sort=False keeps the column order; ignore_index=True gives the combined
# frame a unique 0..n-1 index instead of repeating each input's labels.
full = pd.concat(
    [train2.assign(dataset='train'), test2.assign(dataset='test')],
    sort=False,
    ignore_index=True,
)

# Title = the text between the first comma and the following period of 'Name'.
# Goal: capture the social title, which is correlated with age, gender, and
# survival odds.
full['Title'] = full['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())

# Map rare titles to common groups.
# Prevents very small categories that could confuse the model.
# Because the extraction keeps everything before the first period, a name
# written like "..., the Countess. of ..." yields 'the Countess', so that
# spelling is mapped as well as the bare 'Countess'.
title_map = {
    'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs',
    'Lady': 'Rare', 'Countess': 'Rare', 'the Countess': 'Rare',
    'Capt': 'Rare', 'Col': 'Rare', 'Don': 'Rare', 'Dr': 'Rare', 'Major': 'Rare',
    'Rev': 'Rare', 'Sir': 'Rare', 'Jonkheer': 'Rare', 'Dona': 'Rare',
}
full['Title'] = full['Title'].replace(title_map)

# SibSp = siblings/spouses aboard, Parch = parents/children aboard.
# +1 includes the passenger themselves.
full['FamilySize'] = full['SibSp'] + full['Parch'] + 1
# IsAlone = binary flag: 1 if FamilySize is exactly 1, else 0.
full['IsAlone'] = (full['FamilySize'] == 1).astype(int)

# Fill missing ages with the median age of passengers sharing the same Title.
# This only has an effect on rows whose 'Age' is still NaN at this point.
full['Age'] = full.groupby('Title')['Age'].transform(lambda x: x.fillna(x.median()))

# Same idea for 'Fare', using the median fare within each 'Pclass'.
full['Fare'] = full.groupby('Pclass')['Fare'].transform(lambda x: x.fillna(x.median()))

# Turn each category into binary indicator columns.
# drop_first=True avoids multicollinearity by dropping one category per set.
# Example: Sex -> only Sex_male (female implied when it is 0/False).
full = pd.get_dummies(full, columns=['Sex', 'Embarked', 'Title'], drop_first=True)

# Remember which rows came from the training frame BEFORE the marker column
# is dropped. Splitting on this marker (instead of on whether 'Survived' is
# null) stays correct even when the test frame carries a 'Survived' column;
# in that case a null-based split would count test rows as training rows and
# leave test_fe empty.
is_train_row = (full['dataset'] == 'train').to_numpy()

# Drop unused non-numeric columns.
full.drop(['Name', 'Ticket', 'PassengerId', 'dataset'], axis=1, inplace=True)

# train_fe: rows that originated from the training set (keeps 'Survived').
# test_fe: rows that originated from the test set; 'Survived' is dropped
# because it is the value to predict.
# (The previous train_fe line nested one .loc inside another and passed a
# DataFrame as the indexer, which is not a valid row selection.)
train_fe = full.loc[is_train_row].copy()
test_fe = full.loc[~is_train_row].drop('Survived', axis=1).copy()

# X: feature columns (everything except the target).
# y: the target column 'Survived' as integers.
X = train_fe.drop('Survived', axis=1)
y = train_fe['Survived'].astype(int)

print("X shape:", X.shape)
print("test_fe shape:", test_fe.shape)

#ACTUAL MODEL TRAINING
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import  accuracy_score, classification_report, confusion_matrix

# Hold out 20% of the labelled rows for validation. stratify=y keeps the class
# proportions of y in both parts; random_state pins the shuffle.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# LOGISTIC REGRESSION
lr = LogisticRegression(random_state=42, max_iter=1000)
lr.fit(X_train, y_train)
lr_val_pred = lr.predict(X_val)
print("LR val acc:", accuracy_score(y_val, lr_val_pred))

# RANDOM FOREST
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)
yhat = rf.predict(X_val)

# Validation accuracy plus the per-class precision/recall/F1 report.
rf_val_acc = accuracy_score(y_val, yhat)
print("RF val acc:", rf_val_acc)
print(classification_report(y_val, yhat))

#CROSS VALIDATION
from sklearn.model_selection import cross_val_score
# Purpose: check how well the random forest generalises (helps detect
# overfitting/underfitting) instead of relying on a single train/validation split.
print("RF CV mean:", cross_val_score(rf, X, y, cv=5).mean())
# cross_val_score → Performs cross-validation.
# (This explanatory line was previously missing its leading '#', which made
# the file a SyntaxError.)
#
# rf → The Random Forest estimator. cross_val_score fits a fresh copy of it on
#      each fold, so the fit done earlier on X_train is not reused here.
# X → Features (input data without the target).
# y → Target variable (Survived in Titanic).
# cv=5 → 5-fold cross-validation:
#    The dataset is split into 5 parts.
#    The model trains on 4 parts and tests on the remaining 1 part.
#    This repeats 5 times so every part gets tested once.
# .mean() → Averages the 5 fold scores, giving a more reliable accuracy
#    estimate than a single train-test split.

# Feature importances of the fitted random forest, highest first.
# (The original line ended with an extra ')' and did not parse.)
importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
# rf.feature_importances_ → importance score per feature, available after fitting.
# pd.Series(..., index=X.columns) → makes a pandas Series where:
#    Values = importance scores from the model
#    Index = feature names
# .sort_values(ascending=False) → sorts so the most important feature is at the top.

# Give the test features exactly the columns of X, in the same order.
# Any column absent from test_fe is created and filled with 0.
test_fe_aligned = test_fe.reindex(columns=X.columns, fill_value=0)

# Train the final model on all labelled rows (no hold-out this time).
final_model = RandomForestClassifier(random_state=42, n_estimators=200)
final_model.fit(X, y)

# Predicted class for every test row.
pred_test = final_model.predict(test_fe_aligned)

# Final result: one row per passenger id with the predicted 'Survived' value.
submission_columns = {
    'PassengerId': test_passenger_ids,
    'Survived': pred_test.astype(int),
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)
print("Saved submission.csv — upload or submit from Kaggle notebook UI")




