# Titanic Survival Prediction
This notebook builds an end-to-end ML pipeline (preprocessing + RandomForest) to predict passenger survival.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report




In [None]:
# Locations of the competition CSV files inside the Kaggle input directory
train_csv = '/kaggle/input/titanic/train.csv'
test_csv = '/kaggle/input/titanic/test.csv'

# Read both files into DataFrames
train_data = pd.read_csv(train_csv)
test_data = pd.read_csv(test_csv)

# Preview the first rows of the training frame
train_data.head()


In [None]:
# Print a concise summary of the training frame (DataFrame.info: column
# names, non-null counts and dtypes) to spot columns with missing values.
train_data.info()


In [None]:
# Descriptive statistics of the training frame (DataFrame.describe with its
# defaults, which summarise only the numeric columns of a mixed-type frame).
train_data.describe()


In [None]:
# Target: the 'Survived' column; features: every other column of the training frame
y = train_data['Survived']
X = train_data.drop(columns=['Survived'])


In [None]:
# Identify column types
# Numeric features: any numeric dtype, not only int64/float64.
numerical_col = X.select_dtypes(include='number').columns.tolist()

# Text features: accept both the classic `object` dtype and pandas' dedicated
# string dtype, so text columns are not silently missed when the frame was
# loaded with string inference enabled.
text_col = X.select_dtypes(include=['object', 'string']).columns

# One-hot encoding creates one output column per distinct value, so only
# low-cardinality text columns are kept. Free-text or identifier-like columns
# (close to one distinct value per row) would otherwise explode into roughly
# one dummy column per row and cannot generalise to unseen rows.
# Columns that end up in neither list are not handed to any transformer.
max_onehot_cardinality = 10
categorical_col = [col for col in text_col
                   if X[col].nunique() <= max_onehot_cardinality]

# NOTE(review): integer identifier columns (e.g. a running row/passenger id)
# are still picked up as numeric features here - consider excluding them
# explicitly once confirmed against the data dictionary.

In [None]:
# Split the data: hold out 20% of the rows for validation.
# `stratify=y` keeps the class proportions of the target the same in the
# training and validation parts, so the hold-out accuracy is not skewed by an
# unlucky draw and is consistent with the stratified folds that
# cross_val_score uses for classifiers. `random_state` fixes the shuffle so
# the split is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:
# Preprocessing
# Numeric columns: replace missing values with the column median.
numerical_transformer = SimpleImputer(strategy='median')

# Categorical columns: replace missing values with the most frequent value,
# then one-hot encode; categories not seen during fit are ignored at
# transform time instead of raising an error.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each list of columns to its own transformer.
preprocessing = ColumnTransformer(
    transformers=[
        ('num_col', numerical_transformer, numerical_col),
        ('cat_col', categorical_transformer, categorical_col),
    ]
)
                                 


In [None]:
# Model + pipeline
# Random forest with 500 trees, a fixed seed, and n_jobs=-1 (use all cores).
model = RandomForestClassifier(
    n_estimators=500,
    random_state=42,
    n_jobs=-1,
)

# Chain preprocessing and the classifier so they are fitted and applied as
# one estimator.
my_pipeline = Pipeline(
    steps=[
        ('prepro', preprocessing),
        ('model', model),
    ]
)

In [None]:
# Cross-validation
# Score the whole pipeline (preprocessing + model) with 5-fold CV on the full
# labelled data, using accuracy as the metric.
cross_validation = cross_val_score(
    estimator=my_pipeline,
    X=X,
    y=y,
    scoring='accuracy',
    cv=5,
)

# Mean accuracy over the folds
cross_validation.mean()

In [None]:
# Train the model: fit preprocessing and classifier on the training split only
my_pipeline.fit(X=X_train, y=y_train)

In [None]:
# Test the model on the held-out validation split
preds = my_pipeline.predict(X_valid)

# Overall accuracy on the validation rows
valid_accuracy = accuracy_score(y_valid, preds)
print("Accuracy:", valid_accuracy)

# Per-class precision / recall / F1 report
valid_report = classification_report(y_valid, preds)
print(valid_report)