<a href="https://colab.research.google.com/github/Olanle/Project-004-Titanic-survival-prediction-with-preprocessing-pipeline-/blob/main/004.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
pip install kaggle




In [5]:
import pandas as pd

data = pd.read_csv("/content/Titanic-Dataset.csv")
print(data.head())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [6]:
print("Dataset shape:", data.shape)  # rows, columns

print("\nDataset Info:")
print(data.info())

print("\nSummary Statistics:")
print(data.describe())

print("\nMissing Values per Column:")
print(data.isnull().sum())

print("\nColumns in Dataset:")
print(data.columns)


Dataset shape: (891, 12)

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None

Summary Statistics:
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.30

In [7]:
#Feature Engineering

#Extract Title from Name
data['Title'] = data['Name'].str.extract(r",\s*([^\.]+)\.", expand=False).str.strip()

#Simplify rare titles
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
               'Rev', 'Sir', 'Jonkheer', 'Dona']
data['Title'] = data['Title'].replace(['Mlle', 'Ms'], 'Miss')
data['Title'] = data['Title'].replace('Mme', 'Mrs')
data['Title'] = data['Title'].replace(rare_titles, 'Rare')

#Create FamilySize
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

#Create IsAlone
data['IsAlone'] = (data['FamilySize'] == 1).astype(int)

#Extract CabinLetter
data['CabinLetter'] = data['Cabin'].fillna('Unknown').astype(str).str[0]

#Fare per person
data['FarePerPerson'] = data['Fare'] / data['FamilySize']

#Preview updated dataset
print("Feature engineering completed!")
print(data[['Name','Title','FamilySize','IsAlone','Cabin','CabinLetter','Fare','FarePerPerson']].head())


Feature engineering completed!
                                                Name Title  FamilySize  \
0                            Braund, Mr. Owen Harris    Mr           2   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...   Mrs           2   
2                             Heikkinen, Miss. Laina  Miss           1   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)   Mrs           2   
4                           Allen, Mr. William Henry    Mr           1   

   IsAlone Cabin CabinLetter     Fare  FarePerPerson  
0        0   NaN           U   7.2500        3.62500  
1        0   C85           C  71.2833       35.64165  
2        1   NaN           U   7.9250        7.92500  
3        0  C123           C  53.1000       26.55000  
4        1   NaN           U   8.0500        8.05000  


In [8]:
#Define numeric and categorical columns
numeric_features = ['Age', 'Fare', 'FamilySize', 'FarePerPerson']
categorical_features = ['Sex', 'Pclass', 'Embarked', 'Title', 'IsAlone', 'CabinLetter']

#Numeric transformer: fill missing with median, then scale
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

#Categorical transformer: fill missing with most frequent, then one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

#Combine transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

#Full pipeline: preprocessing + RandomForest model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=200, random_state=42))
])

print("Preprocessing pipeline created successfully!")

Preprocessing pipeline created successfully!


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

#Define features (X) and target (y)
X = data.drop(columns=['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin'])
y = data['Survived']

#Split into train and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

#Fit the pipeline on training data
pipeline.fit(X_train, y_train)

#Predict on validation set
y_pred = pipeline.predict(X_val)

#Evaluate performance
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.4f}\n")

print("Classification Report:")
print(classification_report(y_val, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))


Validation Accuracy: 0.7877

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.85      0.83       110
           1       0.75      0.68      0.71        69

    accuracy                           0.79       179
   macro avg       0.78      0.77      0.77       179
weighted avg       0.79      0.79      0.79       179

Confusion Matrix:
[[94 16]
 [22 47]]


In [10]:
from sklearn.model_selection import cross_val_score

# Perform 5-fold cross-validation
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')

# Display the results
print("Cross-Validation Accuracy Scores:", cv_scores)
print(f"Mean CV Accuracy: {cv_scores.mean():.4f}")
print(f"Standard Deviation: {cv_scores.std():.4f}")


Cross-Validation Accuracy Scores: [0.79888268 0.79775281 0.84269663 0.75842697 0.83146067]
Mean CV Accuracy: 0.8058
Standard Deviation: 0.0296


In [11]:
import joblib
import pandas as pd

#Save the trained pipeline
joblib.dump(pipeline, 'titanic_pipeline_model.joblib')
print("Trained model pipeline saved as 'titanic_pipeline_model.joblib'")

#Save evaluation metrics
metrics = {
    'accuracy': [accuracy],
}

metrics_df = pd.DataFrame(metrics)
metrics_df.to_csv('metrics.csv', index=False)
print("Validation metrics saved as 'metrics.csv'")

Trained model pipeline saved as 'titanic_pipeline_model.joblib'
Validation metrics saved as 'metrics.csv'
