<a href="https://colab.research.google.com/github/PravallikaSomisetti/CODSOFT/blob/main/titanic_survival_prediction_codsoft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# --- 1. Load the Dataset ---
# Assuming 'train.csv' and 'test.csv' are in the same directory as this script.
try:
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
    # Store PassengerId for submission later
    passenger_ids = test_df['PassengerId']
    print("Datasets loaded successfully.")
except FileNotFoundError:
    print("Error: Make sure 'train.csv' and 'test.csv' are in the correct directory.")
    print("You can download them from the Kaggle Titanic competition page.")
    # Continue execution, but subsequent steps will likely fail without dataframes
    print("Continuing execution without loading dataframes.")


# --- 2. Exploratory Data Analysis (EDA) ---
print("\n--- Training Data Info ---")
train_df.info()
print("\n--- Test Data Info ---")
test_df.info()

print("\n--- Training Data Head ---")
print(train_df.head())

print("\n--- Missing Values in Training Data ---")
print(train_df.isnull().sum())

print("\n--- Missing Values in Test Data ---")
print(test_df.isnull().sum())

print("\n--- Survival Rate ---")
print(train_df['Survived'].value_counts(normalize=True))

# Basic visualizations (uncomment to display)
# plt.figure(figsize=(10, 6))
# sns.countplot(x='Survived', data=train_df)
# plt.title('Survival Count')
# plt.show()

# plt.figure(figsize=(10, 6))
# sns.histplot(train_df['Age'].dropna(), bins=30, kde=True)
# plt.title('Age Distribution')
# plt.show()

# plt.figure(figsize=(12, 7))
# sns.heatmap(train_df.corr(numeric_only=True), annot=True, cmap='coolwarm', fmt=".2f")
# plt.title('Correlation Matrix (Training Data)')
# plt.show()


# --- 3. Feature Engineering ---
def feature_engineer(df):
    # Create 'FamilySize' feature
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

    # Create 'IsAlone' feature
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # Extract 'Title' from 'Name'
    df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    # Group rare titles
    df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
        'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')

    # Create 'FarePerPerson'
    df['FarePerPerson'] = df['Fare'] / df['FamilySize']
    # Handle potential division by zero if FamilySize can be 0 (though not in this dataset usually)
    df['FarePerPerson'].replace([np.inf, -np.inf], 0, inplace=True) # Replace inf with 0

    # Drop original Name, SibSp, Parch, Ticket, Cabin (Cabin too many missing)
    df = df.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin'], axis=1)
    return df

train_df = feature_engineer(train_df)
test_df = feature_engineer(test_df)

print("\n--- Training Data after Feature Engineering ---")
print(train_df.head())


# --- 4. Data Preprocessing ---

# Separate target variable from training data
X = train_df.drop('Survived', axis=1)
y = train_df['Survived']

# Identify categorical and numerical features
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(include='object').columns.tolist()

# Remove 'PassengerId' from numerical features as it's just an identifier
if 'PassengerId' in numerical_features:
    numerical_features.remove('PassengerId')

print(f"\nNumerical features: {numerical_features}")
print(f"Categorical features: {categorical_features}")

# Create preprocessing pipelines for numerical and categorical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')), # Impute missing numerical values with mean
    ('scaler', StandardScaler())                 # Scale numerical features
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # Impute missing categorical values with most frequent
    ('onehot', OneHotEncoder(handle_unknown='ignore'))   # One-hot encode categorical features
])

# Create a preprocessor using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough' # Keep other columns (like PassengerId for test, though it will be dropped later)
)

# --- 5. Model Training ---

# Define the model
# You can try other models like LogisticRegression, GradientBoostingClassifier, etc.
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))])
                        # ('classifier', LogisticRegression(solver='liblinear', random_state=42))])
                        # ('classifier', GradientBoostingClassifier(random_state=42))])

# Split training data for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print("\n--- Training Model ---")
model.fit(X_train, y_train)
print("Model training complete.")

# Evaluate the model on the validation set
y_pred_val = model.predict(X_val)
print(f"\nValidation Accuracy: {accuracy_score(y_val, y_pred_val):.4f}")
print("\nValidation Classification Report:")
print(classification_report(y_val, y_pred_val))
print("\nValidation Confusion Matrix:")
print(confusion_matrix(y_val, y_pred_val))

# Optional: Cross-validation for a more robust evaluation
print("\n--- Performing Cross-Validation ---")
cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.4f}")
print(f"Std CV Accuracy: {cv_scores.std():.4f}")

# --- Optional: Hyperparameter Tuning with GridSearchCV ---
# This can take a long time to run, uncomment if you want to perform tuning
# param_grid = {
#     'classifier__n_estimators': [50, 100, 200],
#     'classifier__max_depth': [None, 10, 20, 30],
#     'classifier__min_samples_split': [2, 5, 10]
# }
# grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
# print("\n--- Starting GridSearchCV for Hyperparameter Tuning ---")
# grid_search.fit(X, y)
# print(f"\nBest Parameters found: {grid_search.best_params_}")
# print(f"Best CV Score: {grid_search.best_score_:.4f}")
# # If using GridSearchCV, update the model with the best estimator
# # model = grid_search.best_estimator_
# print("Hyperparameter tuning complete.")


# --- 6. Prediction and Submission ---

# Make predictions on the test set
# The test_df also needs to be preprocessed with the same steps
print("\n--- Making Predictions on Test Data ---")
test_predictions = model.predict(test_df)

# Create submission file in the format expected by Kaggle
submission_df = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': test_predictions})
submission_df.to_csv('submission.csv', index=False)

print("\nSubmission file 'submission.csv' created successfully!")
print("First 5 rows of the submission file:")
print(submission_df.head())

Datasets loaded successfully.

--- Training Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB

--- Test Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------    

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['FarePerPerson'].replace([np.inf, -np.inf], 0, inplace=True) # Replace inf with 0
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['FarePerPerson'].replace([np.inf, -np.inf], 0, inplace=True) # Replace inf with 0


Model training complete.

Validation Accuracy: 0.8436

Validation Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.88      0.87       105
           1       0.82      0.80      0.81        74

    accuracy                           0.84       179
   macro avg       0.84      0.84      0.84       179
weighted avg       0.84      0.84      0.84       179


Validation Confusion Matrix:
[[92 13]
 [15 59]]

--- Performing Cross-Validation ---
Cross-Validation Scores: [0.77653631 0.80898876 0.84831461 0.81460674 0.84269663]
Mean CV Accuracy: 0.8182
Std CV Accuracy: 0.0258

--- Making Predictions on Test Data ---

Submission file 'submission.csv' created successfully!
First 5 rows of the submission file:
   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         0
4          896         1
