<a href="https://colab.research.google.com/github/Rohanraj1330/codsoft/blob/main/TITANIC_SURVIVAL_PREDICTION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from google.colab import files
import io
import os # Import os module for path manipulation

# --- Step 1: Upload the necessary datasets ---
# A file picker opens; select both 'train.csv' and 'test.csv' in the same
# dialog. The result is used below as a mapping of filename -> raw file bytes.
print("Please upload 'train.csv' and 'test.csv' when prompted.")
uploaded = files.upload()

# --- Step 2: Locate the uploaded datasets ---
# Colab may store a repeated upload under a name such as 'train (1).csv'
# (counter placed *before* the extension). A plain startswith('train.csv')
# test cannot match that form, so the name is compared piece by piece.


def _matches_upload(uploaded_name, wanted_name):
    """Return True if *uploaded_name* is *wanted_name* or a numbered copy of it.

    Accepted forms for wanted_name == 'train.csv':
      * anything starting with 'train.csv' (the original, looser rule);
      * 'train (N).csv' where N is one or more digits.
    """
    if uploaded_name.startswith(wanted_name):
        return True

    wanted_stem, wanted_ext = os.path.splitext(wanted_name)
    stem, ext = os.path.splitext(uploaded_name)
    if ext.lower() != wanted_ext.lower():
        return False

    numbered_prefix = wanted_stem + ' ('
    return (
        stem.startswith(numbered_prefix)
        and stem.endswith(')')
        and stem[len(numbered_prefix):-1].isdigit()
    )


train_file_name = None
test_file_name = None

# If several uploaded names match, the last one seen wins (same as before).
for name in uploaded:
    if _matches_upload(name, 'train.csv'):
        train_file_name = name
    if _matches_upload(name, 'test.csv'):
        test_file_name = name

if train_file_name is None or test_file_name is None:
    print("Error: Could not find 'train.csv' or 'test.csv' among uploaded files. Please re-run the cell and upload both files correctly.")
else:
    # --- Step 2 (cont.): Parse the uploaded bytes into DataFrames ---
    try:
        train_df = pd.read_csv(io.BytesIO(uploaded[train_file_name]))
        test_df = pd.read_csv(io.BytesIO(uploaded[test_file_name]))
    except Exception as e:
        print(f"Error loading datasets: {e}")
        # Re-raise so the cell stops with a traceback. Calling exit() inside a
        # notebook asks the kernel to shut down instead of reporting the error.
        raise
    print(f"\nDatasets loaded successfully! (train_df from {train_file_name}, test_df from {test_file_name})")

    # --- Step 3: Initial Data Exploration and Preprocessing for Training Data ---
    print("\n--- Training Data Initial Info ---")
    print(train_df.head())
    print("\n")
    # DataFrame.info() writes its report itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None" line).
    train_df.info()
    print("\nMissing values in training data before imputation:")
    print(train_df.isnull().sum())

    print("\nSurvival rate by Gender:")
    print(train_df.groupby('Sex')['Survived'].mean())
    print("\nSurvival rate by Pclass:")
    print(train_df.groupby('Pclass')['Survived'].mean())

    # --- Step 4: Feature Engineering and Cleaning ---

    # Impute by assigning the filled column back. The chained form
    # df[col].fillna(..., inplace=True) operates on a temporary object and
    # stops updating the frame in pandas 3.0.
    # NOTE(review): each set is imputed with its own median; consider reusing
    # the training-set statistics for the test set.
    train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())
    test_df['Age'] = test_df['Age'].fillna(test_df['Age'].median())

    # Missing 'Embarked' in the training set gets the most frequent port.
    train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])

    # Missing 'Fare' in the test set gets the test-set median.
    test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())

    # Keep the PassengerIds for the submission file before the column is dropped.
    test_passenger_ids = test_df['PassengerId']

    # Columns not used as model features.
    unused_columns = ['Cabin', 'Ticket', 'Name', 'PassengerId']
    train_df = train_df.drop(columns=unused_columns)
    test_df = test_df.drop(columns=unused_columns)

    # Encode 'Sex' numerically (0 for male, 1 for female).
    sex_codes = {'male': 0, 'female': 1}
    train_df['Sex'] = train_df['Sex'].map(sex_codes)
    test_df['Sex'] = test_df['Sex'].map(sex_codes)

    # One-hot encode 'Embarked'; drop_first removes one redundant indicator.
    train_df = pd.get_dummies(train_df, columns=['Embarked'], drop_first=True, dtype=int)
    test_df = pd.get_dummies(test_df, columns=['Embarked'], drop_first=True, dtype=int)

    # --- Step 5: Define Features (X) and Target (y) and Split Data ---
    X = train_df.drop('Survived', axis=1)
    y = train_df['Survived']
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # --- Step 6: Align test columns with the training columns ---
    # reindex puts the test frame in exactly the training column order, adds
    # any indicator column the test set lacks (filled with 0), and drops any
    # column the model was never trained on. This avoids assigning new columns
    # onto the train/validation splits.
    test_df = test_df.reindex(columns=X_train.columns, fill_value=0)

    print("\nColumns after alignment:")
    print("X_train columns:", X_train.columns.tolist())
    print("test_df columns:", test_df.columns.tolist())
    print(f"Are X_train and test_df columns identical? {X_train.columns.equals(test_df.columns)}")

    print("\n--- Model Training ---")
    print(f"X_train shape: {X_train.shape}")
    print(f"X_val shape: {X_val.shape}")
    print(f"y_train shape: {y_train.shape}")
    print(f"y_val shape: {y_val.shape}")

    # Sanity check: the estimator cannot be fitted on NaN values.
    print("\nMissing values in X_train before model training:")
    print(X_train.isnull().sum().sum())
    print("Missing values in X_val before model training:")
    print(X_val.isnull().sum().sum())
    print("Missing values in test_df before model prediction:")
    print(test_df.isnull().sum().sum())

    # --- Step 7: Train the Logistic Regression Model ---
    model = LogisticRegression(max_iter=1000, solver='liblinear', random_state=42)
    model.fit(X_train, y_train)

    # --- Step 8: Evaluate the Model on the Validation Set ---
    print("\n--- Model Evaluation on Validation Set ---")
    y_pred = model.predict(X_val)

    print("Accuracy:", accuracy_score(y_val, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_val, y_pred))

    # --- Step 9: Make Predictions on the Test Dataset ---
    print("\n--- Making Predictions on Test Data ---")
    test_predictions = model.predict(test_df)

    # --- Step 10: Prepare and Save Submission File ---
    submission = pd.DataFrame({
        'PassengerId': test_passenger_ids,
        'Survived': test_predictions,
    })

    submission.to_csv('titanic_submission.csv', index=False)
    print("\nSubmission file 'titanic_submission.csv' created successfully!")

    # To download the submission file from Colab, run:
    # files.download('titanic_submission.csv')

Please upload 'train.csv' and 'test.csv' when prompted.


Saving gender_submission.csv to gender_submission.csv
Saving test.csv to test.csv
Saving train.csv to train.csv

Datasets loaded successfully! (train_df from train.csv, test_df from test.csv)

--- Training Data Initial Info ---
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   N

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Age'].fillna(train_df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Age'].fillna(test_df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object 