In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline # <--- MOVED THIS IMPORT HERE!
import warnings

# Silence library warnings so the console output below stays readable.
# NOTE(review): this filter hides *all* warnings for the rest of the session,
# including convergence and deprecation notices -- consider narrowing it.
warnings.filterwarnings('ignore')

print("--- Titanic Survival Predictor Project ---")
print("1. Loading Data...")

# --- 1. Load Data ---
# Primary source: the raw CSV hosted on GitHub.
# Fallback: a 'titanic.csv' file in the current working directory.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
try:
    df = pd.read_csv(url)
    print("Data loaded successfully from URL.")
except Exception as e:
    # Deliberately broad: whatever goes wrong with the remote read (no network,
    # HTTP error, malformed payload), report it and try the local copy instead.
    print(f"Error loading data from URL: {e}")
    print("Please ensure you have an internet connection or download 'titanic.csv' manually.")
    print("Assuming 'titanic.csv' is in the same directory.")
    try:
        df = pd.read_csv('titanic.csv')
        print("Data loaded successfully from local file.")
    except FileNotFoundError:
        print("Error: 'titanic.csv' not found. Please place it in the same directory as the script.")
        # Nothing below can run without `df`, so stop here with a failing
        # status. `exit()` is an interactive convenience injected by `site`:
        # it reports status 0, is unavailable under `python -S`, and in
        # notebook kernels it may not stop the running cell immediately.
        raise SystemExit(1)

# Display basic info: first rows, then column dtypes and non-null counts.
print("\nDataset Head:")
print(df.head())
print("\nDataset Info:")
df.info()

print("\n2. Preprocessing Data...")

# --- 2. Preprocessing Data ---

# Column groups: each group is routed through its own preprocessing pipeline.
# Pclass is listed with the categorical columns, so it is one-hot encoded
# rather than scaled.
numerical_features = ['Age', 'Fare', 'SibSp', 'Parch']
categorical_features = ['Pclass', 'Sex', 'Embarked']

# Model inputs and label. 'PassengerId', 'Name', 'Ticket' and 'Cabin' are
# left out of the feature list on purpose.
target = 'Survived'
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

y = df[target]
X = df[features]

# Numerical columns: fill gaps with the column median, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Categories not seen during fitting are ignored at transform time
# instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each sub-pipeline to its own column group.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# --- 3. Split Data ---
print("3. Splitting data into training and testing sets...")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# partition reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the (rows, columns) of each feature split.
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

print("\n4. Training the Model...")

# --- 4. Create and Train the Model (with pipeline for preprocessing) ---
# The preprocessor and the classifier live in one Pipeline, so a single fit()
# call on the training split fits the imputers, scaler, encoder and the
# logistic regression together, and predict() later reuses those fitted
# transformers on new data.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42)),
]).fit(X_train, y_train)  # fit() returns the pipeline itself
print("Model trained successfully using Logistic Regression.")

print("\n5. Evaluating the Model...")

# --- 5. Evaluate the Model ---
# Predict on the held-out split, then derive all metrics from that single
# set of predictions.
y_pred = model_pipeline.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:", report, sep="\n")
print("\nConfusion Matrix:", conf_matrix, sep="\n")
# Legend for the 2x2 matrix above: rows are true labels, columns predictions.
print(
    "  (True Negative  False Positive)",
    "  (False Negative True Positive )",
    sep="\n",
)


print("\n--- Project Complete ---")
print("This simple model demonstrates data loading, preprocessing, training, and evaluation.")


--- Titanic Survival Predictor Project ---
1. Loading Data...
Data loaded successfully from URL.

Dataset Head:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        