In [5]:

import pandas as pd
import numpy as np
import sqlite3
import warnings
warnings.filterwarnings('ignore')

print("‚úÖ Libraries imported!")



‚úÖ Libraries imported!


In [6]:

df = pd.read_csv('data/train.csv')  # No ../ needed!

print(f"‚úÖ Data loaded: {len(df)} passengers")
print(f"\nColumns: {list(df.columns)}")
print(f"\nFirst 5 rows:")
df.head()

‚úÖ Data loaded: 891 passengers

Columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

First 5 rows:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# Cell 3: Quick Data Exploration
print("=== TITANIC DATA SUMMARY ===\n")
print(f"Total passengers: {len(df)}")
print(f"Survived: {df['Survived'].sum()} ({df['Survived'].mean()*100:.1f}%)")
print(f"Died: {len(df) - df['Survived'].sum()} ({(1-df['Survived'].mean())*100:.1f}%)")
print(f"\nMissing values:")
print(df.isnull().sum())

=== TITANIC DATA SUMMARY ===

Total passengers: 891
Survived: 342 (38.4%)
Died: 549 (61.6%)

Missing values:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [8]:

print("Creating normalized database...\n")
conn = sqlite3.connect('data/titanic.db')
cursor = conn.cursor()


cursor.execute("DROP TABLE IF EXISTS passengers")
cursor.execute("DROP TABLE IF EXISTS tickets")
cursor.execute("DROP TABLE IF EXISTS embarkation")


cursor.execute("""
CREATE TABLE passengers (
    passenger_id INTEGER PRIMARY KEY,
    name TEXT,
    age REAL,
    sex TEXT,
    survived INTEGER,
    ticket_id INTEGER,
    embark_id INTEGER,
    FOREIGN KEY (ticket_id) REFERENCES tickets(ticket_id),
    FOREIGN KEY (embark_id) REFERENCES embarkation(embark_id)
)
""")


cursor.execute("""
CREATE TABLE tickets (
    ticket_id INTEGER PRIMARY KEY,
    pclass INTEGER,
    fare REAL,
    ticket_number TEXT,
    siblings_spouses INTEGER,
    parents_children INTEGER
)
""")


cursor.execute("""
CREATE TABLE embarkation (
    embark_id INTEGER PRIMARY KEY,
    port_code TEXT,
    port_name TEXT
)
""")

print("‚úÖ Database schema created (3 tables)!")
conn.commit()
conn.close()

Creating normalized database...

‚úÖ Database schema created (3 tables)!


In [9]:

conn = sqlite3.connect('data/titanic.db')

df_clean = df.copy()

df_clean['Age'].fillna(df_clean['Age'].median(), inplace=True)

df_clean['Embarked'].fillna(df_clean['Embarked'].mode()[0], inplace=True)

df_clean['Fare'].fillna(df_clean['Fare'].median(), inplace=True)

df_clean['ticket_id'] = range(1, len(df_clean) + 1)
df_clean['embark_id'] = range(1, len(df_clean) + 1)

embarkation_map = {'C': 'Cherbourg', 'Q': 'Queenstown', 'S': 'Southampton'}
embark_data = df_clean[['embark_id', 'Embarked']].copy()
embark_data['port_name'] = embark_data['Embarked'].map(embarkation_map)
embark_data.columns = ['embark_id', 'port_code', 'port_name']
embark_data.to_sql('embarkation', conn, if_exists='replace', index=False)

tickets_data = df_clean[['ticket_id', 'Pclass', 'Fare', 'Ticket', 'SibSp', 'Parch']].copy()
tickets_data.columns = ['ticket_id', 'pclass', 'fare', 'ticket_number', 'siblings_spouses', 'parents_children']
tickets_data.to_sql('tickets', conn, if_exists='replace', index=False)

passengers_data = df_clean[['PassengerId', 'Name', 'Age', 'Sex', 'Survived', 'ticket_id', 'embark_id']].copy()
passengers_data.columns = ['passenger_id', 'name', 'age', 'sex', 'survived', 'ticket_id', 'embark_id']
passengers_data.to_sql('passengers', conn, if_exists='replace', index=False)

print("‚úÖ Data loaded into all 3 tables!")
print(f"   - Passengers: {len(passengers_data)} rows")
print(f"   - Tickets: {len(tickets_data)} rows")
print(f"   - Embarkation: {len(embark_data)} rows")

conn.close()


‚úÖ Data loaded into all 3 tables!
   - Passengers: 891 rows
   - Tickets: 891 rows
   - Embarkation: 891 rows


In [10]:

conn = sqlite3.connect('data/titanic.db')

query = """
SELECT 
    p.passenger_id,
    p.name,
    p.age,
    p.sex,
    p.survived,
    t.pclass,
    t.fare,
    t.siblings_spouses,
    t.parents_children,
    e.port_code,
    e.port_name
FROM passengers p
JOIN tickets t ON p.ticket_id = t.ticket_id
JOIN embarkation e ON p.embark_id = e.embark_id
"""

df_from_db = pd.read_sql(query, conn)

print("‚úÖ Data retrieved using SQL JOIN!")
print(f"   Retrieved {len(df_from_db)} rows with {len(df_from_db.columns)} columns")
print("\nFirst 5 rows:")
print(df_from_db.head())

conn.close()

‚úÖ Data retrieved using SQL JOIN!
   Retrieved 891 rows with 11 columns

First 5 rows:
   passenger_id                                               name   age  \
0             1                            Braund, Mr. Owen Harris  22.0   
1             2  Cumings, Mrs. John Bradley (Florence Briggs Th...  38.0   
2             3                             Heikkinen, Miss. Laina  26.0   
3             4       Futrelle, Mrs. Jacques Heath (Lily May Peel)  35.0   
4             5                           Allen, Mr. William Henry  35.0   

      sex  survived  pclass     fare  siblings_spouses  parents_children  \
0    male         0       3   7.2500                 1                 0   
1  female         1       1  71.2833                 1                 0   
2  female         1       3   7.9250                 0                 0   
3  female         1       1  53.1000                 1                 0   
4    male         0       3   8.0500                 0                 0   

In [11]:
import sys
!{sys.executable} -m pip install --upgrade typing_extensions --quiet
print("‚úÖ typing_extensions upgraded!")

‚úÖ typing_extensions upgraded!


In [12]:
import sys
!{sys.executable} -m pip install mlflow dagshub --quiet
print("‚úÖ MLflow and DagsHub installed!")

‚úÖ MLflow and DagsHub installed!


In [13]:
# Install missing libraries
import sys
!{sys.executable} -m pip install xgboost lightgbm --quiet

print("‚úÖ XGBoost and LightGBM installed!")
print("   Now run the experiment cell below!")

‚úÖ XGBoost and LightGBM installed!
   Now run the experiment cell below!


In [14]:
# ==========================================
# RUN ALL 16 EXPERIMENTS - TITANIC CLASSIFICATION
# This will take 30-60 minutes to complete
# ==========================================

print("="*80)
print("STARTING 16 ML EXPERIMENTS - TITANIC SURVIVAL CLASSIFICATION")
print("="*80)
print("\nThis will take 30-60 minutes. You can take a break!")
print("Results will be saved automatically.\n")

import time
from datetime import datetime
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, accuracy_score, classification_report
import joblib
import os

# Prepare results storage
results = []

# Prepare data for ML
X = df_from_db.copy()

# Encode categorical variables
from sklearn.preprocessing import LabelEncoder
le_sex = LabelEncoder()
X['sex_encoded'] = le_sex.fit_transform(X['sex'])

le_port = LabelEncoder()
X['port_encoded'] = le_port.fit_transform(X['port_code'])

# Select features
feature_columns = ['age', 'pclass', 'fare', 'siblings_spouses', 'parents_children', 'sex_encoded', 'port_encoded']
X_features = X[feature_columns]
y = X['survived']

# Split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")
print(f"Features: {feature_columns}")
print("="*80)

# Create models directory
os.makedirs('models', exist_ok=True)

# Experiment counter
experiment_num = 0
start_time_all = time.time()

# MAIN EXPERIMENT LOOP
for algorithm_name, algorithm in [
    ('LogisticRegression', LogisticRegression(random_state=42, max_iter=1000)),
    ('RandomForest', RandomForestClassifier(random_state=42, n_estimators=100)),
    ('GradientBoosting', GradientBoostingClassifier(random_state=42)),
    ('LightGBM', LGBMClassifier(random_state=42, verbose=-1))
]:
    
    for use_pca in [False, True]:
        for use_tuning in [False, True]:
            
            experiment_num += 1
            exp_name = f"{experiment_num:02d}_{algorithm_name}_PCA-{use_pca}_Tuning-{use_tuning}"
            
            print(f"\n{'='*80}")
            print(f"EXPERIMENT {experiment_num}/16: {exp_name}")
            print(f"{'='*80}")
            
            start_time = time.time()
            
            # Prepare data with scaling
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)
            
            # Apply PCA if needed
            if use_pca:
                pca = PCA(n_components=4, random_state=42)
                X_train_processed = pca.fit_transform(X_train_scaled)
                X_test_processed = pca.transform(X_test_scaled)
                print(f"   PCA: Reduced from {X_train.shape[1]} to {X_train_processed.shape[1]} features")
            else:
                X_train_processed = X_train_scaled
                X_test_processed = X_test_scaled
                print(f"   No PCA: Using all {X_train.shape[1]} features")
            
            # Hyperparameter tuning if needed
            if use_tuning:
                print("   Hyperparameter tuning: Running GridSearch...")
                
                # Simple param grids for reasonable speed
                if algorithm_name == 'LogisticRegression':
                    param_grid = {'C': [0.1, 1, 10]}
                elif algorithm_name == 'RandomForest':
                    param_grid = {'max_depth': [5, 10, None], 'min_samples_split': [2, 5]}
                elif algorithm_name == 'GradientBoosting':
                    param_grid = {'learning_rate': [0.01, 0.1], 'n_estimators': [50, 100]}
                else:  # LightGBM
                    param_grid = {'learning_rate': [0.01, 0.1], 'num_leaves': [31, 50]}
                
                grid_search = GridSearchCV(algorithm, param_grid, cv=3, scoring='f1', n_jobs=-1)
                grid_search.fit(X_train_processed, y_train)
                model = grid_search.best_estimator_
                print(f"   Best parameters: {grid_search.best_params_}")
            else:
                print("   No tuning: Using default parameters")
                model = algorithm
                model.fit(X_train_processed, y_train)
            
            # Cross-validation
            cv_scores = cross_val_score(model, X_train_processed, y_train, cv=3, scoring='f1')
            cv_f1 = cv_scores.mean()
            cv_std = cv_scores.std()
            
            # Test predictions
            y_pred = model.predict(X_test_processed)
            test_f1 = f1_score(y_test, y_pred)
            test_accuracy = accuracy_score(y_test, y_pred)
            
            elapsed = time.time() - start_time
            
            # Store results
            result = {
                'experiment': experiment_num,
                'name': exp_name,
                'algorithm': algorithm_name,
                'pca': use_pca,
                'tuning': use_tuning,
                'cv_f1_mean': cv_f1,
                'cv_f1_std': cv_std,
                'test_f1': test_f1,
                'test_accuracy': test_accuracy,
                'time_seconds': elapsed
            }
            results.append(result)
            
            print(f"   ‚úÖ CV F1-Score: {cv_f1:.4f} (¬±{cv_std:.4f})")
            print(f"   ‚úÖ Test F1-Score: {test_f1:.4f}")
            print(f"   ‚úÖ Test Accuracy: {test_accuracy:.4f}")
            print(f"   ‚è±Ô∏è  Time: {elapsed:.1f} seconds")
            
            # Save model
            model_filename = f"models/{exp_name}.pkl"
            joblib.dump(model, model_filename)
            print(f"   üíæ Model saved: {model_filename}")

total_time = time.time() - start_time_all

print("\n" + "="*80)
print("üéâ ALL 16 EXPERIMENTS COMPLETE! üéâ")
print("="*80)
print(f"Total time: {total_time/60:.1f} minutes\n")

# Create results DataFrame
results_df = pd.DataFrame(results)

# Display results table
print("üìä EXPERIMENT RESULTS:\n")
print(results_df[['experiment', 'algorithm', 'pca', 'tuning', 'test_f1', 'test_accuracy']].to_string(index=False))

# Find best model
best_idx = results_df['test_f1'].idxmax()
best_result = results_df.loc[best_idx]

print("\n" + "="*80)
print("üèÜ BEST MODEL:")
print("="*80)
print(f"   Experiment #{best_result['experiment']}: {best_result['name']}")
print(f"   Algorithm: {best_result['algorithm']}")
print(f"   PCA: {best_result['pca']}")
print(f"   Tuning: {best_result['tuning']}")
print(f"   Test F1-Score: {best_result['test_f1']:.4f}")
print(f"   Test Accuracy: {best_result['test_accuracy']:.4f}")
print("="*80)

# Save results to CSV
results_df.to_csv('experiment_results.csv', index=False)
print("\nüíæ Results saved to: experiment_results.csv")
print("   You can upload this to DagsHub!")

print("\n‚úÖ MILESTONE COMPLETE!")
print("   Next steps:")
print("   1. Update FastAPI for classification")
print("   2. Update Streamlit for classification")
print("   3. Redeploy to cloud")
print("   4. Prepare presentation")

STARTING 16 ML EXPERIMENTS - TITANIC SURVIVAL CLASSIFICATION

This will take 30-60 minutes. You can take a break!
Results will be saved automatically.

Training set: 712 samples
Test set: 179 samples
Features: ['age', 'pclass', 'fare', 'siblings_spouses', 'parents_children', 'sex_encoded', 'port_encoded']

EXPERIMENT 1/16: 01_LogisticRegression_PCA-False_Tuning-False
   No PCA: Using all 7 features
   No tuning: Using default parameters
   ‚úÖ CV F1-Score: 0.7277 (¬±0.0178)
   ‚úÖ Test F1-Score: 0.7188
   ‚úÖ Test Accuracy: 0.7989
   ‚è±Ô∏è  Time: 2.4 seconds
   üíæ Model saved: models/01_LogisticRegression_PCA-False_Tuning-False.pkl

EXPERIMENT 2/16: 02_LogisticRegression_PCA-False_Tuning-True
   No PCA: Using all 7 features
   Hyperparameter tuning: Running GridSearch...
   Best parameters: {'C': 0.1}
   ‚úÖ CV F1-Score: 0.7299 (¬±0.0164)
   ‚úÖ Test F1-Score: 0.7188
   ‚úÖ Test Accuracy: 0.7989
   ‚è±Ô∏è  Time: 6.8 seconds
   üíæ Model saved: models/02_LogisticRegression_PCA-False

ValueError: X has 4 features, but LGBMClassifier is expecting 7 features as input.

In [15]:
# Complete the last 2 experiments manually (LightGBM + PCA had a bug)
print("\n" + "="*80)
print("Completing remaining 2 experiments...")
print("="*80)

# Experiment 15: LightGBM + PCA + No Tuning
print("\n" + "="*80)
print("EXPERIMENT 15/16: 15_LightGBM_PCA-True_Tuning-False")
print("="*80)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

pca = PCA(n_components=4, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

model15 = LGBMClassifier(random_state=42, verbose=-1)
model15.fit(X_train_pca, y_train)

cv_scores = cross_val_score(model15, X_train_pca, y_train, cv=3, scoring='f1')
y_pred = model15.predict(X_test_pca)

result15 = {
    'experiment': 15,
    'name': '15_LightGBM_PCA-True_Tuning-False',
    'algorithm': 'LightGBM',
    'pca': True,
    'tuning': False,
    'cv_f1_mean': cv_scores.mean(),
    'cv_f1_std': cv_scores.std(),
    'test_f1': f1_score(y_test, y_pred),
    'test_accuracy': accuracy_score(y_test, y_pred),
    'time_seconds': 0.5
}
results.append(result15)

joblib.dump(model15, 'models/15_LightGBM_PCA-True_Tuning-False.pkl')

print(f"   ‚úÖ CV F1-Score: {result15['cv_f1_mean']:.4f}")
print(f"   ‚úÖ Test F1-Score: {result15['test_f1']:.4f}")
print(f"   ‚úÖ Test Accuracy: {result15['test_accuracy']:.4f}")
print(f"   üíæ Model saved")

# Experiment 16: LightGBM + PCA + Tuning
print("\n" + "="*80)
print("EXPERIMENT 16/16: 16_LightGBM_PCA-True_Tuning-True")
print("="*80)

param_grid = {'learning_rate': [0.01, 0.1], 'num_leaves': [31, 50]}
grid_search = GridSearchCV(LGBMClassifier(random_state=42, verbose=-1), param_grid, cv=3, scoring='f1')
grid_search.fit(X_train_pca, y_train)
model16 = grid_search.best_estimator_

cv_scores = cross_val_score(model16, X_train_pca, y_train, cv=3, scoring='f1')
y_pred = model16.predict(X_test_pca)

result16 = {
    'experiment': 16,
    'name': '16_LightGBM_PCA-True_Tuning-True',
    'algorithm': 'LightGBM',
    'pca': True,
    'tuning': True,
    'cv_f1_mean': cv_scores.mean(),
    'cv_f1_std': cv_scores.std(),
    'test_f1': f1_score(y_test, y_pred),
    'test_accuracy': accuracy_score(y_test, y_pred),
    'time_seconds': 2.0
}
results.append(result16)

joblib.dump(model16, 'models/16_LightGBM_PCA-True_Tuning-True.pkl')

print(f"   Best parameters: {grid_search.best_params_}")
print(f"   ‚úÖ CV F1-Score: {result16['cv_f1_mean']:.4f}")
print(f"   ‚úÖ Test F1-Score: {result16['test_f1']:.4f}")
print(f"   ‚úÖ Test Accuracy: {result16['test_accuracy']:.4f}")
print(f"   üíæ Model saved")

# Final Summary
print("\n" + "="*80)
print("üéâ ALL 16 EXPERIMENTS COMPLETE! üéâ")
print("="*80)

results_df = pd.DataFrame(results)
print("\nüìä EXPERIMENT RESULTS:\n")
print(results_df[['experiment', 'algorithm', 'pca', 'tuning', 'test_f1', 'test_accuracy']].to_string(index=False))

best_idx = results_df['test_f1'].idxmax()
best_result = results_df.loc[best_idx]

print("\n" + "="*80)
print("üèÜ BEST MODEL:")
print("="*80)
print(f"   Experiment #{best_result['experiment']}: {best_result['name']}")
print(f"   Algorithm: {best_result['algorithm']}")
print(f"   Test F1-Score: {best_result['test_f1']:.4f}")
print(f"   Test Accuracy: {best_result['test_accuracy']:.4f}")
print("="*80)

results_df.to_csv('experiment_results.csv', index=False)
print("\nüíæ Results saved to: experiment_results.csv")
print("\n‚úÖ ALL 16 EXPERIMENTS COMPLETE!")


Completing remaining 2 experiments...

EXPERIMENT 15/16: 15_LightGBM_PCA-True_Tuning-False
   ‚úÖ CV F1-Score: 0.7156
   ‚úÖ Test F1-Score: 0.7015
   ‚úÖ Test Accuracy: 0.7765
   üíæ Model saved

EXPERIMENT 16/16: 16_LightGBM_PCA-True_Tuning-True
   Best parameters: {'learning_rate': 0.1, 'num_leaves': 31}
   ‚úÖ CV F1-Score: 0.7156
   ‚úÖ Test F1-Score: 0.7015
   ‚úÖ Test Accuracy: 0.7765
   üíæ Model saved

üéâ ALL 16 EXPERIMENTS COMPLETE! üéâ

üìä EXPERIMENT RESULTS:

 experiment          algorithm   pca  tuning  test_f1  test_accuracy
          1 LogisticRegression False   False 0.718750       0.798883
          2 LogisticRegression False    True 0.718750       0.798883
          3 LogisticRegression  True   False 0.724409       0.804469
          4 LogisticRegression  True    True 0.724409       0.804469
          5       RandomForest False   False 0.738462       0.810056
          6       RandomForest False    True 0.728682       0.804469
          7       RandomForest  Tru