In [4]:

import pandas as pd
import numpy as np
import sqlite3
import warnings
warnings.filterwarnings('ignore')

print("✅ Libraries imported!")



✅ Libraries imported!


In [5]:

df = pd.read_csv('data/train.csv')  # No ../ needed!

print(f"✅ Data loaded: {len(df)} passengers")
print(f"\nColumns: {list(df.columns)}")
print(f"\nFirst 5 rows:")
df.head()

✅ Data loaded: 891 passengers

Columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

First 5 rows:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# Cell 3: Quick Data Exploration
print("=== TITANIC DATA SUMMARY ===\n")
print(f"Total passengers: {len(df)}")
print(f"Survived: {df['Survived'].sum()} ({df['Survived'].mean()*100:.1f}%)")
print(f"Died: {len(df) - df['Survived'].sum()} ({(1-df['Survived'].mean())*100:.1f}%)")
print(f"\nMissing values:")
print(df.isnull().sum())

=== TITANIC DATA SUMMARY ===

Total passengers: 891
Survived: 342 (38.4%)
Died: 549 (61.6%)

Missing values:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [8]:

print("Creating normalized database...\n")
conn = sqlite3.connect('data/titanic.db')
cursor = conn.cursor()


cursor.execute("DROP TABLE IF EXISTS passengers")
cursor.execute("DROP TABLE IF EXISTS tickets")
cursor.execute("DROP TABLE IF EXISTS embarkation")


cursor.execute("""
CREATE TABLE passengers (
    passenger_id INTEGER PRIMARY KEY,
    name TEXT,
    age REAL,
    sex TEXT,
    survived INTEGER,
    ticket_id INTEGER,
    embark_id INTEGER,
    FOREIGN KEY (ticket_id) REFERENCES tickets(ticket_id),
    FOREIGN KEY (embark_id) REFERENCES embarkation(embark_id)
)
""")


cursor.execute("""
CREATE TABLE tickets (
    ticket_id INTEGER PRIMARY KEY,
    pclass INTEGER,
    fare REAL,
    ticket_number TEXT,
    siblings_spouses INTEGER,
    parents_children INTEGER
)
""")


cursor.execute("""
CREATE TABLE embarkation (
    embark_id INTEGER PRIMARY KEY,
    port_code TEXT,
    port_name TEXT
)
""")

print("✅ Database schema created (3 tables)!")
conn.commit()
conn.close()

Creating normalized database...

✅ Database schema created (3 tables)!


In [9]:

conn = sqlite3.connect('data/titanic.db')

df_clean = df.copy()

df_clean['Age'].fillna(df_clean['Age'].median(), inplace=True)

df_clean['Embarked'].fillna(df_clean['Embarked'].mode()[0], inplace=True)

df_clean['Fare'].fillna(df_clean['Fare'].median(), inplace=True)

df_clean['ticket_id'] = range(1, len(df_clean) + 1)
df_clean['embark_id'] = range(1, len(df_clean) + 1)

embarkation_map = {'C': 'Cherbourg', 'Q': 'Queenstown', 'S': 'Southampton'}
embark_data = df_clean[['embark_id', 'Embarked']].copy()
embark_data['port_name'] = embark_data['Embarked'].map(embarkation_map)
embark_data.columns = ['embark_id', 'port_code', 'port_name']
embark_data.to_sql('embarkation', conn, if_exists='replace', index=False)

tickets_data = df_clean[['ticket_id', 'Pclass', 'Fare', 'Ticket', 'SibSp', 'Parch']].copy()
tickets_data.columns = ['ticket_id', 'pclass', 'fare', 'ticket_number', 'siblings_spouses', 'parents_children']
tickets_data.to_sql('tickets', conn, if_exists='replace', index=False)

passengers_data = df_clean[['PassengerId', 'Name', 'Age', 'Sex', 'Survived', 'ticket_id', 'embark_id']].copy()
passengers_data.columns = ['passenger_id', 'name', 'age', 'sex', 'survived', 'ticket_id', 'embark_id']
passengers_data.to_sql('passengers', conn, if_exists='replace', index=False)

print("✅ Data loaded into all 3 tables!")
print(f"   - Passengers: {len(passengers_data)} rows")
print(f"   - Tickets: {len(tickets_data)} rows")
print(f"   - Embarkation: {len(embark_data)} rows")

conn.close()


✅ Data loaded into all 3 tables!
   - Passengers: 891 rows
   - Tickets: 891 rows
   - Embarkation: 891 rows


In [12]:

conn = sqlite3.connect('data/titanic.db')

query = """
SELECT 
    p.passenger_id,
    p.name,
    p.age,
    p.sex,
    p.survived,
    t.pclass,
    t.fare,
    t.siblings_spouses,
    t.parents_children,
    e.port_code,
    e.port_name
FROM passengers p
JOIN tickets t ON p.ticket_id = t.ticket_id
JOIN embarkation e ON p.embark_id = e.embark_id
"""

df_from_db = pd.read_sql(query, conn)

print("✅ Data retrieved using SQL JOIN!")
print(f"   Retrieved {len(df_from_db)} rows with {len(df_from_db.columns)} columns")
print("\nFirst 5 rows:")
print(df_from_db.head())

conn.close()

✅ Data retrieved using SQL JOIN!
   Retrieved 891 rows with 11 columns

First 5 rows:
   passenger_id                                               name   age  \
0             1                            Braund, Mr. Owen Harris  22.0   
1             2  Cumings, Mrs. John Bradley (Florence Briggs Th...  38.0   
2             3                             Heikkinen, Miss. Laina  26.0   
3             4       Futrelle, Mrs. Jacques Heath (Lily May Peel)  35.0   
4             5                           Allen, Mr. William Henry  35.0   

      sex  survived  pclass     fare  siblings_spouses  parents_children  \
0    male         0       3   7.2500                 1                 0   
1  female         1       1  71.2833                 1                 0   
2  female         1       3   7.9250                 0                 0   
3  female         1       1  53.1000                 1                 0   
4    male         0       3   8.0500                 0                 0   



In [13]:
import sys
!{sys.executable} -m pip install --upgrade typing_extensions --quiet
print("✅ typing_extensions upgraded!")

✅ typing_extensions upgraded!


In [14]:
import sys
!{sys.executable} -m pip install mlflow dagshub --quiet
print("✅ MLflow and DagsHub installed!")

✅ MLflow and DagsHub installed!


In [1]:
%pip install -q dagshub mlflow


Note: you may need to restart the kernel to use updated packages.


In [1]:
import dagshub
dagshub.init(repo_owner='ShreyasAravind', repo_name='titanic-classification', mlflow=True)

ImportError: cannot import name 'Sentinel' from 'typing_extensions' (/Users/shreyas/opt/anaconda3/lib/python3.9/site-packages/typing_extensions.py)