# AI & ML Internship — Task 1
## Dataset Understanding & Data Type Analysis
### By: Pranav SP

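This notebook loads the Titanic dataset, summarises its shape, data types, missing values and unique-value counts, classifies each feature by type, and assesses how ready the data is for machine learning. Reusable logic lives in helper modules under `../src` (`loader`, `analyzer`, `utils`).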


In [33]:
import sys
# Make the helper modules in ../src importable. The path is relative, so it
# resolves against the directory the notebook kernel is started from.
sys.path.append("../src")

from loader import load_csv
from analyzer import dataset_summary, classify_features
from utils import print_dict_as_table

import pandas as pd
import numpy as np


In [34]:
# Read the Titanic CSV through the project's loader helper, then preview the
# first rows to check that columns and values were parsed as expected.
df = load_csv("../data/titanic.csv")
df.head()

[INFO] Loaded dataset successfully from ../data/titanic.csv


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [35]:
# Preview the last five rows of the dataset.
df.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [36]:
# Draw 7 random rows for a spot check. The seed is fixed so the same rows are
# shown on every Restart & Run All instead of changing from run to run.
df.sample(7, random_state=42)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
626,627,0,2,"Kirkland, Rev. Charles Leonard",male,57.0,0,0,219533,12.35,,Q
216,217,1,3,"Honkanen, Miss. Eliina",female,27.0,0,0,STON/O2. 3101283,7.925,,S
143,144,0,3,"Burke, Mr. Jeremiah",male,19.0,0,0,365222,6.75,,Q
247,248,1,2,"Hamalainen, Mrs. William (Anna)",female,24.0,0,2,250649,14.5,,S
551,552,0,2,"Sharp, Mr. Percival James R",male,27.0,0,0,244358,26.0,,S
739,740,0,3,"Nankoff, Mr. Minko",male,,0,0,349218,7.8958,,S
883,884,0,2,"Banfield, Mr. Frederick James",male,28.0,0,0,C.A./SOTON 34068,10.5,,S


In [37]:
# Collect the dataset's shape, column names, dtypes, null counts and unique
# counts into a single dict (keys: shape, columns, dtypes, null_counts,
# unique_counts) that later cells read from.
summary = dataset_summary(df)
summary

{'shape': (891, 12),
 'columns': ['PassengerId',
  'Survived',
  'Pclass',
  'Name',
  'Sex',
  'Age',
  'SibSp',
  'Parch',
  'Ticket',
  'Fare',
  'Cabin',
  'Embarked'],
 'dtypes': {'PassengerId': 'int64',
  'Survived': 'int64',
  'Pclass': 'int64',
  'Name': 'object',
  'Sex': 'object',
  'Age': 'float64',
  'SibSp': 'int64',
  'Parch': 'int64',
  'Ticket': 'object',
  'Fare': 'float64',
  'Cabin': 'object',
  'Embarked': 'object'},
 'null_counts': {'PassengerId': 0,
  'Survived': 0,
  'Pclass': 0,
  'Name': 0,
  'Sex': 0,
  'Age': 177,
  'SibSp': 0,
  'Parch': 0,
  'Ticket': 0,
  'Fare': 0,
  'Cabin': 687,
  'Embarked': 2},
 'unique_counts': {'PassengerId': 891,
  'Survived': 2,
  'Pclass': 3,
  'Name': 891,
  'Sex': 2,
  'Age': 88,
  'SibSp': 7,
  'Parch': 7,
  'Ticket': 681,
  'Fare': 248,
  'Cabin': 147,
  'Embarked': 3}}

In [None]:
# Render each per-column section of the summary as its own titled table.
for key, title in (
    ("dtypes", "Data Types"),
    ("null_counts", "Missing Values"),
    ("unique_counts", "Unique Values"),
):
    print_dict_as_table(summary[key], title)


=== Data Types ===
PassengerId          : int64
Survived             : int64
Pclass               : int64
Name                 : object
Sex                  : object
Age                  : float64
SibSp                : int64
Parch                : int64
Ticket               : object
Fare                 : float64
Cabin                : object
Embarked             : object


=== Missing Values ===
PassengerId          : 0
Survived             : 0
Pclass               : 0
Name                 : 0
Sex                  : 0
Age                  : 177
SibSp                : 0
Parch                : 0
Ticket               : 0
Fare                 : 0
Cabin                : 687
Embarked             : 2


=== Unique Values ===
PassengerId          : 891
Survived             : 2
Pclass               : 3
Name                 : 891
Sex                  : 2
Age                  : 88
SibSp                : 7
Parch                : 7
Ticket               : 681
Fare                 : 248
Cabin      

In [41]:
# Split the column names into four lists by feature type. In the recorded
# output the helper returns an empty ordinal list and places Pclass under
# numerical, so ordinal features have to be identified by hand.
numerical, categorical, binary, ordinal = classify_features(df)
numerical, categorical, binary, ordinal

(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'],
 ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'],
 ['Survived'],
 [])

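### Ordinal Features Identified Manually
`classify_features` returned an empty ordinal list and placed `Pclass` under numerical features, so ordinal features are identified by hand:
- Pclass → has a natural ordering (1 < 2 < 3)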

In [40]:
# Print the frequency table of every categorical column, one section per
# column, each followed by a dashed separator.
for col in categorical:
    print(
        f"Column: {col}",
        df[col].value_counts(),
        "\n------------------------\n",
        sep="\n",
    )

Column: Name
Name
Dooley, Mr. Patrick                                    1
Braund, Mr. Owen Harris                                1
Cumings, Mrs. John Bradley (Florence Briggs Thayer)    1
Heikkinen, Miss. Laina                                 1
Futrelle, Mrs. Jacques Heath (Lily May Peel)           1
                                                      ..
Hewlett, Mrs. (Mary D Kingcome)                        1
Vestrom, Miss. Hulda Amanda Adolfina                   1
Andersson, Mr. Anders Johan                            1
Saundercock, Mr. William Henry                         1
Bonnell, Miss. Elizabeth                               1
Name: count, Length: 891, dtype: int64

------------------------

Column: Sex
Sex
male      577
female    314
Name: count, dtype: int64

------------------------

Column: Ticket
Ticket
347082              7
1601                7
CA. 2343            7
3101295             6
CA 2144             6
                   ..
PC 17590            1
17463           

### Target Variable for ML
For Titanic dataset → **Survived**

Reason:
- Binary class (0 or 1)
- Predictable using features like Age, Sex, Pclass, Fare, etc.

## ML Readiness Assessment

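1. Missing Values:
- Age has 177 missing values (of 891 rows) → impute using median
- Cabin has 687 missing values (of 891 rows) → drop or extract deck letter
- Embarked has 2 missing values → impute using the most frequent value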

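2. Feature Issues:
- Ticket has 681 unique values across 891 rows (high cardinality) and is unlikely to be useful as-is → drop
- PassengerId is a unique row identifier (891 unique values) even though it was classified as numerical → drop
- Name contains title → extract useful features ("Mr", "Mrs", etc.)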

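3. Class Imbalance:
- Class imbalance refers to the target variable, so check the distribution of `Survived` (e.g. `df["Survived"].value_counts(normalize=True)`)
- Sex is skewed (577 male vs 314 female); this is an uneven feature distribution rather than class imbalance, but it can still affect ML predictions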

4. Verdict:
Dataset is suitable for ML after:
- Cleaning
- Encoding
- Imputation
- Feature engineering