In [4]:
# ==========================================
# Task 1: Understanding Dataset & Data Types
# Dataset: Titanic
# ==========================================

import pandas as pd
import numpy as np

# -----------------------
# 1. Load Dataset
# -----------------------
df = pd.read_csv("tested.csv")
print("Dataset Shape:", df.shape)

# Display first and last records
print("\nFirst 5 rows:")
display(df.head())

print("\nLast 5 rows:")
display(df.tail())

# -----------------------
# 2. Dataset Information
# -----------------------
print("\nDataset Info:")
df.info()

# -----------------------
# 3. Statistical Summary
# -----------------------
print("\nStatistical Summary:")
display(df.describe())

# -----------------------
# 4. Identify Feature Types (Manual Inspection)
# -----------------------
numerical_features = ["Age", "Fare", "SibSp", "Parch"]
categorical_features = ["Sex", "Embarked", "Ticket", "Cabin"]
ordinal_features = ["Pclass"]
binary_features = ["Survived"]

print("\nFeature Classification:")
print("Numerical:", numerical_features)
print("Categorical:", categorical_features)
print("Ordinal:", ordinal_features)
print("Binary:", binary_features)

# -----------------------
# 5. Unique Values in Categorical Columns
# -----------------------
print("\nUnique Values in Categorical Columns:")
for col in categorical_features:
    print(f"\n{col} unique values:")
    print(df[col].unique())

# -----------------------
# 6. Identify Target & Input Features
# -----------------------
target_variable = "Survived"
input_features = df.drop(columns=[target_variable]).columns.tolist()

print("\nTarget Variable:", target_variable)
print("Input Features:", input_features)

# -----------------------
# 7. Missing Value Analysis
# -----------------------
print("\nMissing Values per Column:")
print(df.isnull().sum())

# -----------------------
# 8. Data Imbalance Check (Target Variable)
# -----------------------
print("\nTarget Variable Distribution:")
print(df[target_variable].value_counts())

# -----------------------
# 9. ML Suitability Analysis
# -----------------------
print("""
ML READINESS OBSERVATIONS:
- Dataset size is sufficient for machine learning
- Target variable is clearly defined
- Some features contain missing values (Age, Cabin, Embarked)
- Target variable is slightly imbalanced
- Dataset requires cleaning and encoding before modeling
""")


Dataset Shape: (418, 12)

First 5 rows:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S



Last 5 rows:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
413,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S
414,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C
415,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S
416,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S
417,1309,0,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          332 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB

Statistical Summary:


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,0.363636,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.481622,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,0.0,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,0.0,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,1.0,3.0,39.0,1.0,0.0,31.5
max,1309.0,1.0,3.0,76.0,8.0,9.0,512.3292



Feature Classification:
Numerical: ['Age', 'Fare', 'SibSp', 'Parch']
Categorical: ['Sex', 'Embarked', 'Ticket', 'Cabin']
Ordinal: ['Pclass']
Binary: ['Survived']

Unique Values in Categorical Columns:

Sex unique values:
['male' 'female']

Embarked unique values:
['Q' 'S' 'C']

Ticket unique values:
['330911' '363272' '240276' '315154' '3101298' '7538' '330972' '248738'
 '2657' 'A/4 48871' '349220' '694' '21228' '24065' 'W.E.P. 5734'
 'SC/PARIS 2167' '233734' '2692' 'STON/O2. 3101270' '2696' 'PC 17603'
 'C 17368' 'PC 17598' 'PC 17597' 'PC 17608' 'A/5. 3337' '113509' '2698'
 '113054' '2662' 'SC/AH 3085' 'C.A. 31029' 'C.A. 2315' 'W./C. 6607'
 '13236' '2682' '342712' '315087' '345768' '1601' '349256' '113778'
 'SOTON/O.Q. 3101263' '237249' '11753' 'STON/O 2. 3101291' 'PC 17594'
 '370374' '11813' 'C.A. 37671' '13695' 'SC/PARIS 2168' '29105' '19950'
 'SC/A.3 2861' '382652' '349230' '348122' '386525' '349232' '237216'
 '347090' '334914' 'F.C.C. 13534' '330963' '113796' '2543' '382653'
 '349