# AI & ML Internship — Task 1
## Understanding Dataset & Data Types

### Objective:
- Load and explore datasets
- Identify data types
- Check missing values and imbalance
- Analyze ML readiness


In [1]:
# Import required libraries
import pandas as pd
import numpy as np

# Show every column when a DataFrame is displayed (None removes the column cap)
pd.options.display.max_columns = None


In [2]:
# Read both CSV files (expected in the working directory) into DataFrames
titanic = pd.read_csv("train.csv")
students = pd.read_csv("StudentsPerformance.csv")

# Report (rows, columns) for each dataset
for label, frame in (("Titanic Shape:", titanic), ("Students Shape:", students)):
    print(label, frame.shape)


Titanic Shape: (891, 12)
Students Shape: (1000, 8)


In [3]:
# Preview the first 5 rows of the Titanic dataset
titanic.head(n=5)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Preview the last 5 rows of the Titanic dataset
titanic.tail(n=5)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [5]:
# Preview the first 5 rows of the Students Performance dataset
students.head(n=5)


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [6]:
# Structural overview of the Titanic dataset: index range, per-column
# non-null counts and dtypes, and memory usage
titanic.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
# Structural overview of the Students dataset: index range, per-column
# non-null counts and dtypes, and memory usage
students.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [8]:
# Statistical summary of the Titanic numeric columns
# (count, mean, std, min, quartiles, max)
titanic.describe()


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [9]:
# Statistical summary of the Students numeric score columns
# (count, mean, std, min, quartiles, max)
students.describe()


Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [10]:
# Identify numerical and categorical columns
def classify_columns(df: pd.DataFrame) -> tuple[list, list]:
    """Split a DataFrame's column labels into numerical and categorical groups.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns should be classified.

    Returns
    -------
    tuple[list, list]
        ``(numerical, categorical)`` lists of column labels, each in the
        frame's original column order. Columns whose dtype falls in neither
        group (e.g. datetimes) appear in neither list.
    """
    # "number" matches every numeric dtype (int32, float32, unsigned, ...),
    # not only the 64-bit defaults, so down-cast columns are not silently lost.
    numerical = df.select_dtypes(include="number").columns.tolist()
    # Treat pandas' dedicated `category` dtype as categorical alongside
    # plain object columns.
    categorical = df.select_dtypes(include=["object", "category"]).columns.tolist()
    return numerical, categorical

# Classify the columns of both datasets (Titanic first, then Students)
(t_num, t_cat), (s_num, s_cat) = map(classify_columns, (titanic, students))

# Report each group; the leading "\n" separates the two datasets in the output
for label, columns in (
    ("Titanic Numerical:", t_num),
    ("Titanic Categorical:", t_cat),
    ("\nStudents Numerical:", s_num),
    ("Students Categorical:", s_cat),
):
    print(label, columns)


Titanic Numerical: ['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
Titanic Categorical: ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

Students Numerical: ['math score', 'reading score', 'writing score']
Students Categorical: ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']


In [11]:
# Count of missing entries in each Titanic column
titanic.isna().sum()


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [12]:
# Count of missing entries in each Students column
students.isna().sum()


gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

In [13]:
# Frequency of each distinct value in every categorical Titanic column
for col in t_cat:
    # One print call: header line, then the frequency table on the next line
    print(f"\n{col} unique values:", titanic[col].value_counts(), sep="\n")



Name unique values:
Name
Braund, Mr. Owen Harris                     1
Boulos, Mr. Hanna                           1
Frolicher-Stehli, Mr. Maxmillian            1
Gilinski, Mr. Eliezer                       1
Murdlin, Mr. Joseph                         1
                                           ..
Kelly, Miss. Anna Katherine "Annie Kate"    1
McCoy, Mr. Bernard                          1
Johnson, Mr. William Cahoone Jr             1
Keane, Miss. Nora A                         1
Dooley, Mr. Patrick                         1
Name: count, Length: 891, dtype: int64

Sex unique values:
Sex
male      577
female    314
Name: count, dtype: int64

Ticket unique values:
Ticket
347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
           ..
9234        1
19988       1
2693        1
PC 17612    1
370376      1
Name: count, Length: 681, dtype: int64

Cabin unique values:
Cabin
B96 B98        4
G6             4
C23 C25 C27    4
C22 C26        3
F33            3
              .

In [14]:
# Frequency of each distinct value in every categorical Students column
for col in s_cat:
    # One print call: header line, then the frequency table on the next line
    print(f"\n{col} unique values:", students[col].value_counts(), sep="\n")



gender unique values:
gender
female    518
male      482
Name: count, dtype: int64

race/ethnicity unique values:
race/ethnicity
group C    319
group D    262
group B    190
group E    140
group A     89
Name: count, dtype: int64

parental level of education unique values:
parental level of education
some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: count, dtype: int64

lunch unique values:
lunch
standard        645
free/reduced    355
Name: count, dtype: int64

test preparation course unique values:
test preparation course
none         642
completed    358
Name: count, dtype: int64


In [15]:
# State the prediction target chosen for each dataset
for label, target in (
    ("Titanic Target:", "Survived"),
    ("Students Target:", "math score / reading score / writing score (regression possible)"),
):
    print(label, target)


Titanic Target: Survived
Students Target: math score / reading score / writing score (regression possible)


In [16]:
# Proportion of rows in each Survived class (0 / 1) to gauge class balance
titanic.loc[:, "Survived"].value_counts(normalize=True)


Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64

In [17]:
# Summarise the ML-readiness facts gathered above for both datasets.
# The missing-value totals are cast to built-in int: the pandas reduction
# yields a NumPy integer scalar, which NumPy >= 2 renders as `np.int64(...)`
# inside a dict repr and which the json module cannot serialise.
ml_readiness = {
    "Titanic": {
        # Total number of missing cells across all columns
        "Missing Values": int(titanic.isnull().sum().sum()),
        "Target Variable": "Survived",
        # Proportion of rows per Survived class
        "Imbalance": titanic["Survived"].value_counts(normalize=True).to_dict(),
        # (rows, columns)
        "Size": titanic.shape
    },
    "Students": {
        # Total number of missing cells across all columns
        "Missing Values": int(students.isnull().sum().sum()),
        "Target Variable": "Scores",
        # (rows, columns)
        "Size": students.shape
    }
}

ml_readiness


{'Titanic': {'Missing Values': 866,
  'Target Variable': 'Survived',
  'Imbalance': {0: 0.6161616161616161, 1: 0.3838383838383838},
  'Size': (891, 12)},
 'Students': {'Missing Values': 0,
  'Target Variable': 'Scores',
  'Size': (1000, 8)}}