In [2]:
import pandas as pd
import numpy as np

# Create raw data
data = {
    "Name": ["Aman", "Saurav", "Karan", "Shubham", "Abhishek", "Missing", "Sameer"],
    "Age": [22, 23, "Missing", 22, 25, 24, 26],
    "Marks": [85, "Missing", 88, 90, 56, 72, "Missing"],
    "Subject": ["Math", "Physics", "Math", "Chemistry", "Math", "Physics", "Chemistry"]
}

df = pd.DataFrame(data)

# Replace "Missing" with NaN (null)
df.replace("Missing", np.nan, inplace=True)

# Convert Age and Marks to numeric
df["Age"] = pd.to_numeric(df["Age"])
df["Marks"] = pd.to_numeric(df["Marks"])

print(df)

       Name   Age  Marks    Subject
0      Aman  22.0   85.0       Math
1    Saurav  23.0    NaN    Physics
2     Karan   NaN   88.0       Math
3   Shubham  22.0   90.0  Chemistry
4  Abhishek  25.0   56.0       Math
5       NaN  24.0   72.0    Physics
6    Sameer  26.0    NaN  Chemistry


  df.replace("Missing", np.nan, inplace=True)


In [2]:
df.shape

(7, 4)

In [3]:
df.dtypes

Name        object
Age        float64
Marks      float64
Subject     object
dtype: object

In [7]:
df.describe()

Unnamed: 0,Age,Marks
count,6.0,5.0
mean,23.666667,78.2
std,1.632993,14.254824
min,22.0,56.0
25%,22.25,72.0
50%,23.5,85.0
75%,24.75,88.0
max,26.0,90.0


In [8]:
df[["Name", "Marks"]]

Unnamed: 0,Name,Marks
0,Aman,85.0
1,Saurav,
2,Karan,88.0
3,Shubham,90.0
4,Abhishek,56.0
5,,72.0
6,Sameer,


In [9]:
df[df["Marks"] > 80]

Unnamed: 0,Name,Age,Marks,Subject
0,Aman,22.0,85.0,Math
2,Karan,,88.0,Math
3,Shubham,22.0,90.0,Chemistry


In [11]:
df[(df["Age"] > 22) & (df["Marks"] > 80)]

Unnamed: 0,Name,Age,Marks,Subject


In [13]:
df["Subject"].value_counts() #or

Subject
Math         3
Physics      2
Chemistry    2
Name: count, dtype: int64

In [14]:
df.groupby("Subject").size()

Subject
Chemistry    2
Math         3
Physics      2
dtype: int64

In [15]:
df.sort_values(by="Marks", ascending=False)

Unnamed: 0,Name,Age,Marks,Subject
3,Shubham,22.0,90.0,Chemistry
2,Karan,,88.0,Math
0,Aman,22.0,85.0,Math
5,,24.0,72.0,Physics
4,Abhishek,25.0,56.0,Math
1,Saurav,23.0,,Physics
6,Sameer,26.0,,Chemistry


In [16]:
df.nlargest(2, "Marks")

Unnamed: 0,Name,Age,Marks,Subject
3,Shubham,22.0,90.0,Chemistry
2,Karan,,88.0,Math


In [19]:
df.nsmallest(1, "Marks") #or

Unnamed: 0,Name,Age,Marks,Subject
4,Abhishek,25.0,56.0,Math


In [22]:
df[df["Marks"] == df["Marks"].min()]

Unnamed: 0,Name,Age,Marks,Subject
4,Abhishek,25.0,56.0,Math


In [24]:
df.isna().sum()

Name       1
Age        1
Marks      2
Subject    0
dtype: int64

In [26]:
df[df["Marks"].isna()]

Unnamed: 0,Name,Age,Marks,Subject
1,Saurav,23.0,,Physics
6,Sameer,26.0,,Chemistry


In [4]:
cleaned_df = df.dropna()

In [5]:
df.shape
cleaned_df.shape

(3, 4)

In [None]:
# 1️⃣ Because the column contains NaN, Pandas converts integers to float since integers cannot store NaN.

# 2️⃣ inplace=True permanently modifies the original DataFrame, causing possible data loss and making recovery difficult.

# 3️⃣ The Name column should be mandatory. Missing names indicate poor data entry; admin should enforce validation to prevent empty values.

GroupBy in Pandas

In [6]:
data = {
    "Name":["Aman","Saurav","Karan","Shubham","Abhishek",None,"Sameer"],
    "Age":[22,23,None,22,25,24,26],
    "Marks":[85,None,88,90,56,72,None],
    "Subject":["Math","Physics","Math","Chemistry","Math","Physics","Chemistry"]
}

df=pd.DataFrame(data)
print(df)

       Name   Age  Marks    Subject
0      Aman  22.0   85.0       Math
1    Saurav  23.0    NaN    Physics
2     Karan   NaN   88.0       Math
3   Shubham  22.0   90.0  Chemistry
4  Abhishek  25.0   56.0       Math
5      None  24.0   72.0    Physics
6    Sameer  26.0    NaN  Chemistry


In [10]:
# basic groupby
df.groupby('Subject')['Marks'].mean()  #mean marks per subject . default they are 

Subject
Chemistry    90.000000
Math         76.333333
Physics      72.000000
Name: Marks, dtype: float64

In [11]:
df.groupby('Subject')['Name'].count()    #count students per subject

Subject
Chemistry    2
Math         3
Physics      1
Name: Name, dtype: int64

In [13]:
# Multiple aggregation
df.groupby('Subject')['Marks'].agg(['mean','max','min'])

Unnamed: 0_level_0,mean,max,min
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Chemistry,90.0,90.0,90.0
Math,76.333333,88.0,56.0
Physics,72.0,72.0,72.0


In [16]:
df.groupby(['Subject','Age',])['Marks'].mean()

Subject    Age 
Chemistry  22.0    90.0
           26.0     NaN
Math       22.0    85.0
           25.0    56.0
Physics    23.0     NaN
           24.0    72.0
Name: Marks, dtype: float64