In [1]:
import numpy as np
import pandas as pd

Dataset 1: Titanic-Like Passenger Dataset

In [3]:
np.random.seed(1)
rows = 200

df = pd.DataFrame({
    "PassengerId": np.arange(1, rows + 1),
    "Name": np.random.choice(["Mr. John", "Mrs. Anna", "Miss Emma", "Dr. Smith"], rows),
    "Age": np.random.choice(np.append(np.random.randint(1, 80, 180), [np.nan]*20), rows),
    "Sex": np.random.choice(["male", "female"], rows),
    "Pclass": np.random.choice([1, 2, 3], rows),
    "Fare": np.round(np.random.uniform(10, 500, rows), 2)
})

In [None]:
#View Data
df.head()
df.tail()
df.sample(3)

Unnamed: 0,PassengerId,Name,Age,Sex,Pclass,Fare
128,129,Mr. John,18.0,male,3,338.25
175,176,Mr. John,33.0,female,3,329.26
41,42,Mrs. Anna,36.0,male,1,47.32


In [None]:
#Inspect Structure
df.shape
df.columns
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  200 non-null    int64  
 1   Name         200 non-null    object 
 2   Age          182 non-null    float64
 3   Sex          200 non-null    object 
 4   Pclass       200 non-null    int64  
 5   Fare         200 non-null    float64
dtypes: float64(2), int64(2), object(2)
memory usage: 9.5+ KB


Unnamed: 0,PassengerId,Age,Pclass,Fare
count,200.0,182.0,200.0,200.0
mean,100.5,43.318681,2.035,251.2497
std,57.879185,22.078153,0.804369,146.847353
min,1.0,1.0,1.0,11.25
25%,50.75,25.25,1.0,115.55
50%,100.5,41.0,2.0,265.43
75%,150.25,63.0,3.0,371.4325
max,200.0,79.0,3.0,493.68


In [None]:
#Handle Missing Data
df.isnull().sum()
df["Age"].fillna(df["Age"].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].mean(), inplace=True)


In [None]:
#Filtering Rows
df[df["Age"] > 30]
df[df["Sex"] == "female"]

Unnamed: 0,PassengerId,Name,Age,Sex,Pclass,Fare
3,4,Mr. John,63.000000,female,2,92.36
4,5,Dr. Smith,39.000000,female,1,346.02
5,6,Mrs. Anna,26.000000,female,2,304.88
8,9,Dr. Smith,29.000000,female,3,44.05
10,11,Mr. John,77.000000,female,3,247.52
...,...,...,...,...,...,...
186,187,Mr. John,43.318681,female,1,148.20
190,191,Miss Emma,68.000000,female,1,367.68
191,192,Miss Emma,78.000000,female,3,12.94
193,194,Dr. Smith,43.318681,female,1,25.29


In [None]:
#Grouping and Aggregation
df.groupby("Pclass")["Fare"].mean()

Pclass
1    237.222623
2    281.545775
3    232.200147
Name: Fare, dtype: float64

Dataset 2: Student Performance Dataset

In [11]:
np.random.seed(2)
rows = 150

df = pd.DataFrame({
    "StudentId": np.arange(1, rows + 1),
    "Age": np.random.randint(18, 30, rows),
    "Gender": np.random.choice(["male", "female"], rows),
    "Course": np.random.choice(["AI", "DS", "ML", "Python"], rows),
    "Marks": np.random.randint(35, 100, rows)
})

In [None]:
#Sorting Data
df.sort_values(by="Marks", ascending=False)

Unnamed: 0,StudentId,Age,Gender,Course,Marks
140,141,26,female,DS,99
47,48,26,female,DS,99
30,31,26,female,ML,99
145,146,25,female,ML,99
19,20,22,female,Python,98
...,...,...,...,...,...
110,111,29,female,ML,38
144,145,18,female,Python,37
40,41,28,female,ML,37
14,15,22,female,ML,35


In [None]:
#Value Counts
df["Gender"].value_counts()

Gender
female    84
male      66
Name: count, dtype: int64

In [None]:
#Apply Function
df["Result"] = df["Marks"].apply(lambda x: "Pass" if x >= 50 else "Fail")

In [None]:
#Grouping
df.groupby("Course")["Marks"].mean()

Course
AI        68.185185
DS        64.611111
ML        62.125000
Python    68.282051
Name: Marks, dtype: float64

Dataset 3: Employee Dataset

In [None]:
np.random.seed(3)
rows = 120

df = pd.DataFrame({
    "EmpId": np.arange(1, rows + 1),
    "Dept": np.random.choice(["HR", "IT", "Sales"], rows),
    "Salary": np.random.randint(30000, 150000, rows),
    "Gender": np.random.choice(["male", "female"], rows)
})


In [None]:
#Rename and Drop
df.rename(columns={"Dept": "Department"}, inplace=True)
df.drop("Gender", axis=1, inplace=True)

In [None]:
#Aggregation
df.groupby("Department")["Salary"].agg(["mean", "max"])

Unnamed: 0_level_0,mean,max
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
HR,85759.09434,149292
IT,92223.166667,145060
Sales,86745.483871,147177
