In [3]:
import numpy as np

In [4]:
import pandas as pd

In [5]:
np.random.seed(1)
rows = 200

DATASET 1: Titanic-Like Passenger Dataset
Purpose: Learn data inspection, filtering, missing values, grouping

In [17]:
df = pd.DataFrame({"passengerid":np.arange(1,rows+1),
                   "Name":np.random.choice(["Mr.John", "Mrs.Anna", "Miss Emma", "Dr.Smith"],rows),
                   "Age":np.random.choice(np.append(np.random.randint(1,80,180),[np.nan]*20),rows),
                   "Sex": np.random.choice(["male", "female"], rows),
    "Pclass": np.random.choice([1, 2, 3], rows),
                   "Fare": np.round(np.random.uniform(10,500,rows),2)})
df.head()

Unnamed: 0,passengerid,Name,Age,Sex,Pclass,Fare
0,1,Dr.Smith,30.0,male,1,288.03
1,2,Miss Emma,47.0,male,1,262.49
2,3,Miss Emma,43.0,female,3,149.03
3,4,Mrs.Anna,68.0,female,3,215.02
4,5,Dr.Smith,56.0,female,3,256.26


In [18]:
df.tail()

Unnamed: 0,passengerid,Name,Age,Sex,Pclass,Fare
195,196,Dr.Smith,36.0,female,3,495.08
196,197,Mrs.Anna,19.0,male,1,491.18
197,198,Mrs.Anna,24.0,male,2,296.65
198,199,Miss Emma,34.0,female,1,54.93
199,200,Miss Emma,25.0,male,2,166.09


In [19]:
df.sample(3)

Unnamed: 0,passengerid,Name,Age,Sex,Pclass,Fare
17,18,Miss Emma,32.0,male,2,176.03
6,7,Miss Emma,52.0,male,2,203.97
173,174,Mr.John,47.0,male,1,257.52


In [20]:
df.shape   # Number of rows and columns

(200, 6)

In [21]:
df.columns

Index(['passengerid', 'Name', 'Age', 'Sex', 'Pclass', 'Fare'], dtype='object')

In [22]:
df.info()     # Data types and missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  200 non-null    int64  
 1   Name         200 non-null    object 
 2   Age          183 non-null    float64
 3   Sex          200 non-null    object 
 4   Pclass       200 non-null    int64  
 5   Fare         200 non-null    float64
dtypes: float64(2), int64(2), object(2)
memory usage: 9.5+ KB


In [23]:
df.describe()   # Statistical summary

Unnamed: 0,passengerid,Age,Pclass,Fare
count,200.0,183.0,200.0,200.0
mean,100.5,41.31694,2.045,263.36475
std,57.879185,21.686784,0.84054,138.911476
min,1.0,2.0,1.0,12.01
25%,50.75,23.0,1.0,151.195
50%,100.5,41.0,2.0,256.89
75%,150.25,62.0,3.0,382.6825
max,200.0,79.0,3.0,497.46


Handle missing values

In [24]:
df.isnull().sum()     #count missing values

passengerid     0
Name            0
Age            17
Sex             0
Pclass          0
Fare            0
dtype: int64

In [25]:
df["Age"].fillna(df["Age"].mean(), inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].mean(), inplace = True)


Filtering Rows

In [27]:
df[df["Age"]>30]
df[df["Sex"]=="female"]

Unnamed: 0,passengerid,Name,Age,Sex,Pclass,Fare
2,3,Miss Emma,43.0,female,3,149.03
3,4,Mrs.Anna,68.0,female,3,215.02
4,5,Dr.Smith,56.0,female,3,256.26
5,6,Mrs.Anna,15.0,female,1,280.83
10,11,Mrs.Anna,68.0,female,2,470.21
...,...,...,...,...,...,...
190,191,Mr.John,63.0,female,2,463.44
193,194,Mr.John,51.0,female,2,16.59
194,195,Dr.Smith,27.0,female,3,469.48
195,196,Dr.Smith,36.0,female,3,495.08


Grouping & Aggregation

In [28]:
df.groupby("Pclass")["Fare"].mean()

Pclass
1    251.155152
2    279.209661
3    261.644533
Name: Fare, dtype: float64

DATASET 2: Student Performance Dataset
Purpose: Sorting, value counts, apply, conditional columns

In [30]:
np.random.seed(2)
rows = 150

In [31]:
df = pd.DataFrame({
    "Studentid": np.arange(1,rows+1),
    "Age": np.random.randint(18,30,rows),
    "Gender": np.random.choice(["male","female"],rows),
    "Course": np.random.choice(["Ai","DS","ML","Python"],rows),

    "Marks":np.random.randint(35,100,rows)
})

Sorting Data

In [32]:
df.sort_values(by= "Marks", ascending=False)

Unnamed: 0,Studentid,Age,Gender,Course,Marks
47,48,26,female,DS,99
30,31,26,female,ML,99
140,141,26,female,DS,99
145,146,25,female,ML,99
19,20,22,female,Python,98
...,...,...,...,...,...
110,111,29,female,ML,38
40,41,28,female,ML,37
144,145,18,female,Python,37
14,15,22,female,ML,35


Value Counts

In [33]:
df["Gender"].value_counts()

Gender
female    84
male      66
Name: count, dtype: int64

Apply Function

In [34]:
df["Result"] = df["Marks"].apply(lambda x: "Pass" if x >= 50 else "Fail")

Grouping
 

In [35]:
df.groupby("Course")["Marks"].mean()

Course
Ai        68.185185
DS        64.611111
ML        62.125000
Python    68.282051
Name: Marks, dtype: float64

DATASET 3: Employee Dataset
 
Purpose: Rename, drop, mapping, aggregation

In [37]:
np.random.seed(3)
rows = 120

In [42]:
df = pd.DataFrame({
    "Empid" : np.arange(1,rows+1),
    "Dept": np.random.choice(["HR", "IT", "Sales"], rows),
    "Salary": np.random.randint(30000, 150000, rows),    "Gender": np.random.choice(["male","female"],rows)
})

Rename & Drop

In [43]:
df.rename(columns={"Dept": "Department"}, inplace=True)
df.drop("Gender", axis=1, inplace=True)

Aggregation

In [44]:
df.groupby("Department")["Salary"].agg(["mean", "max"])

Unnamed: 0_level_0,mean,max
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
HR,90146.578947,147389
IT,91684.5,142926
Sales,101017.0625,148530
