In [2]:
import numpy as np
import pandas as pd

In [3]:
np.random.seed(1)
rows=200

In [4]:
df = pd.DataFrame({
    "PassengerId": np.arange(1, rows + 1),
    "Name": np.random.choice(["Mr. John", "Mrs. Anna", "Miss Emma", "Dr. Smith"], rows),
    "Age": np.random.choice(np.append(np.random.randint(1, 80, 180), [np.nan]*20), rows),
    "Sex": np.random.choice(["male", "female"], rows),
    "Pclass": np.random.choice([1, 2, 3], rows),
    "Fare": np.round(np.random.uniform(10, 500, rows), 2)
})

In [5]:
df

Unnamed: 0,PassengerId,Name,Age,Sex,Pclass,Fare
0,1,Mrs. Anna,40.0,male,2,461.14
1,2,Dr. Smith,,male,1,333.43
2,3,Mr. John,75.0,male,1,348.14
3,4,Mr. John,63.0,female,2,92.36
4,5,Dr. Smith,39.0,female,1,346.02
...,...,...,...,...,...,...
195,196,Miss Emma,40.0,male,2,136.23
196,197,Dr. Smith,33.0,female,1,300.03
197,198,Mrs. Anna,75.0,male,3,53.22
198,199,Mrs. Anna,,male,2,431.91


In [6]:
df.head()

Unnamed: 0,PassengerId,Name,Age,Sex,Pclass,Fare
0,1,Mrs. Anna,40.0,male,2,461.14
1,2,Dr. Smith,,male,1,333.43
2,3,Mr. John,75.0,male,1,348.14
3,4,Mr. John,63.0,female,2,92.36
4,5,Dr. Smith,39.0,female,1,346.02


In [7]:
df.tail()

Unnamed: 0,PassengerId,Name,Age,Sex,Pclass,Fare
195,196,Miss Emma,40.0,male,2,136.23
196,197,Dr. Smith,33.0,female,1,300.03
197,198,Mrs. Anna,75.0,male,3,53.22
198,199,Mrs. Anna,,male,2,431.91
199,200,Miss Emma,33.0,male,3,45.51


In [8]:
df.sample(3)

Unnamed: 0,PassengerId,Name,Age,Sex,Pclass,Fare
128,129,Mr. John,18.0,male,3,338.25
175,176,Mr. John,33.0,female,3,329.26
41,42,Mrs. Anna,36.0,male,1,47.32


In [9]:
df.shape

(200, 6)

In [10]:
df.columns

Index(['PassengerId', 'Name', 'Age', 'Sex', 'Pclass', 'Fare'], dtype='object')

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  200 non-null    int64  
 1   Name         200 non-null    object 
 2   Age          182 non-null    float64
 3   Sex          200 non-null    object 
 4   Pclass       200 non-null    int64  
 5   Fare         200 non-null    float64
dtypes: float64(2), int64(2), object(2)
memory usage: 9.5+ KB


In [12]:
df.describe()

Unnamed: 0,PassengerId,Age,Pclass,Fare
count,200.0,182.0,200.0,200.0
mean,100.5,43.318681,2.035,251.2497
std,57.879185,22.078153,0.804369,146.847353
min,1.0,1.0,1.0,11.25
25%,50.75,25.25,1.0,115.55
50%,100.5,41.0,2.0,265.43
75%,150.25,63.0,3.0,371.4325
max,200.0,79.0,3.0,493.68


In [13]:
df.isnull().sum()

Unnamed: 0,0
PassengerId,0
Name,0
Age,18
Sex,0
Pclass,0
Fare,0


In [15]:
df["Age"].fillna(df["Age"].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].mean(), inplace=True)


In [16]:
df[df["Age"] > 30]
df[df["Sex"] == "female"]

Unnamed: 0,PassengerId,Name,Age,Sex,Pclass,Fare
3,4,Mr. John,63.000000,female,2,92.36
4,5,Dr. Smith,39.000000,female,1,346.02
5,6,Mrs. Anna,26.000000,female,2,304.88
8,9,Dr. Smith,29.000000,female,3,44.05
10,11,Mr. John,77.000000,female,3,247.52
...,...,...,...,...,...,...
186,187,Mr. John,43.318681,female,1,148.20
190,191,Miss Emma,68.000000,female,1,367.68
191,192,Miss Emma,78.000000,female,3,12.94
193,194,Dr. Smith,43.318681,female,1,25.29


In [17]:
df.groupby("Pclass")["Fare"].mean()

Unnamed: 0_level_0,Fare
Pclass,Unnamed: 1_level_1
1,237.222623
2,281.545775
3,232.200147


In [18]:
### Dataset 2
np.random.seed(2)
rows=150

In [19]:
df = pd.DataFrame({
    "StudentId": np.arange(1, rows + 1),
    "Age": np.random.randint(18, 30, rows),
    "Gender": np.random.choice(["male", "female"], rows),
    "Course": np.random.choice(["AI", "DS", "ML", "Python"], rows),
    "Marks": np.random.randint(35, 100, rows)
})


In [20]:
df

Unnamed: 0,StudentId,Age,Gender,Course,Marks
0,1,26,male,AI,65
1,2,26,female,DS,41
2,3,24,male,DS,56
3,4,29,female,ML,52
4,5,20,female,AI,72
...,...,...,...,...,...
145,146,25,female,ML,99
146,147,18,female,DS,64
147,148,23,female,AI,68
148,149,29,male,ML,83


In [21]:
df.sort_values(by="Marks", ascending=False)

Unnamed: 0,StudentId,Age,Gender,Course,Marks
47,48,26,female,DS,99
30,31,26,female,ML,99
140,141,26,female,DS,99
145,146,25,female,ML,99
19,20,22,female,Python,98
...,...,...,...,...,...
110,111,29,female,ML,38
40,41,28,female,ML,37
144,145,18,female,Python,37
14,15,22,female,ML,35


In [22]:
df["Gender"].value_counts()

Unnamed: 0_level_0,count
Gender,Unnamed: 1_level_1
female,84
male,66


In [23]:
df["Result"] = df["Marks"].apply(lambda x: "Pass" if x >= 50 else "Fail")

In [24]:
df.groupby("Course")["Marks"].mean()

Unnamed: 0_level_0,Marks
Course,Unnamed: 1_level_1
AI,68.185185
DS,64.611111
ML,62.125
Python,68.282051


In [25]:
## Dataset 3
np.random.seed(3)
rows=100

In [26]:
df = pd.DataFrame({
    "EmpId": np.arange(1, rows + 1),
    "Dept": np.random.choice(["HR", "IT", "Sales"], rows),
    "Salary": np.random.randint(30000, 150000, rows),
    "Gender": np.random.choice(["male", "female"], rows)
})

In [27]:
df

Unnamed: 0,EmpId,Dept,Salary,Gender
0,1,Sales,70552,male
1,2,HR,104023,female
2,3,IT,36429,male
3,4,HR,97248,female
4,5,HR,121145,male
...,...,...,...,...
95,96,HR,144135,male
96,97,IT,81105,male
97,98,IT,70316,male
98,99,IT,65157,female


In [28]:
df.rename(columns={"Dept": "Department"}, inplace=True)
df.drop("Gender", axis=1, inplace=True)

In [29]:
df.groupby("Department")["Salary"].agg(["mean", "max"])

Unnamed: 0_level_0,mean,max
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
HR,92193.230769,145060
IT,84817.242424,149292
Sales,95263.178571,147280
