# From Quiz - Data Transformation in Python

In [2]:
import pandas as pd

In [3]:
# Sample frame used by the quiz cells below: one row per person,
# with integer Age and Score columns.
data = pd.DataFrame(
    dict(
        Name=['Alice', 'Bob', 'Charlie', 'David'],
        Age=[24, 30, 35, 29],
        Score=[85, 90, 88, 92],
    )
)

### Question: Which of the following filters rows where Age is greater than 30?

In [4]:
# Boolean-mask indexing: keep only the rows where the Age comparison is True.
data[data["Age"]>30]

Unnamed: 0,Name,Age,Score
2,Charlie,35,88


In [5]:
# Boolean mask on Age passed to .loc as the row selector.
data.loc[data["Age"]>30]

Unnamed: 0,Name,Age,Score
2,Charlie,35,88


In [6]:
# data.query("Age">30)  <-- raises TypeError: Python evaluates "Age" > 30 (str vs. int) before query() is even called
data.query("Age > 30")  # works: the whole condition is passed to query() as a single string

Unnamed: 0,Name,Age,Score
2,Charlie,35,88


### Question: Which code selects only the Name and Score columns?

In [8]:
# A list of column labels inside [] selects just those columns.
data[["Name", "Score"]]

Unnamed: 0,Name,Score
0,Alice,85
1,Bob,90
2,Charlie,88
3,David,92


In [9]:
data.loc[:,["Name","Score"]]   # .loc[rows, columns]: ':' selects all rows; the list names the columns to keep

Unnamed: 0,Name,Score
0,Alice,85
1,Bob,90
2,Charlie,88
3,David,92


In [10]:
data.loc[[1],["Name","Score"]] # e.g. only the row labelled 1, restricted to the Name and Score columns

Unnamed: 0,Name,Score
1,Bob,90


In [11]:
# Positional selection: all rows, columns at positions 0 and 2 (Name and Score in this frame).
data.iloc[:,[0,2]]

Unnamed: 0,Name,Score
0,Alice,85
1,Bob,90
2,Charlie,88
3,David,92


In [12]:
# Frame supplied by the quiz: Bob has no Score (None), which pandas
# displays as a missing value in a float column.
n_data = pd.DataFrame(
    dict(
        Name=['Alice', 'Bob', 'Charlie', 'David'],
        Score=[85, None, 88, 92],
    )
)
n_data

Unnamed: 0,Name,Score
0,Alice,85.0
1,Bob,
2,Charlie,88.0
3,David,92.0


### Question: Which code calculates the sum of Score across all rows?

In [14]:
# Series.sum() on the single Score column returns one scalar total.
data["Score"].sum()

np.int64(355)

In [15]:
# The list selector keeps a one-column DataFrame, so sum() returns a Series indexed by column name.
data.loc[:,["Score"]].sum()

Score    355
dtype: int64

In [16]:
# Python's built-in sum() iterates over the Series values and adds them up.
sum(data["Score"])

355

### Question: How can you add a new column called Age_Group where values are 'Youth' if Age < 30, otherwise 'Adult'?

In [17]:
# List comprehension over the Age values: 'Youth' when age < 30, otherwise 'Adult'.
# Note: this adds a column to `data` in place, so later cells that display `data` include it.
data["Age_Group_A"] = ['Youth' if age < 30 else 'Adult' for age in data['Age']]
data

Unnamed: 0,Name,Age,Score,Age_Group_A
0,Alice,24,85,Youth
1,Bob,30,90,Adult
2,Charlie,35,88,Adult
3,David,29,92,Youth


In [18]:
# Series.apply calls the lambda once per Age value and collects the labels.
# Note: this also adds a column to `data` in place.
data["Age_Group_B"] = data["Age"].apply(lambda age: 'Youth' if age < 30 else 'Adult')
data

Unnamed: 0,Name,Age,Score,Age_Group_A,Age_Group_B
0,Alice,24,85,Youth,Youth
1,Bob,30,90,Adult,Adult
2,Charlie,35,88,Adult,Adult
3,David,29,92,Youth,Youth


In [19]:
# Row-wise variant: with axis=1 the lambda receives each row and reads its 'Age'.
data["Age_Group_C"] = data.apply(
    lambda row: 'Youth' if row['Age'] < 30 else 'Adult',
    axis=1,
)
data

Unnamed: 0,Name,Age,Score,Age_Group_A,Age_Group_B,Age_Group_C
0,Alice,24,85,Youth,Youth,Youth
1,Bob,30,90,Adult,Adult,Adult
2,Charlie,35,88,Adult,Adult,Adult
3,David,29,92,Youth,Youth,Youth


### Question: Which code sorts the rows in descending order of Score?

In [20]:
# Sorts whole rows by Score, highest first; the result is displayed but not assigned back to `data`.
data.sort_values(by="Score", ascending=False)

Unnamed: 0,Name,Age,Score,Age_Group_A,Age_Group_B,Age_Group_C
3,David,29,92,Youth,Youth,Youth
1,Bob,30,90,Adult,Adult,Adult
2,Charlie,35,88,Adult,Adult,Adult
0,Alice,24,85,Youth,Youth,Youth


In [21]:
# Sorts only the Score Series, so the result shows the scores with their row labels, not the full rows.
data.loc[:, 'Score'].sort_values(ascending=False)

3    92
1    90
2    88
0    85
Name: Score, dtype: int64

### Question: Which code calculates the mean of Values grouped by Category?

In [22]:
# Small two-category frame for the groupby question: three 'A' rows, two 'B' rows.
cat_data = pd.DataFrame(
    dict(
        Category=['A', 'B', 'A', 'B', 'A'],
        Values=[10, 20, 15, 25, 10],
    )
)
cat_data

Unnamed: 0,Category,Values
0,A,10
1,B,20
2,A,15
3,B,25
4,A,10


In [23]:
# Group by Category and average the remaining column; the result is a DataFrame indexed by Category.
cat_data.groupby("Category").mean()

Unnamed: 0_level_0,Values
Category,Unnamed: 1_level_1
A,11.666667
B,22.5


In [24]:
# Alternative spelling: list-form grouping key and agg('mean') in place of .mean().
cat_data.groupby(['Category']).agg('mean')

Unnamed: 0_level_0,Values
Category,Unnamed: 1_level_1
A,11.666667
B,22.5


In [25]:
# Selecting ['Values'] before mean() yields a Series of per-Category means rather than a DataFrame.
cat_data.groupby('Category')['Values'].mean()

Category
A    11.666667
B    22.500000
Name: Values, dtype: float64

### Question: Which code fills missing values in the Score column with 0?

In [26]:
# Working frame for the fillna question: Bob's Score is missing (None).
temp_data = pd.DataFrame(
    dict(
        Name=['Alice', 'Bob', 'Charlie', 'David'],
        Score=[85, None, 88, 92],
    )
)
temp_data

Unnamed: 0,Name,Score
0,Alice,85.0
1,Bob,
2,Charlie,88.0
3,David,92.0


In [28]:
temp_data.fillna({'Score': 0})  # returns a filled copy; the result is discarded here, so temp_data itself is unchanged
temp_data

Unnamed: 0,Name,Score
0,Alice,85.0
1,Bob,
2,Charlie,88.0
3,David,92.0


In [29]:
temp_data['Score'].fillna(0)  # also returns a new Series; without assignment temp_data still has the missing value
temp_data

Unnamed: 0,Name,Score
0,Alice,85.0
1,Bob,
2,Charlie,88.0
3,David,92.0


In [30]:
temp_data['Score'] = temp_data['Score'].fillna(0) # assigning the filled Series back is what actually updates temp_data
temp_data

Unnamed: 0,Name,Score
0,Alice,85.0
1,Bob,0.0
2,Charlie,88.0
3,David,92.0
