In [112]:
# DATA WRANGLING
import pandas as pd



In [113]:
data = {
    "Student": ["Amina", "Brian", "Cathy", "Derrick", "Edna"],
    "Age": [21, None, 23, 22, None],
    "GPA": [3.0, 3.2, None, 3.5, 3.1]
}
df = pd.DataFrame(data)


In [114]:
df

Unnamed: 0,Student,Age,GPA
0,Amina,21.0,3.0
1,Brian,,3.2
2,Cathy,23.0,
3,Derrick,22.0,3.5
4,Edna,,3.1


In [115]:
# Drop rows with any missing values
without_values= df.dropna()




In [116]:
without_values

Unnamed: 0,Student,Age,GPA
0,Amina,21.0,3.0
3,Derrick,22.0,3.5


In [117]:
#resulting DataFrame.
df

Unnamed: 0,Student,Age,GPA
0,Amina,21.0,3.0
1,Brian,,3.2
2,Cathy,23.0,
3,Derrick,22.0,3.5
4,Edna,,3.1


In [118]:
#exercise 2.1.2
# fill with mean or median.
#a)
df_copy = df.copy()
df_copy['Age'].fillna(df_copy['Age'].mean(), inplace=True)


print(df_copy)


   Student   Age  GPA
0    Amina  21.0  3.0
1    Brian  22.0  3.2
2    Cathy  23.0  NaN
3  Derrick  22.0  3.5
4     Edna  22.0  3.1


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_copy['Age'].fillna(df_copy['Age'].mean(), inplace=True)


In [119]:
# b)	Fill missing GPA with the median GPA
df_copy['GPA'].fillna(df_copy['GPA'].median(), inplace=True)

print(df_copy)


   Student   Age   GPA
0    Amina  21.0  3.00
1    Brian  22.0  3.20
2    Cathy  23.0  3.15
3  Derrick  22.0  3.50
4     Edna  22.0  3.10


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_copy['GPA'].fillna(df_copy['GPA'].median(), inplace=True)


In [122]:
#the updated DataFrame
df_copy

Unnamed: 0,Student,Age,GPA
0,Amina,21.0,3.0
1,Brian,22.0,3.2
2,Cathy,23.0,3.15
3,Derrick,22.0,3.5
4,Edna,22.0,3.1


In [124]:
#Exercise 2.1.3: Check and Count Nulls
missing_counts = df.isnull().sum()

print(missing_counts)


Student    0
Age        2
GPA        1
dtype: int64


In [None]:
# age column has the most missing values
# Action: Fill with appropriate strategy based on data type(mean age)


In [2]:
# EXERCISE 2.2.1:Detect Outliers

import pandas as pd

df = pd.DataFrame({
    "Salary": [4000, 4200, 4100, 80000, 4300, 4150]
})

# Calculate Q1 (25th percentile) and Q3 (75th percentile)
Q1 = df["Salary"].quantile(0.25)
Q3 = df["Salary"].quantile(0.75)
IQR = Q3 - Q1

# Determine the outlier bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Detect outliers
outliers = df[(df["Salary"] < lower_bound) | (df["Salary"] > upper_bound)]
print("Outliers:\n", outliers)

Outliers:
    Salary
3   80000


In [5]:
#Exercise 2.2.2: Correct Implausible Values
df = pd.DataFrame({
    "Age": [25, 30, -5, 1000, 40]
})
print(df)

    Age
0    25
1    30
2    -5
3  1000
4    40


In [8]:
#•	Replace negative and >120 values in the Age column with the median age
import pandas as pd

df = pd.DataFrame({
    "Age": [25, 30, -5, 1000, 40]
})

# Identify valid ages (between 0 and 120 inclusive)
valid_ages = df["Age"][(df["Age"] >= 0) & (df["Age"] <= 120)]

# Calculate the median of valid ages
median_age = valid_ages.median()

# Replace invalid ages with the median
df["Age"] = df["Age"].apply(lambda x: median_age if x < 0 or x > 120 else x)

print(df)

    Age
0  25.0
1  30.0
2  30.0
3  30.0
4  40.0


In [9]:
#Exercise 2.2.3: Format Correction
import pandas as pd

df = pd.DataFrame({
    "Age": ["25", "30", "Twenty-Five", "45", "32"]
})

# Convert valid string numbers to integers, invalid strings to NaN
df["Age"] = pd.to_numeric(df["Age"], errors='coerce')

print(df)

    Age
0  25.0
1  30.0
2   NaN
3  45.0
4  32.0


In [10]:
#2.3. Transforming Quantitative Variables
#Exercise 2.3.1: Normalize Values
import pandas as pd

df = pd.DataFrame({
    "Income": [20000, 30000, 25000, 40000]
})

# Min-max normalization
df["Normalized_Income"] = (df["Income"] - df["Income"].min()) / (df["Income"].max() - df["Income"].min())

print(df)

   Income  Normalized_Income
0   20000               0.00
1   30000               0.50
2   25000               0.25
3   40000               1.00


In [11]:
#Exercise 2.3.2: Convert to Categories
import pandas as pd

df = pd.DataFrame({
    "Age": [5, 20, 67, 15, 40, 80]
})

# Define bins and labels
bins = [0, 11, 59, float('inf')]
labels = ['Child', 'Adult', 'Senior']

# Apply categorization
df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=labels, right=True, include_lowest=True)

print(df)

   Age Age_Group
0    5     Child
1   20     Adult
2   67    Senior
3   15     Adult
4   40     Adult
5   80    Senior


In [16]:
#Exercise 2.3.3: Simplify Large Values
import pandas as pd

df = pd.DataFrame({
    "Income": [20000, 30000, 25000, 40000]
})
df["Income_k"] = df["Income"] / 1000
print(df)

   Income  Income_k
0   20000      20.0
1   30000      30.0
2   25000      25.0
3   40000      40.0
