In [1]:
import pandas as pd
import numpy as np

In [4]:
# 1. Merging with the Indicator Argument
# Two small frames whose "key" columns only partially overlap (B and C are shared).
df1 = pd.DataFrame({"key": ["A", "B", "C"], "value": [1, 2, 3]})
df2 = pd.DataFrame({"key": ["B", "C", "D"], "value": [4, 5, 6]})

# An outer join keeps every key from both sides; indicator=True appends a
# "_merge" column recording whether each row came from the left frame, the
# right frame, or both.
merged: pd.DataFrame = df1.merge(df2, on="key", how="outer", indicator=True)

print(merged)


  key  value_x  value_y      _merge
0   A      1.0      NaN   left_only
1   B      2.0      4.0        both
2   C      3.0      5.0        both
3   D      NaN      6.0  right_only


In [7]:
# 2. Custom chaining with pipe
# Sample order lines: units sold and the unit price for each line.
quantities = [10, 15, 10, 20]
prices = [100, 150, 200, 250]
df: pd.DataFrame = pd.DataFrame({"Quantity": quantities, "Price": prices})


def calc_total(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with a ``Total`` column equal to ``Quantity * Price``.

    Designed to be used as a ``DataFrame.pipe`` stage.

    Args:
        df: Frame containing ``Quantity`` and ``Price`` columns.

    Returns:
        A new DataFrame with all columns of ``df`` plus ``Total``. If ``df``
        already has a ``Total`` column, its values are replaced in the result.

    Raises:
        KeyError: If ``Quantity`` or ``Price`` is missing from ``df``.
    """
    # assign() builds a new frame instead of writing into the caller's object,
    # so re-running the cell (or reusing the input frame) never sees a leftover
    # "Total" column from a previous call.
    return df.assign(Total=df["Quantity"] * df["Price"])


# Pipeline: add the Total column, keep rows whose total exceeds 100,
# then order them from the largest total to the smallest.
result: pd.DataFrame = (
    df.pipe(calc_total)
    .query("Total > 100")
    .sort_values("Total", ascending=False)
)

# Replace the (now shuffled) original row labels with a fresh 1-based index.
index_range: pd.RangeIndex = pd.RangeIndex(1, len(result) + 1)
result.index = index_range
print(result)

# np.arange excludes the stop value, so 250 itself is not produced.
price = np.arange(100, 250, 50, dtype=np.int32)
print(price)

   Quantity  Price  Total
1        20    250   5000
2        15    150   2250
3        10    200   2000
4        10    100   1000
[100 150 200]


In [8]:
# 3. Identify Duplicates and Drop Duplicates
df = pd.DataFrame(
    {
        "ID": [1, 2, 2, 3, 4, 4],
        "Name": ["Alice", "Bob", "Bob", "Charlie", "David", "David"],
    }
)
print(df)

# keep=False flags every row whose ID occurs more than once, including the
# first occurrence, so the whole duplicate group is shown.
repeated_id_mask = df.duplicated("ID", keep=False)
duplicated: pd.DataFrame = df.loc[repeated_id_mask]
print(duplicated)

# With the default keep="first", only the earliest row for each ID survives.
drop_duplicates: pd.DataFrame = df.drop_duplicates("ID")
print(drop_duplicates)


   ID     Name
0   1    Alice
1   2      Bob
2   2      Bob
3   3  Charlie
4   4    David
5   4    David
   ID   Name
1   2    Bob
2   2    Bob
4   4  David
5   4  David
   ID     Name
0   1    Alice
1   2      Bob
3   3  Charlie
4   4    David


In [9]:
# 4. Binning Data with cut and qcut
data: dict[str, list[int]] = {
    "Age": [22, 25, 29, 34, 45, 52, 61, 70, 80, 90],
    "Income": [25000, 27000, 30000, 32000, 40000, 50000, 60000, 70000, 80000, 90000],
}
df = pd.DataFrame(data)
print(df)

# Age: bin on explicit, hand-picked edges (the intervals are NOT equal width).
# Four intervals between five edges, one label per interval.
age_bins: list[int] = [0, 18, 35, 60, 100]
age_labels: list[str] = ["Child", "Young Adult", "Adult", "Senior"]
age_group = pd.cut(df["Age"], bins=age_bins, labels=age_labels)
df["Age Group"] = age_group
print(df)

# Income: quantile-based bins, so the edges are derived from the data itself
# (q=4 splits the values into quartiles).
quartile_labels = ["Q1", "Q2", "Q3", "Q4"]
df["Income Quartile"] = pd.qcut(x=df["Income"], q=4, labels=quartile_labels)
print(df)


   Age  Income
0   22   25000
1   25   27000
2   29   30000
3   34   32000
4   45   40000
5   52   50000
6   61   60000
7   70   70000
8   80   80000
9   90   90000
   Age  Income    Age Group
0   22   25000  Young Adult
1   25   27000  Young Adult
2   29   30000  Young Adult
3   34   32000  Young Adult
4   45   40000        Adult
5   52   50000        Adult
6   61   60000       Senior
7   70   70000       Senior
8   80   80000       Senior
9   90   90000       Senior
   Age  Income    Age Group Income Quartile
0   22   25000  Young Adult              Q1
1   25   27000  Young Adult              Q1
2   29   30000  Young Adult              Q1
3   34   32000  Young Adult              Q2
4   45   40000        Adult              Q2
5   52   50000        Adult              Q3
6   61   60000       Senior              Q3
7   70   70000       Senior              Q4
8   80   80000       Senior              Q4
9   90   90000       Senior              Q4


In [10]:
# 5. Interpolating Data
# Five consecutive days with a two-day gap in the measurements.
dates = pd.date_range(start="1/1/2024", periods=5, freq="D")
df: pd.DataFrame = pd.DataFrame({"Date": dates, "Value": [1, np.nan, np.nan, 4, 5]})
print(df)

# Linear interpolation fills each NaN on a straight line between its known
# neighbours; the original "Value" column is left as-is for comparison.
filled = df["Value"].interpolate(method="linear")
df["Interpolated"] = filled
print(df)


        Date  Value
0 2024-01-01    1.0
1 2024-01-02    NaN
2 2024-01-03    NaN
3 2024-01-04    4.0
4 2024-01-05    5.0
        Date  Value  Interpolated
0 2024-01-01    1.0           1.0
1 2024-01-02    NaN           2.0
2 2024-01-03    NaN           3.0
3 2024-01-04    4.0           4.0
4 2024-01-05    5.0           5.0
