In [1]:
import numpy as np
import pandas as pd

In [2]:
# Frequency aliases accepted by pd.date_range / .resample() (pandas >= 2.2 spellings):
# | Purpose         | Code (New) | Example Output                   |
# |------------------|------------|----------------------------------|
# | Daily            | 'D'        | 2024-01-01, 2024-01-02, ...      |
# | Month-End        | 'ME'       | 2024-01-31, 2024-02-29, ...      |
# | Quarter-End      | 'QE-DEC'   | 2024-03-31, 2024-06-30, ...      |
# | Year-End         | 'YE'       | 2024-12-31, 2025-12-31, ...      |
# | Hourly           | 'h'        | 2024-01-01 00:00, 2024-01-01 01:00 |
# (uppercase 'H' is the old hourly alias, deprecated since pandas 2.2)

# Ten consecutive calendar days, each paired with one sales figure.
dates = pd.date_range(start="2025-01-01", periods=10, freq="D")
df_ts = pd.DataFrame(
    {
        "date": dates,
        "sales": [100, 120, 90, 150, 200, 130, 170, 180, 160, 190],
    }
)
df_ts

Unnamed: 0,date,sales
0,2025-01-01,100
1,2025-01-02,120
2,2025-01-03,90
3,2025-01-04,150
4,2025-01-05,200
5,2025-01-06,130
6,2025-01-07,170
7,2025-01-08,180
8,2025-01-09,160
9,2025-01-10,190


Date Parsing

In [3]:
# Convert the "date" column to datetime dtype, then make it the index so the
# frame is addressed by date.
# set_index() returns a new frame that is rebound to df_ts (no inplace=True).
# Note: running this cell a second time raises KeyError, because "date" is
# the index (no longer a column) after the first run.
df_ts["date"] = pd.to_datetime(df_ts["date"])
df_ts = df_ts.set_index("date")
print(df_ts)

            sales
date             
2025-01-01    100
2025-01-02    120
2025-01-03     90
2025-01-04    150
2025-01-05    200
2025-01-06    130
2025-01-07    170
2025-01-08    180
2025-01-09    160
2025-01-10    190


Resampling (change frequency of time series)

In [4]:
# Use .resample() to aggregate a date-indexed frame over coarser time periods.
# Daily to weekly (sum) - 'W' bins end on Sunday by default:
df_ts.resample('W').sum()

# Daily to monthly (sum) - 'ME' is the month-end alias ('M' is deprecated since pandas 2.2):
# df_ts.resample('ME').sum()

Unnamed: 0_level_0,sales
date,Unnamed: 1_level_1
2025-01-05,660
2025-01-12,830


Rolling Statistics

In [5]:
# Use .rolling() to apply moving windows (e.g., moving average).

# Note: with the default min_periods, the first (window - 1) rows are NaN.
# The three bare expressions below are evaluated but their results are not
# shown; only the final print() produces output for this cell.

# 3-day rolling mean:
df_ts['sales'].rolling(window=3).mean()

# 3-day rolling sum:
df_ts['sales'].rolling(window=3).sum()

# Resample to daily means, then take a 2-period rolling mean of the result:
df_ts['sales'].resample('D').mean().rolling(window=2).mean()

# Centered 3-day mean. min_periods=2 lets the first and last rows average
# the two values they have available instead of producing NaN.
df_ts['rolling_avg'] = df_ts['sales'].rolling(window=3, min_periods=2, center=True).mean()
print(df_ts)

            sales  rolling_avg
date                          
2025-01-01    100   110.000000
2025-01-02    120   103.333333
2025-01-03     90   120.000000
2025-01-04    150   146.666667
2025-01-05    200   160.000000
2025-01-06    130   166.666667
2025-01-07    170   160.000000
2025-01-08    180   170.000000
2025-01-09    160   176.666667
2025-01-10    190   175.000000


Pivot Tables & Cross-Tabulations in Pandas

In [6]:
# Pivot tables (pivot_table())
# A pivot table summarizes and aggregates data along one or more dimensions.

# General form:
# pd.pivot_table(data, index=..., columns=..., values=..., aggfunc=...)
# The method form df.pivot_table(...) used below is the same operation.

df = pd.DataFrame(
    {
        "Employee": ["Alice", "Bob", "Alice", "David", "Bob", "Alice"],
        "Department": ["HR", "IT", "HR", "Finance", "IT", "HR"],
        "Month": ["Jan", "Jan", "Feb", "Jan", "Feb", "Mar"],
        "Sales": [1000, 1500, 1200, 1300, 1700, 900],
    }
)

# Example 1: total sales per employee, followed by average sales per employee
df.pivot_table(index="Employee", values="Sales", aggfunc="sum")
df.pivot_table(index="Employee", values="Sales", aggfunc="mean")

# Example 2: sales by employee (rows) and month (columns); missing pairs become 0
df.pivot_table(
    index="Employee",
    columns="Month",
    values="Sales",
    aggfunc="sum",
    fill_value=0,
)

# Example 3: average sales by department (this last expression is the cell output)
df.pivot_table(index="Department", values="Sales", aggfunc="mean")

Unnamed: 0_level_0,Sales
Department,Unnamed: 1_level_1
Finance,1300.0
HR,1033.333333
IT,1600.0


In [7]:
# Cross-tabulation (pd.crosstab())
# A cross-tab (comparable to a contingency table in Excel) counts how often
# each combination of two or more categories occurs.

df = pd.DataFrame(
    {
        "Gender": ["Male", "Female", "Female", "Male", "Male", "Male"],
        "Department": ["IT", "HR", "Finance", "Finance", "IT", "HR"],
    }
)

# Example 1: number of people of each gender per department
pd.crosstab(index=df["Department"], columns=df["Gender"])

# Example 2: same counts plus row/column totals
pd.crosstab(index=df["Department"], columns=df["Gender"], margins=True)

# Example 3: proportions within each row (each department sums to 1)
pd.crosstab(index=df["Department"], columns=df["Gender"], normalize="index")

Gender,Female,Male
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
Finance,0.5,0.5
HR,0.5,0.5
IT,0.0,1.0


Window Functions in Pandas

In [8]:
# One week of daily sales, indexed by day.
data = {
    "Day": pd.date_range("2024-01-01", periods=7, freq="D"),
    "Sales": [100, 120, 90, 150, 200, 130, 170],
}

df = pd.DataFrame(data).set_index("Day")
# df

# .rolling() - fixed-size moving window
# df["Sales"].rolling(window=3).mean()
# df["Sales"].rolling(window=3).max()
# df["Sales"].rolling(window=3).std()

# .expanding() - window grows from the first row up to the current row
df["Expanding Mean"] = df["Sales"].expanding().mean()

# .ewm() - exponentially weighted average
# alpha is the smoothing factor:
df["Sales"].ewm(alpha=0.5).mean()

Day
2024-01-01    100.000000
2024-01-02    113.333333
2024-01-03    100.000000
2024-01-04    126.666667
2024-01-05    164.516129
2024-01-06    146.984127
2024-01-07    158.582677
Name: Sales, dtype: float64

Functions in Pandas
`apply()`, `map()`, and `applymap()` let you apply custom logic to your data using named functions or lambda expressions.

In [23]:
# Small example frame: one row per student with two numeric score columns.
df = pd.DataFrame(
    dict(
        Name=["Alice", "Bob", "Charlie"],
        Score1=[85, 92, 78],
        Score2=[88, 90, 82],
    )
)
df

Unnamed: 0,Name,Score1,Score2
0,Alice,85,88
1,Bob,92,90
2,Charlie,78,82


In [16]:
# apply() – Apply a function row-wise or column-wise
# Apply function across a column:
# df["Score1"].apply(lambda x : x * 2)

# Apply across rows (axis=1):
# df["Total"] = df.apply(lambda row:row["Score1"] + row["Score2"], axis=1)

In [22]:
# map() - element-wise transformation of a Series
# With a function:
# df["Name"].map(lambda name: name.upper())

# With a dictionary: each Score1 value is looked up as a key, and values
# that are not keys in the dictionary become NaN.
grade_map = {
    85: "B",
    92: "A",
    78: "C",
}
# df.drop(columns=["Score2"], inplace=True)
df["Grade1"] = df["Score1"].map(grade_map)
df

Unnamed: 0,Name,Score1,Grade1
0,Alice,85,B
1,Bob,92,A
2,Charlie,78,C


In [27]:
# applymap() - apply a function to every element of a DataFrame
# (DataFrame only; a Series uses .map() instead).
# DataFrame.applymap has been deprecated, so the call is kept for reference only:
# df[['Score1', 'Score2']].applymap(lambda x: x * 1.1)

# Same effect without applymap: apply() passes each column as a Series and
# Series.map() then visits every element of it.
# Caution: the result is written back over the source columns, so each
# re-run of this cell multiplies the scores by 1.1 again.
score_cols = ['Score1', 'Score2']
df[score_cols] = df[score_cols].apply(lambda col: col.map(lambda x: x * 1.1))
df

Unnamed: 0,Name,Score1,Score2
0,Alice,113.135,117.128
1,Bob,122.452,119.79
2,Charlie,103.818,109.142


## String Operations in Pandas
Pandas provides the .str accessor to work with string values in a Series (usually columns with text).

In [52]:
# Common .str accessor methods:
# | Method              | Description                           | Example                                                 |
# | ------------------- | ------------------------------------- | ------------------------------------------------------- |
# | `.str.lower()`      | Convert to lowercase                  | `df['Name'].str.lower()`                                |
# | `.str.upper()`      | Convert to uppercase                  | `df['Name'].str.upper()`                                |
# | `.str.title()`      | Capitalize first letter of each word  | `df['Name'].str.title()`                                |
# | `.str.strip()`      | Remove leading/trailing whitespace    | `df['Name'].str.strip()`                                |
# | `.str.contains()`   | Check for substring (returns boolean) | `df['Email'].str.contains('gmail')`                     |
# | `.str.startswith()` | Check if string starts with...        | `df['Email'].str.startswith('alice')`                   |
# | `.str.endswith()`   | Check if string ends with...          | `df['Email'].str.endswith('.com')`                      |
# | `.str.replace()`    | Replace substring                     | `df['Email'].str.replace('@gmail.com', '@example.com')` |
# | `.str.len()`        | Get string length                     | `df['Name'].str.len()`                                  |
# | `.str.split()`      | Split by delimiter (returns list)     | `df['Email'].str.split('@')`                            |

# Example frame: names in mixed casing plus one email address per person.
df = pd.DataFrame(
    {
        "Name": ["Alice", "Bob", "charlie", "DAVID"],
        "Email": [
            "alice@gmail.com",
            "bob@outlook.com",
            "charlie@gmail.com",
            "david@yahoo.com",
        ],
    }
)
df

Unnamed: 0,Name,Email
0,Alice,alice@gmail.com
1,Bob,bob@outlook.com
2,charlie,charlie@gmail.com
3,DAVID,david@yahoo.com


In [53]:
# Normalize every name to lowercase (overwrites the "Name" column)
df["Name"] = df["Name"].str.lower()

# Keep only the rows whose email contains "gmail"
# print(df[df["Email"].str.contains("gmail")])

# Swap the Gmail domain for the company domain
# df['Email'] = df['Email'].str.replace('@gmail.com', '@company.com')

# Pull the domain out of each email (the part after "@")
# df["Domain"] = df["Email"].str.split("@").str[1]

# 📌 Notes on .str.contains()
# Case-insensitive match (result is computed here but not displayed,
# because it is not the last expression of the cell):
df["Email"].str.contains("gmail", case=False)

# Treat the pattern as literal text rather than a regular expression:
# df['Email'].str.contains('gmail', regex=False)

df

Unnamed: 0,Name,Email
0,alice,alice@gmail.com
1,bob,bob@outlook.com
2,charlie,charlie@gmail.com
3,david,david@yahoo.com
