# Imports

In [2]:
import pandas as pd

# Topics

## Chaining operations in Pandas

In [2]:
# Sample DataFrame
data = {'Name': ['John', 'Anna', 'Peter', 'Linda'],
        'Age': [28, 34, 45, 32],
        'Income': [50000, 60000, 80000, 75000]}
df = pd.DataFrame(data)

# Method chaining example: Filtering and sorting data
result = (
    df
    .loc[df['Age'] > 30]  # Filter rows where Age > 30
    .sort_values(by='Income', ascending=False)  # Sort by Income in descending order
)



In [3]:
result

Unnamed: 0,Name,Age,Income
2,Peter,45,80000
3,Linda,32,75000
1,Anna,34,60000


## Groupby operations in Pandas

In [4]:
data = {'Category': ['A', 'B', 'A', 'B', 'A'],
        'Value': [10, 20, 30, 40, 50]}
df = pd.DataFrame(data)

# Grouping data by 'Category' and calculating the sum of 'Value'
grouped_data = df.groupby('Category').sum()

In [5]:
grouped_data

Unnamed: 0_level_0,Value
Category,Unnamed: 1_level_1
A,90
B,60


## Commanding Time Series Data with Finesse

In [5]:

# Sample time series data
date_range = pd.date_range(start='1/1/2023', end='12/31/2028', freq='D')
traffic_data = pd.Series(range(len(date_range)), index=date_range)
print(traffic_data[:5])


2023-01-01    0
2023-01-02    1
2023-01-03    2
2023-01-04    3
2023-01-05    4
Freq: D, dtype: int64


In [6]:
# Resampling and frequency conversion for monthly analysis
yearly_traffic = traffic_data.resample('YE').sum()
print(yearly_traffic)

2023-12-31     66430
2024-12-31    200385
2025-12-31    333245
2026-12-31    466470
2027-12-31    599695
2028-12-31    735111
Freq: YE-DEC, dtype: int64


*we resampled the data into monthly intervals using the resample function*
*having calculated the sum of traffic for each month*

## Multi-level indexing in Pandas

In [7]:
index = pd.MultiIndex.from_tuples(
    [('A', 1), ('A', 2), ('B', 1), ('B', 2)],
    names=['Category', 'Number']
)

df = pd.DataFrame({
    'Value': [10, 20, 30, 40]
}, index=index)

df

Unnamed: 0_level_0,Unnamed: 1_level_0,Value
Category,Number,Unnamed: 2_level_1
A,1,10
A,2,20
B,1,30
B,2,40


In [9]:
# or

data = {
    'Category': ['A', 'A', 'B', 'B'],
    'Number': [1, 2, 1, 2],
    'Value': [10, 20, 30, 40]
}
df = pd.DataFrame(data)
df.set_index(['Category', 'Number'], inplace=True)

df

Unnamed: 0_level_0,Unnamed: 1_level_0,Value
Category,Number,Unnamed: 2_level_1
A,1,10
A,2,20
B,1,30
B,2,40


In [10]:
print(df.swaplevel())

                 Value
Number Category       
1      A            10
2      A            20
1      B            30
2      B            40


In [11]:
print(df.reset_index())

  Category  Number  Value
0        A       1     10
1        A       2     20
2        B       1     30
3        B       2     40


In [14]:
print(df.loc['A']) 


print(df.loc[('A', 1)])

        Value
Number       
1          10
2          20
Value    10
Name: (A, 1), dtype: int64


In [13]:
print(df.groupby(level=['Category']).sum())

          Value
Category       
A            30
B            70


In [15]:
print(df.groupby(level=['Number']).sum())

        Value
Number       
1          40
2          60


## Pandas to Markdown 

In [3]:
df = pd.DataFrame({'a': [1, 2, 3, 4],
                   'b': [5, 6, 7, 8]})

# You can control the printing of the index column by using the flag index.
# make sure to pip install tabulate
print(df.to_markdown(index=True))

|    |   a |   b |
|---:|----:|----:|
|  0 |   1 |   5 |
|  1 |   2 |   6 |
|  2 |   3 |   7 |
|  3 |   4 |   8 |


How it looks like in MD 

|    |   a |   b |
|---:|----:|----:|
|  0 |   1 |   5 |
|  1 |   2 |   6 |
|  2 |   3 |   7 |
|  3 |   4 |   8 |

## Group Rows into list

In [6]:
df = pd.DataFrame(
    {
        "col1": [1, 2, 3, 4, 3],
        "col2": ["a", "a", "b", "b", "c"],
        "col3": ["d", "e", "f", "g", "h"],
    }
)

# Group by col2
print(df.groupby(["col2"]).agg(
    {
        "col1": "mean",           # get mean
        "col3": lambda x: list(x) # get list
    }
))

      col1    col3
col2              
a      1.5  [d, e]
b      3.5  [f, g]
c      3.0     [h]


## Dataframe.explode()

In [7]:
df = pd.DataFrame({"a": ["1,2", "4,5"],
                   "b": [11, 13]})

# Turn strings into lists
df.a = df.a.str.split(",")
print(df)

        a   b
0  [1, 2]  11
1  [4, 5]  13


In [8]:
df.explode("a", ignore_index=False)

Unnamed: 0,a,b
0,1,11
0,2,11
1,4,13
1,5,13


## Groupby().count vs Groupby( ).size

In [9]:
df = pd.DataFrame(
    {
        "col1": ["a", "b", "b", "c", "c", "d"],
        "col2": ["S", "S", "M", "L", "L", "L"]
    }
)

# get the count of elements in one column
df.groupby(["col1"]).count()

Unnamed: 0_level_0,col2
col1,Unnamed: 1_level_1
a,1
b,2
c,2
d,1


In [10]:
df.groupby(["col1", "col2"]).size()

col1  col2
a     S       1
b     M       1
      S       1
c     L       2
d     L       1
dtype: int64

## DF.corwith()

In [11]:
df1 = pd.DataFrame({
    "a": [1, 2, 3, 4],
    "b": [2, 3, 4, 6]
})

df2 = pd.DataFrame({
    "a": [1, 2, 3, 3],
    "b": [2, 2, 5, 4]
})

df1.corrwith(df2)

a    0.94388
b    0.68313
dtype: float64

## Cross Tabulation 2 

In [12]:
network = [
    ("Ben", "Smith"),
    ("Ben", "Patrick"),
    ("Warren", "Jone"),
    ("Warren", "Smith"),
    ("Smith", "Patrick"),
]

# Create a dataframe of the network
friends1 = pd.DataFrame(
    network, columns=["person1", "person2"]
)

# Create the order of the columns
friends2 = pd.DataFrame(
    network, columns=["person2", "person1"]
)

# Create a symmetric dataframe
friends = pd.concat([friends1, friends2])

# Create a cross tabulation
pd.crosstab(friends.person1, friends.person2)

person2,Ben,Jone,Patrick,Smith,Warren
person1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ben,0,0,1,1,0
Jone,0,0,0,0,1
Patrick,1,0,0,1,0
Smith,1,0,1,0,1
Warren,0,1,0,1,0


## Unpivot (melt)

In [13]:
df = pd.DataFrame({
    "fruit": ["apple", "orange"],
    "Aldi": [4, 5],
    "Walmart": [6, 7],
    "Costco": [1, 2]
})

df

Unnamed: 0,fruit,Aldi,Walmart,Costco
0,apple,4,6,1
1,orange,5,7,2


In [14]:
df.melt(id_vars=["fruit"],
        value_vars=["Aldi", "Walmart", "Costco"],
        var_name='store')

Unnamed: 0,fruit,store,value
0,apple,Aldi,4
1,orange,Aldi,5
2,apple,Walmart,6
3,orange,Walmart,7
4,apple,Costco,1
5,orange,Costco,2


## Rename aggregated column

In [15]:
df = pd.DataFrame({"size": ["S", "S", "M", "L"],
                   "price": [44, 29.99, 10, 19]})

df.groupby('size').agg({'price': 'mean'})

Unnamed: 0_level_0,price
size,Unnamed: 1_level_1
L,19.0
M,10.0
S,36.995


## Normalized value counts

In [16]:
size = pd.Series(["S", "S", "M", "L", "S", "XL", "S", "M",])

# Get count of each value
size.value_counts(normalize=True)

S     0.500
M     0.250
L     0.125
XL    0.125
Name: proportion, dtype: float64

## Fill in NULL Values

In [17]:
store1 = pd.DataFrame({
    "orange": [None, 5, 9],
    "apple": [4, None, 12]
})

store2 = pd.DataFrame({
    "orange": [31, 52, 91],
    "apple": [11, 71, 21]
})

# Fill null values of the store1 with values at the same
# locations from store2
store1.combine_first(store2)

Unnamed: 0,orange,apple
0,31.0,4.0
1,5.0,71.0
2,9.0,12.0


## Value_Count missing values

In [18]:
size = pd.Series(["S", "S", None, "M", "L", "S", None, "XL", "S", "M",])

# Get count of each value, it does not count missing values
size.value_counts()


# pass dropna=False to get missing value count
size.value_counts(dropna=False)

S       4
None    2
M       2
L       1
XL      1
Name: count, dtype: int64

## Filter Columns 

In [20]:
df = pd.DataFrame({'Temp': ['Hot', 'Cold', 'Warm', 'Cold'],
                   'Degree': [35, 3, 15, 2]})
print(df)

df = pd.get_dummies(df, columns=['Temp'],dtype=int)
print(df)

print(df.filter(like='Temp', axis=1))


   Temp  Degree
0   Hot      35
1  Cold       3
2  Warm      15
3  Cold       2
   Degree  Temp_Cold  Temp_Hot  Temp_Warm
0      35          0         1          0
1       3          1         0          0
2      15          0         0          1
3       2          1         0          0
   Temp_Cold  Temp_Hot  Temp_Warm
0          0         1          0
1          1         0          0
2          0         0          1
3          1         0          0


## Multiple columns assign using assing

In [22]:
time_sentences = ["Saturday: Weekend (Not working day)",
                  "Sunday: Weekend (Not working day)",
                  "Monday: Doctor appointment at 2:45pm.",
                  "Tuesday: Dentist appointment at 11:30 am.",
                  "Wednesday: basketball game At 7:00pm",
                  "Thursday: Back home by 11:15 pm.",
                  "Friday: Take the train at 08:10 am."]

df = pd.DataFrame(time_sentences, columns=['text'])

# Use Assign instead of using direct assignment 
# df['text'] = df.text.str.lower()
# df['text_len'] = df.text.str.len()
# df['word_count'] = df.text.str.count(" ") + 1
# df['weekend'] = df.text.str.contains("saturday|sunday", case=False)
print((
    df
    .assign(text=df.text.str.lower(),
            text_len=df.text.str.len(),
            word_count=df.text.str.count(" ") + 1,
            weekend=df.text.str.contains("saturday|friday", case=False),
           )
))

                                        text  text_len  word_count  weekend
0        saturday: weekend (not working day)        35           5     True
1          sunday: weekend (not working day)        33           5    False
2      monday: doctor appointment at 2:45pm.        37           5    False
3  tuesday: dentist appointment at 11:30 am.        41           6    False
4       wednesday: basketball game at 7:00pm        36           5    False
5           thursday: back home by 11:15 pm.        32           6    False
6        friday: take the train at 08:10 am.        35           7     True


## Reading html tables

In [25]:
# Without a marcher we will get a list of all tables in the 
# page. To make a table selection, pass table title to the 
# match parameter
table = pd.read_html(
    "https://en.wikipedia.org/wiki/Minnesota", 
    match="Average daily"
)

print(table[0].head())

      Location July (°F) July (°C) January (°F) January (°C)
0  Minneapolis     83/64     28/18         23/7       −4/−13
1   Saint Paul     83/63     28/17         23/6       −5/−14
2    Rochester     82/63     28/17         23/3       −5/−16
3       Duluth     76/55     24/13         19/1       −7/−17
4    St. Cloud     81/58     27/14        18/−1       −7/−18


## Ranking 

In [27]:
df = pd.DataFrame({'Students': ['John', 'Smith', 'Patrick', 'Bob', 'Jose'],
                   'Marks': [80, 56, 95, 75, 45]})
print(df)

df["Rank"] = df["Marks"].rank(ascending=False).astype(int)
print(df)

  Students  Marks
0     John     80
1    Smith     56
2  Patrick     95
3      Bob     75
4     Jose     45
  Students  Marks  Rank
0     John     80     2
1    Smith     56     4
2  Patrick     95     1
3      Bob     75     3
4     Jose     45     5


## Color Values in DataFrame

In [28]:
df = pd.DataFrame({'Students': ['John', 'Smith', 'Patrick', 'Bob', 'Jose'],
                   'Physics': [80, 56, 95, 75, 45], 
                   'Mathematics': [90, 85, 55, 65, 75]})
df.set_index('Students', inplace=True)




In [30]:
def pass_condition(val):
    color = 'blue' if val > 70 else 'red'
    return f"background-color: {color}"

df.style.map(pass_condition)

Unnamed: 0_level_0,Physics,Mathematics
Students,Unnamed: 1_level_1,Unnamed: 2_level_1
John,80,90
Smith,56,85
Patrick,95,55
Bob,75,65
Jose,45,75
