# Programming for Data Science Summary
## Chapter 05 - Data Wrangling with Pandas


In [2]:
import pandas as pd
import numpy as np
print("> Pandas Version:", pd.__version__)

# Country table in wide format, plus an independent copy to work on.
df_wide = pd.read_excel('./countries.xlsx')
df = df_wide.copy()

# For the Canadian rows, gather the monthly revenues of each subscription type
# into one Python list per group; the list-valued cells feed the explode() example.
netflix = pd.read_csv("netflix_dataset.csv")
df_explosive = df4 = pd.DataFrame(
    netflix.loc[netflix.Country == 'Canada']
    .groupby('Subscription Type')['Monthly Revenue']
    .apply(list)
)

> Pandas Version: 2.2.2


### Transformation from long to wide (and vice versa)

**GOAL.** To see ways to perform the transformation ${df}_\text{long} \leftrightarrow {df}_\text{wide}$

| Wide format | Long format |
| ----- | ----- |
| Columns per attribute | Column for subject, attribute and values |
| Rows per subject | Rows per subject-attribute |
| No repeated subjects but possible missing values | Repeated subjects but no missing values |
| <img src="https://preview.redd.it/reshaping-table-w-tens-of-millions-of-rows-from-long-to-wide-v0-qlpweqqts66a1.png?width=1334&format=png&auto=webp&s=9d7ccfef49690095f13afa0fb45cebbccc091cd1" width=400> | <img src="https://preview.redd.it/reshaping-table-w-tens-of-millions-of-rows-from-long-to-wide-v0-ijzw95ios66a1.png?width=1316&format=png&auto=webp&s=8aa3be9405c66da96e896a7fe6863564a673ebe2" width=450> |

In [24]:
# Standard functions
# pd.wide_to_long collapses every column named <stub><suffix> (here pop1980, pop2000, ...)
# into one column per stub and indexes the result by (i, j).
df_long = pd.wide_to_long(
    df_wide,            # Wide-format data
    i="country",        # Identifier column; becomes the first index level
    stubnames=["pop"],  # Shared prefix of the columns to collapse
    j="year",           # Name of the second index level, which holds the suffixes
)
# Columns that do not start with a stubname are left intact.

# Keep the result and pass it to display(): display() with no arguments renders nothing.
display(df_long.head(10))

In [19]:
# Melting and unmelting
# melt() keeps the id columns fixed and turns the listed value columns into
# (label, value) pairs, one row per id/label combination.
df_melted = pd.melt(
    df_wide,  # Data target
    id_vars=['country'],  # Fixed (identifier) column
    value_vars=['pop1980', 'pop2000', 'pop2010', 'pop2022', 'pop2023', 'pop2030', 'pop2050'],  # Columns to melt
    var_name="pop_year",  # Name of the new column holding the melted column labels
)  # The melted values land in a column called "value" (the default value_name)

# pivot() reverts the melt: one row per country, one column per melted label.
x = pd.pivot(
    df_melted,  # Typically a melted DataFrame when pivot is used this way
    index='country',  # Column whose values become the row index
    columns='pop_year',  # Column whose values become the new column labels
    values='value',  # Column that fills the cells; without it the columns become a MultiIndex
)

# display() with no arguments renders nothing, so pass the frames explicitly.
display(df_melted.head(10), x.head(10))

In [4]:
# Stacking and Unstacking
# stack() moves the column labels of the given column level (-1 = innermost, the default)
# into a new innermost row-index level, producing a long result.
df_stacked = df_wide.stack(level=-1)

# unstack() is the inverse: it moves the given row-index level back into the columns.
df_unstacked = df_stacked.unstack(level=-1)

# display() with no arguments renders nothing, so pass the results explicitly.
display(df_stacked.head(10), df_unstacked.head(10))

In [5]:
# Exploding
# explode() turns every element of a list-like cell into its own row,
# repeating the index label for each element.
df_exploded = df_explosive.explode('Monthly Revenue')

# display() with no arguments renders nothing, so pass the result explicitly.
display(df_exploded.head(10))

### Beautifying DataFrames
Style DataFrames so that patterns are easy to spot directly in the notebook, without reaching for a separate visualization tool.

In [6]:
# Map Applier

def mapper(value):
    """Return the CSS declaration that colours a single cell's text.

    Placeholder implementation: `value` is ignored and every cell gets the
    same colour. Replace the constant with logic that depends on `value`.
    """
    text_colour = '#FFFF00'  # HEX code; to be chosen from the cell value
    css_rule = f'color: {text_colour}'
    return css_rule

# Styler.map calls `mapper` on every cell and applies the CSS string it returns.
styled_map = df.style.map(mapper)

# display() with no arguments renders nothing, so pass the Styler explicitly.
display(styled_map)

In [7]:
# Highlighter
df_num = df.select_dtypes(exclude='O')  # Drop the object-dtype columns

styled_max = df_num.style.highlight_max(color='lightgreen')  # Highlight the maximum of every column
styled_min = df_num.style.highlight_min(color='red')  # Same, but with the minimum
styled_null = df_num.style.highlight_null(color='black')  # Same, but with missing values
styled_between = df_num.style.highlight_between(left=1, right=5, color='blue')  # Same, but with values inside an interval

# NOTE: to restrict any of these operations to certain columns, pass the optional argument subset=[...]

# display() with no arguments renders nothing, so pass the Stylers explicitly.
display(styled_max, styled_min, styled_null, styled_between)

In [8]:
# Bar
# Styler.bar draws an in-cell bar chart behind every value.
styled_bar = df_num.style.bar(
    align="mean",  # Reference point the bars are aligned to (here the mean)
    color=['#FF00FF', '#0000FF'],  # Two colours: first for negative bars, second for positive bars
)

# display() with no arguments renders nothing, so pass the Styler explicitly.
display(styled_bar)