### Notes on Data Transformation in `pandas`
File: `pd_05_trans.ipynb` <br>
Xuhua Huang <br>
Last updated: August 28, 2022 <br>
Created on: August 28, 2022

In [1]:
import pandas as pd
import numpy as np

#### Removing duplicated values in `pd.DataFrame`

In [2]:
# Seven-row demo frame; the last row (index 6) is an exact copy of row 5,
# which gives the de-duplication examples one duplicate to work with.
data: pd.DataFrame = pd.DataFrame(
    dict(
        c1=['one', 'two', 'one', 'two', 'one', 'two', 'two'],
        c2=[1, 1, 2, 3, 3, 4, 4],
    )
)
data

Unnamed: 0,c1,c2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [3]:
# DataFrame.duplicated() yields a boolean Series: an entry is True when an
# identical row has already appeared at an earlier index
# (keep='first' is the default, spelled out here for clarity).
data.duplicated(keep='first')

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [4]:
# duplicated() hands back a pd.Series (one boolean per row), not a DataFrame
isinstance(data.duplicated(), pd.Series)

True

In [5]:
# DataFrame.drop_duplicates() builds a new DataFrame containing only the rows
# that duplicated() flags as False, i.e. the first occurrence of every row
# (keep='first' is the default, spelled out here for clarity).
data.drop_duplicates(keep='first')

Unnamed: 0,c1,c2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [6]:
# identity check: the frame returned by drop_duplicates() is a different object from `data`
data.drop_duplicates() is data

False

In [7]:
# With keep='last' the final occurrence of each duplicated row survives and the
# earlier ones are dropped: in the result, index 5 is gone while index 6 remains.
data.drop_duplicates(
    keep='last',
    subset=['c1', 'c2'],
)

Unnamed: 0,c1,c2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
6,two,4


In [8]:
# Re-bind `data` to the de-duplicated frame instead of passing inplace=True.
# `data` ends up with the same rows (index 5 dropped, index 6 kept), but the
# call now returns the frame rather than None, so it can be chained.
data = data.drop_duplicates(subset=['c1', 'c2'], keep='last')
data

Unnamed: 0,c1,c2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
6,two,4


#### Using a mapping, function, or lambda in `pd.Series.map()`

In [9]:
# Raw inputs used to construct the meat DataFrame below.
# Note that the food names mix upper and lower case
# ('bacon' vs 'Bacon', 'Pastrami' vs 'pastrami').
type_of_meat: list[str] = [
    'bacon', 'pulled pork', 'bacon',
    'Pastrami', 'corned beef', 'Bacon',
    'pastrami', 'honey ham', 'nova lox'
]
# Plain Python floats (not np.float16 scalars); every weight is written as a
# float literal so the list matches its annotation. pandas stores it as float64.
ounces: list[float] = [4.0, 3.0, 12.0, 6.0, 7.5, 8.0, 3.0, 5.0, 6.0]

meat_data: pd.DataFrame = pd.DataFrame({
    'food': type_of_meat,
    'ounces': ounces
})
meat_data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [10]:
# The casing in `type_of_meat` is inconsistent ('bacon' vs 'Bacon',
# 'Pastrami' vs 'pastrami'); pd.Series.str.lower() converts every
# element of the `food` column to lower case.
lower_cased: pd.Series = meat_data.loc[:, 'food'].str.lower()
lower_cased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [11]:
# .str.lower() yields another pd.Series, so Series methods such as .map() remain available
isinstance(lower_cased, pd.Series)

True

In [12]:
# unique() on the raw column is case-sensitive: 'bacon'/'Bacon' and
# 'Pastrami'/'pastrami' are reported as separate values
meat_data['food'].unique()

array(['bacon', 'pulled pork', 'Pastrami', 'corned beef', 'Bacon',
       'pastrami', 'honey ham', 'nova lox'], dtype=object)

In [13]:
# Lookup table (a plain dict, not a function) from a lower-case type of meat
# to the animal it comes from; built from ordered (meat, animal) pairs.
meat_to_animal_mapping: dict[str, str] = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])

In [14]:
# Look up each lower-cased food name in `meat_to_animal_mapping` and store the
# result as a new `animal` column on `meat_data`. The lookup uses `lower_cased`
# rather than the raw `food` column, so 'Pastrami' and 'Bacon' still resolve.
meat_data['animal'] = lower_cased.map(meat_to_animal_mapping)
meat_data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon
