# Data Wrangling

* Mapping
* Sampling
* Dummy Variables / One-Hot Encoding
    * Value counts
* Ordinal Encoding
* Joins
* Pivot tables
* Groupbys

## Mapping

In [88]:
import numpy as np
import pandas as pd
import seaborn as sns

# Small example frame: one row per item, with the food name and an amount in ounces.
meat_data = pd.DataFrame(
    data={
        "food": [
            'bacon',
            'pulled pork',
            'bacon',
            'pastrami',
            'corned beef',
            'bacon',
            'pastrami',
            'honey ham',
            'nova lox',
        ],
        # Mixing ints with 7.5 makes the whole column floating point.
        "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    }
)

# Bare last expression so the notebook renders the frame.
meat_data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [89]:
# Lookup table: food name -> animal it comes from.
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])

In [90]:
# Translate each food into its source animal via the lookup dict.
meat_data['animal'] = meat_data['food'].map(meat_to_animal)

# Series.map yields NaN for any food that is missing from the dict, so fail
# loudly here instead of silently carrying NaNs into the later cells.
assert meat_data['animal'].notna().all(), "some foods are missing from meat_to_animal"

meat_data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


## Sampling

In [91]:
# Random ordering of the row positions 0..n-1. The size is taken from the frame
# itself rather than a hard-coded 9, so the cell keeps working if rows are
# added to or removed from meat_data.
# Unseeded: the order changes on every run.
sample = np.random.permutation(len(meat_data))
sample

array([0, 1, 8, 5, 6, 7, 3, 2, 4])

In [92]:
# Reorder the rows by position, using the shuffled positions stored in `sample`.
meat_data.iloc[sample]

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
8,nova lox,6.0,salmon
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
3,pastrami,6.0,cow
2,bacon,12.0,pig
4,corned beef,7.5,cow


In [93]:
# Two ways to draw a random sample of 4 rows. They are independent draws, so the
# rows shown will generally differ between the two tables.
# Only the last expression of a cell is rendered automatically, so the first
# result is passed to display() explicitly; otherwise it would be computed and
# silently dropped.
display(meat_data.iloc[sample[:4]])  # first 4 positions of the shuffled order
meat_data.sample(n=4)  # pandas' built-in random sampling

Unnamed: 0,food,ounces,animal
4,corned beef,7.5,cow
5,bacon,8.0,pig
2,bacon,12.0,pig
0,bacon,4.0,pig


## Dummy Variables / One-Hot Encoding

In [94]:
# Show the current frame again as the starting point for one-hot encoding the 'animal' column.
meat_data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [95]:
# One-hot encode 'animal': one 0/1 indicator column per distinct animal.
# Integer indicators are requested from get_dummies directly instead of
# converting the result afterwards.
animals = pd.get_dummies(meat_data['animal'], dtype=int)
animals

Unnamed: 0,cow,pig,salmon
0,0,1,0
1,0,1,0
2,0,1,0
3,1,0,0
4,1,0,0
5,0,1,0
6,1,0,0
7,0,1,0
8,0,0,1


In [96]:
# Column sums of the 0/1 indicators = number of rows for each animal.
animals.sum()

cow       3
pig       5
salmon    1
dtype: int64

In [97]:
# Per-column standard deviation of the indicator columns.
# ddof=1 gives the sample standard deviation (divide by n-1).
# For the population version (divide by n), use: animals.std(ddof=0)
animals.std(ddof=1) # sample standard deviation (divide by n-1)

cow       0.500000
pig       0.527046
salmon    0.333333
dtype: float64

In [98]:
# Summary statistics of the indicator columns; the mean of a 0/1 column is the
# fraction of rows belonging to that animal.
animals.describe()

Unnamed: 0,cow,pig,salmon
count,9.0,9.0,9.0
mean,0.333333,0.555556,0.111111
std,0.5,0.527046,0.333333
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,1.0,0.0
75%,1.0,1.0,0.0
max,1.0,1.0,1.0


In [99]:
# Summary statistics for the frame; only the numeric 'ounces' column appears in the result.
meat_data.describe()

Unnamed: 0,ounces
count,9.0
mean,6.055556
std,2.855307
min,3.0
25%,4.0
50%,6.0
75%,7.5
max,12.0


In [100]:
# Load seaborn's "titanic" example dataset into a DataFrame.
# `sns` comes from the import cell at the top of the notebook, so every
# dependency is visible in one place.
titanic = sns.load_dataset('titanic')
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [101]:
# Distinct passenger-class values (unsorted: 3, 1, 2 in the output below).
titanic['pclass'].unique()

array([3, 1, 2], dtype=int64)

In [102]:
# Summary statistics, one table per passenger class.
for pclass in titanic['pclass'].unique():
    # Boolean mask selecting the passengers travelling in this class.
    in_this_class = titanic['pclass'] == pclass
    print(f"Class: {pclass}")
    display(titanic.loc[in_this_class].describe())

Class: 3


Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,491.0,491.0,355.0,491.0,491.0,491.0
mean,0.242363,3.0,25.14062,0.615071,0.393075,13.67555
std,0.428949,0.0,12.495398,1.374883,0.888861,11.778142
min,0.0,3.0,0.42,0.0,0.0,0.0
25%,0.0,3.0,18.0,0.0,0.0,7.75
50%,0.0,3.0,24.0,0.0,0.0,8.05
75%,0.0,3.0,32.0,1.0,0.0,15.5
max,1.0,3.0,74.0,8.0,6.0,69.55


Class: 1


Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,216.0,216.0,186.0,216.0,216.0,216.0
mean,0.62963,1.0,38.233441,0.416667,0.356481,84.154687
std,0.484026,0.0,14.802856,0.611898,0.693997,78.380373
min,0.0,1.0,0.92,0.0,0.0,0.0
25%,0.0,1.0,27.0,0.0,0.0,30.92395
50%,1.0,1.0,37.0,0.0,0.0,60.2875
75%,1.0,1.0,49.0,1.0,0.0,93.5
max,1.0,1.0,80.0,3.0,4.0,512.3292


Class: 2


Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,184.0,184.0,173.0,184.0,184.0,184.0
mean,0.472826,2.0,29.87763,0.402174,0.380435,20.662183
std,0.500623,0.0,14.001077,0.601633,0.690963,13.417399
min,0.0,2.0,0.67,0.0,0.0,0.0
25%,0.0,2.0,23.0,0.0,0.0,13.0
50%,0.0,2.0,29.0,0.0,0.0,14.25
75%,1.0,2.0,36.0,1.0,1.0,26.0
max,1.0,2.0,70.0,3.0,3.0,73.5


In [103]:
# Number of passengers in each class, largest group first.
titanic['pclass'].value_counts()

pclass
3    491
1    216
2    184
Name: count, dtype: int64

In [104]:
# Count how often each (pclass, survived) combination occurs.
titanic[['pclass', 'survived']].value_counts()

pclass  survived
3       0           372
1       1           136
3       1           119
2       0            97
        1            87
1       0            80
Name: count, dtype: int64

In [105]:
# Count how often each (pclass, survived, sex) combination occurs.
titanic[['pclass', 'survived', 'sex']].value_counts()

pclass  survived  sex   
3       0         male      300
1       1         female     91
2       0         male       91
1       0         male       77
3       0         female     72
        1         female     72
2       1         female     70
3       1         male       47
1       1         male       45
2       1         male       17
        0         female      6
1       0         female      3
Name: count, dtype: int64

In [106]:
# With no columns selected, value_counts counts identical whole rows.
# Every row of meat_data is distinct, so each count is 1.
meat_data.value_counts()

food         ounces  animal
bacon        4.0     pig       1
             8.0     pig       1
             12.0    pig       1
corned beef  7.5     cow       1
honey ham    5.0     pig       1
nova lox     6.0     salmon    1
pastrami     3.0     cow       1
             6.0     cow       1
pulled pork  3.0     pig       1
Name: count, dtype: int64

In [107]:
# How many times each food appears in the frame.
meat_data['food'].value_counts()

food
bacon          3
pastrami       2
pulled pork    1
corned beef    1
honey ham      1
nova lox       1
Name: count, dtype: int64

## Ordinal Encoding

In [108]:
# Attach a categorical quality label to every row (labels are listed in row order).
meat_data['Quality'] = np.array([
    'Great',       # 0: bacon
    'Poor',        # 1: pulled pork
    'Good',        # 2: bacon
    'Excellent',   # 3: pastrami
    'Disgusting',  # 4: corned beef
    'Great',       # 5: bacon
    'Excellent',   # 6: pastrami
    'Great',       # 7: honey ham
    'Good',        # 8: nova lox
])
meat_data

Unnamed: 0,food,ounces,animal,Quality
0,bacon,4.0,pig,Great
1,pulled pork,3.0,pig,Poor
2,bacon,12.0,pig,Good
3,pastrami,6.0,cow,Excellent
4,corned beef,7.5,cow,Disgusting
5,bacon,8.0,pig,Great
6,pastrami,3.0,cow,Excellent
7,honey ham,5.0,pig,Great
8,nova lox,6.0,salmon,Good


In [109]:
# Ordinal encoding: quality labels paired with scores, from best (4) down to worst (0).
rating = dict(zip(
    ['Excellent', 'Great', 'Good', 'Poor', 'Disgusting'],
    [4, 3, 2, 1, 0],
))

# The numeric score goes into a separate 'Rating' column instead of overwriting
# 'Quality' (as was done in class), so the original categorical label stays
# available next to its encoded value.
meat_data['Rating'] = meat_data['Quality'].map(rating)
meat_data

Unnamed: 0,food,ounces,animal,Quality,Rating
0,bacon,4.0,pig,Great,3
1,pulled pork,3.0,pig,Poor,1
2,bacon,12.0,pig,Good,2
3,pastrami,6.0,cow,Excellent,4
4,corned beef,7.5,cow,Disgusting,0
5,bacon,8.0,pig,Great,3
6,pastrami,3.0,cow,Excellent,4
7,honey ham,5.0,pig,Great,3
8,nova lox,6.0,salmon,Good,2
