# Pandas
## Learning Objectives
    1. Loading a comma-seprated-value(CSV) dataset
    2. Grouping data by value
    3. Creating pivot tables
    4. Relational Operation

In [2]:
# imports
import pandas as pd

In [5]:
# read the file
tips_df = pd.read_csv('tips.csv')
tips_df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


### Group by

In [6]:
help(tips.groupby)

Help on method groupby in module pandas.core.frame:

groupby(by=None, axis: 'Axis' = 0, level: 'Level | None' = None, as_index: 'bool' = True, sort: 'bool' = True, group_keys: 'bool' = True, squeeze: 'bool | lib.NoDefault' = <no_default>, observed: 'bool' = False, dropna: 'bool' = True) -> 'DataFrameGroupBy' method of pandas.core.frame.DataFrame instance
    Group DataFrame using a mapper or by a Series of columns.
    
    A groupby operation involves some combination of splitting the
    object, applying a function, and combining the results. This can be
    used to group large amounts of data and compute operations on these
    groups.
    
    Parameters
    ----------
    by : mapping, function, label, or list of labels
        Used to determine the groups for the groupby.
        If ``by`` is a function, it's called on each value of the object's
        index. If a dict or Series is passed, the Series or dict VALUES
        will be used to determine the groups (the Series' values

In [8]:
# grouping based on gender and applying mean
# mean is only applied to numeric values, therefor, Time is not included here.
mean = tips_df.groupby(['sex']).mean()
mean

Unnamed: 0_level_0,total_bill,tip,size
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,18.056897,2.833448,2.45977
Male,20.744076,3.089618,2.630573


In [9]:
mean.index

Index(['Female', 'Male'], dtype='object', name='sex')

    Group and then filter before grouping
    the follwing sample groups based on gender and only shows those that their tip is higher than 2.9 ...

In [14]:
# the women are removed from the output!!
tips_df.groupby('sex').filter(lambda k : k['tip'].mean() > 2.9).mean()

  tips_df.groupby('sex').filter(lambda k : k['tip'].mean() > 2.9).mean()


total_bill    20.744076
tip            3.089618
size           2.630573
dtype: float64

In [12]:
# nested group-by
tips.groupby('sex').filter(lambda k : k['tip'].mean() > 2.9).groupby('sex').mean()

Unnamed: 0_level_0,total_bill,tip,size
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,20.744076,3.089618,2.630573


In [13]:
# multi dimensional group-by
tips.groupby(['sex','smoker']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,size
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,No,18.105185,2.773519,2.592593
Female,Yes,17.977879,2.931515,2.242424
Male,No,19.791237,3.113402,2.71134
Male,Yes,22.2845,3.051167,2.5


In [34]:
# this is a pivot table for tips dataframe showing the total_bill and
# grouped by sex and smoker
pd.pivot_table(tips,'total_bill','sex','smoker')

smoker,No,Yes
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,18.105185,17.977879
Male,19.791237,22.2845


### Pivote Table
    this is a pivot table for tips dataframe showing the total_bill which is
    grouped by sex and smoker in one dimension and by day and time in the other

In [32]:
pd.pivot_table(tips,'total_bill', ['sex','smoker'], ['day','time'])

Unnamed: 0_level_0,day,Fri,Fri,Sat,Sun,Thur,Thur
Unnamed: 0_level_1,time,Dinner,Lunch,Dinner,Dinner,Dinner,Lunch
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Female,No,22.75,15.98,19.003846,20.824286,18.78,15.899167
Female,Yes,12.2,13.26,20.266667,16.54,,19.218571
Male,No,17.475,,19.929063,20.403256,,18.4865
Male,Yes,25.892,11.386667,21.837778,26.141333,,19.171


### Relational Operation

In [15]:
dummy_data_1 = {
        'id':       ['1', '2', '3', '4', '5'],
        'Feature_1': ['A', 'C', 'E', 'G', 'I'],
        'Feature_2': ['B', 'D', 'F', 'H', 'J']
}
df1 = pd.DataFrame(dummy_data_1)
df1

Unnamed: 0,id,Feature_1,Feature_2
0,1,A,B
1,2,C,D
2,3,E,F
3,4,G,H
4,5,I,J


In [16]:
dummy_data_2 = {
        'id':       ['1', '2', '6', '7', '8'],
        'Feature_1': ['K', 'M', 'O', 'Q', 'S'],
        'Feature_2': ['L', 'N', 'P', 'R', 'T']}
df2 = pd.DataFrame(dummy_data_2)
df2

Unnamed: 0,id,Feature_1,Feature_2
0,1,K,L
1,2,M,N
2,6,O,P
3,7,Q,R
4,8,S,T


In [17]:
dummy_data_3 = {
        'id': ['1', '2', '3', '4', '5', '7', '8', '9', '10', '11'],
        'Feature_3': [12, 13, 14, 15, 16, 17, 15, 12, 13, 23]}
df3 = pd.DataFrame(dummy_data_3, columns = ['id', 'Feature_3'])
df3

Unnamed: 0,id,Feature_3
0,1,12
1,2,13
2,3,14
3,4,15
4,5,16
5,7,17
6,8,15
7,9,12
8,10,13
9,11,23


### merge

In [18]:
df_merge_col = pd.merge(df1, df2, on='id')

df_merge_col

NameError: name 'w' is not defined

In [19]:
print("specifying keys for each dataframe separately!")
df_merge_difkey = pd.merge(df_row, df3, left_on='id', right_on='id')

df_merge_difkey

specifying keys for each dataframe separately!


NameError: name 'df_row' is not defined

In [None]:
df_outer = pd.merge(df1, df2, on='id', how='outer')

df_outer

In [None]:
df_inner = pd.merge(df1, df2, on='id', how='inner')

df_inner

In [None]:
df_right = pd.merge(df1, df2, on='id', how='right')

df_right

In [None]:
df_left = pd.merge(df1, df2, on='id', how='left')

df_left


*:)*