# Pandas Review

In [1]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

# Read the tips file, then overwrite its column labels with short names.
tips_columns = ['bill', 'tip', 'gender', 'smoker', 'day', 'time', 'size', 'fraction']
data = pd.read_csv('./data/tips.csv')
data.columns = tips_columns
print(data.shape)
data.head()

(244, 8)


Unnamed: 0,bill,tip,gender,smoker,day,time,size,fraction
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


In [2]:
# Generate a random "first last" name for every row.
# A dedicated, seeded generator makes the names identical on every
# Restart & Run All, without touching the global `random` state.
NAME_SEED = 42
name_rng = random.Random(NAME_SEED)
first = ['tom', 'dick', 'harry', 'helen', 'jane', 'ema']
last = ['smith', 'brown', 'jones', 'green', 'white', 'watson']
data['name'] = [
    '{} {}'.format(name_rng.choice(first), name_rng.choice(last))
    for _ in range(len(data))
]

print(data.shape)
data.head()

(244, 9)


Unnamed: 0,bill,tip,gender,smoker,day,time,size,fraction,name
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,jane green
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,jane white
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,dick white
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,dick watson
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,harry smith


In [3]:
# Print a summary of the frame: index range, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 9 columns):
bill        244 non-null float64
tip         244 non-null float64
gender      244 non-null object
smoker      244 non-null object
day         244 non-null object
time        244 non-null object
size        244 non-null int64
fraction    244 non-null float64
name        244 non-null object
dtypes: float64(3), int64(1), object(5)
memory usage: 17.2+ KB


In [4]:
# Split the full name on spaces once, then take the first and last token.
# The vectorised .str accessor replaces a Python-level lambda per row, and
# bracket access avoids relying on `data.name` resolving to the column.
name_parts = data['name'].str.split(' ')
data['first_name'] = name_parts.str[0]
data['last_name'] = name_parts.str[-1]

print(data.shape)
data.head()

(244, 11)


Unnamed: 0,bill,tip,gender,smoker,day,time,size,fraction,name,first_name,last_name
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,jane green,jane,green
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,jane white,jane,white
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,dick white,dick,white
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,dick watson,dick,watson
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,harry smith,harry,smith


In [8]:
# tax is 0.3 x bill; total is bill + tip + tax for each row.
data['tax'] = data['bill'] * 0.3
data['total'] = data['bill'] + data['tip'] + data['tax']

print(data.shape)
data.head()

(244, 13)


Unnamed: 0,bill,tip,gender,smoker,day,time,size,fraction,name,first_name,last_name,tax,total
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,jane green,jane,green,5.097,23.097
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,jane white,jane,white,3.102,15.102
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,dick white,dick,white,6.303,30.813
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,dick watson,dick,watson,7.104,34.094
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,harry smith,harry,smith,7.377,35.577


In [12]:
# Flag the rows where time == 'Lunch'.
# Comparing the whole column at once yields the same boolean column as
# applying a `True if ... else False` lambda to every element.
data['time_tax'] = data['time'] == 'Lunch'

data.head()

Unnamed: 0,bill,tip,gender,smoker,day,time,size,fraction,name,first_name,last_name,tax,total,time_tax
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,jane green,jane,green,5.097,23.097,False
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,jane white,jane,white,3.102,15.102,False
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,dick white,dick,white,6.303,30.813,False
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,dick watson,dick,watson,7.104,34.094,False
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,harry smith,harry,smith,7.377,35.577,False


In [13]:
# Passing axis=1 to apply() hands each whole row to the function.
def meal_tax(row):
    """Return 0.2 x the row's bill when its time_tax flag is set, otherwise 0."""
    if row['time_tax']:
        return row['bill'] * 0.2
    return 0


data['meal_tax'] = data.apply(meal_tax, axis=1)
print(data['meal_tax'].sum())
charged = data['meal_tax'] > 0
df = data[charged]  # only the rows where the meal_tax > 0

print(df.shape)
df.head()

233.494
(68, 15)


Unnamed: 0,bill,tip,gender,smoker,day,time,size,fraction,name,first_name,last_name,tax,total,time_tax,meal_tax
77,27.2,4.0,Male,No,Thur,Lunch,4,0.147059,helen white,helen,white,8.16,39.36,True,5.44
78,22.76,3.0,Male,No,Thur,Lunch,2,0.13181,dick white,dick,white,6.828,32.588,True,4.552
79,17.29,2.71,Male,No,Thur,Lunch,2,0.156738,dick jones,dick,jones,5.187,25.187,True,3.458
80,19.44,3.0,Male,Yes,Thur,Lunch,2,0.154321,tom white,tom,white,5.832,28.272,True,3.888
81,16.66,3.4,Male,No,Thur,Lunch,2,0.204082,tom smith,tom,smith,4.998,25.058,True,3.332


In [14]:
# Recompute total so that it also includes the meal tax.
# Plain column arithmetic (the same form used when `total` was first built)
# gives the same sums as a row-wise apply(), without a Python call per row.
data['total'] = data['bill'] + data['tip'] + data['tax'] + data['meal_tax']
df = data[data['meal_tax'] > 0]  # rows that carry a meal tax

print(df.shape)
df.head()

(68, 15)


Unnamed: 0,bill,tip,gender,smoker,day,time,size,fraction,name,first_name,last_name,tax,total,time_tax,meal_tax
77,27.2,4.0,Male,No,Thur,Lunch,4,0.147059,helen white,helen,white,8.16,44.8,True,5.44
78,22.76,3.0,Male,No,Thur,Lunch,2,0.13181,dick white,dick,white,6.828,37.14,True,4.552
79,17.29,2.71,Male,No,Thur,Lunch,2,0.156738,dick jones,dick,jones,5.187,28.645,True,3.458
80,19.44,3.0,Male,Yes,Thur,Lunch,2,0.154321,tom white,tom,white,5.832,32.16,True,3.888
81,16.66,3.4,Male,No,Thur,Lunch,2,0.204082,tom smith,tom,smith,4.998,28.39,True,3.332


In [15]:
# Rename certain columns using an {'old_name': 'new_name'} mapping.
# Reassigning the result instead of using inplace=True keeps the step an
# explicit "new frame from old frame" transformation.
column_renames = {'name': 'full_name', 'tax': 'general_tax'}
data = data.rename(columns=column_renames)
data.columns

Index(['bill', 'tip', 'gender', 'smoker', 'day', 'time', 'size', 'fraction',
       'full_name', 'first_name', 'last_name', 'general_tax', 'total',
       'time_tax', 'meal_tax'],
      dtype='object')

In [16]:
# Sum the total and tip columns, print both to two decimals,
# then show summary statistics for the total column.
grand_total = data['total'].sum()
tips_total = data['tip'].sum()
print('Total ${:0.2f}'.format(grand_total))
print('Tips ${:0.2f}'.format(tips_total))
data['total'].describe()

Total $7241.18
Tips $731.58


count    244.000000
mean      29.676947
std       12.760428
min        4.991000
25%       21.046000
50%       26.925000
75%       35.869500
max       76.053000
Name: total, dtype: float64

In [17]:
# Distinct values present in the day column.
data['day'].unique()

array(['Sun', 'Sat', 'Thur', 'Fri'], dtype=object)

In [18]:
# Number of rows for each day value.
data['day'].value_counts()

Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64

In [19]:
# Sum of the total column for each time value.
data.groupby('time')['total'].sum()

time
Dinner    5304.460
Lunch     1936.715
Name: total, dtype: float64

In [20]:
# Row count for each gender value.
data.groupby('gender')['gender'].count()

gender
Female     87
Male      157
Name: gender, dtype: int64

In [21]:
# Row count for each smoker value.
data.groupby('smoker')['smoker'].count()

smoker
No     151
Yes     93
Name: smoker, dtype: int64

In [22]:
# total spent, split by smoker value
data.groupby('smoker')['total'].sum()

smoker
No     4371.887
Yes    2869.288
Name: total, dtype: float64

In [23]:
# Sum of the tip column for each smoker value.
data.groupby('smoker')['tip'].sum()

smoker
No     451.77
Yes    279.81
Name: tip, dtype: float64

In [24]:
# Row count per day; the result is a series indexed by day.
data.groupby('day')['day'].count()

day
Fri     19
Sat     87
Sun     76
Thur    62
Name: day, dtype: int64

In [25]:
# Sum total per day; reset_index() turns the grouped series into a dataframe.
(
    data
    .groupby('day')['total']
    .sum()
    .reset_index()
)

Unnamed: 0,day,total
0,Fri,493.588
1,Sat,2572.32
2,Sun,2362.698
3,Thur,1812.569


In [26]:
# Sum total for every (day, time, gender, smoker) combination present in the data.
group_keys = ['day', 'time', 'gender', 'smoker']
data.groupby(group_keys)['total'].sum()

day   time    gender  smoker
Fri   Dinner  Female  No          32.825
                      Yes         74.240
              Male    No          50.435
                      Yes        184.528
      Lunch   Female  No          26.970
                      Yes         67.650
              Male    Yes         56.940
Sat   Dinner  Female  No         356.585
                      Yes        438.230
              Male    No         933.259
                      Yes        844.246
Sun   Dinner  Female  No         425.612
                      Yes        100.008
              Male    No        1274.502
                      Yes        562.576
Thur  Dinner  Female  No          27.414
      Lunch   Female  No         630.860
                      Yes        222.725
              Male    No         613.425
                      Yes        318.145
Name: total, dtype: float64

### Handling multiple dataframes

In [None]:
# Read the first cars file, then overwrite its column labels with short names.
cars_columns = ['code', 'numbers', 'country', 'drives_right']
df_cars = pd.read_csv('../data/cars.csv')
df_cars.columns = cars_columns
df_cars.head()

In [None]:
# Read with header=None, so the column names are supplied via names=.
df_cars2 = pd.read_csv(
    '../data/cars2.csv',
    header=None,
    names=['code', 'numbers', 'country', 'drives_right'],
)
df_cars2.head()

In [None]:
# Stack the two car frames on top of each other and renumber the rows from 0,
# discarding the original row labels.
# NOTE: this rebinds `data`, which until now referred to the tips frame.
data = pd.concat([df_cars, df_cars2], ignore_index=True)
print(data.shape)
data

In [None]:
# Read the extra car attributes; header=None, so names= provides the column labels.
cars_exd = pd.read_csv(
    '../data/cars_exd.csv',
    header=None,
    names=['road_tax', 'colour', 'fuel'],
)
print(cars_exd.shape)
cars_exd.head(2)

In [None]:
# Copy the country column from the combined cars frame into cars_exd.
# NOTE(review): pandas matches the values up by row label, so this presumably
# relies on both frames sharing the same 0..n-1 index — confirm row counts match.
cars_exd['country'] = data['country']
cars_exd

In [None]:
# Join the combined cars frame with the extra attributes.
# 'country' is the only column name the two frames share, so it is the join key.
pd.merge(data, cars_exd, on='country')