In [1]:
import seaborn as sns
import pandas as pd
import numpy as np

# REVIEW THE BASICS

In [2]:
# Load seaborn's "tips" dataset into a DataFrame and preview the first rows
tips = sns.load_dataset('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
# select several columns at once: a list of labels returns a DataFrame
tips[["total_bill", "smoker"]].head()

Unnamed: 0,total_bill,smoker
0,16.99,No
1,10.34,No
2,21.01,No
3,23.68,No
4,24.59,No


In [4]:
# get a single column: a bare label returns a Series rather than a DataFrame
tips["smoker"].head()

0    No
1    No
2    No
3    No
4    No
Name: smoker, dtype: category
Categories (2, object): ['Yes', 'No']

In [5]:
# get rows: a bare slice selects rows by position (the end, 5, is excluded)
tips[3:5]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [6]:
# get rows and columns based on their labels (.loc slices include both endpoints)
tips.loc[2:4, "sex":"smoker"]

Unnamed: 0,sex,smoker
2,Male,No
3,Male,No
4,Female,No


In [7]:
# get rows and columns by their integer position (.iloc slices exclude the end)
tips.iloc[1:3, 0:2]

Unnamed: 0,total_bill,tip
1,10.34,1.66
2,21.01,3.5


In [8]:
# select rows using a boolean Series: keep only the rows where tip > 3
tips[tips["tip"] > 3].head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
7,26.88,3.12,Male,No,Sun,Dinner,4


# MULTI-INDEX

In [9]:
# preview the flat frame again before grouping it
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [10]:
# Median tip per (sex, smoker) group; grouping by two keys gives the result
# a two-level MultiIndex on its rows.
# observed=False is passed explicitly because at least `smoker` is a
# categorical column: it is the historical default (every category
# combination is kept) and spelling it out avoids the pandas >= 2.1
# FutureWarning about that default changing.
tips.groupby(["sex", "smoker"], observed=False).agg({"tip": "median"})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip
sex,smoker,Unnamed: 2_level_1
Male,Yes,3.0
Male,No,2.74
Female,Yes,2.88
Female,No,2.68


In [11]:
# Mean tip per (sex, smoker, day) group; three keys give a three-level MultiIndex.
# observed=False is passed explicitly because at least `smoker` is a
# categorical column: it is the historical default (every category
# combination is kept) and spelling it out avoids the pandas >= 2.1
# FutureWarning about that default changing.
tips.groupby(["sex", "smoker", "day"], observed=False).agg({"tip": "mean"})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,tip
sex,smoker,day,Unnamed: 3_level_1
Male,Yes,Thur,3.058
Male,Yes,Fri,2.74125
Male,Yes,Sat,2.879259
Male,Yes,Sun,3.521333
Male,No,Thur,2.9415
Male,No,Fri,2.5
Male,No,Sat,3.256563
Male,No,Sun,3.115349
Female,Yes,Thur,2.99
Female,Yes,Fri,2.682857


In [12]:
# Mean tip per (sex, smoker) group, kept as `mi_tips` for the MultiIndex examples.
# observed=False is passed explicitly because at least `smoker` is a
# categorical column: it is the historical default (every category
# combination is kept) and spelling it out avoids the pandas >= 2.1
# FutureWarning about that default changing.
mi_tips = tips.groupby(["sex", "smoker"], observed=False).agg({"tip": "mean"})
mi_tips

Unnamed: 0_level_0,Unnamed: 1_level_0,tip
sex,smoker,Unnamed: 2_level_1
Male,Yes,3.051167
Male,No,3.113402
Female,Yes,2.931515
Female,No,2.773519


In [13]:
# the row index is a MultiIndex whose entries are (sex, smoker) tuples
mi_tips.index

MultiIndex([(  'Male', 'Yes'),
            (  'Male',  'No'),
            ('Female', 'Yes'),
            ('Female',  'No')],
           names=['sex', 'smoker'])

In [14]:
# select one row of the MultiIndex by passing the complete key as a tuple
mi_tips.loc[("Male", "No")]

tip    3.113402
Name: (Male, No), dtype: float64

In [15]:
# same lookup without the explicit tuple: here both labels are matched against
# the index levels. The tuple form is less ambiguous, because .loc[a, b]
# normally reads as [row, column].
mi_tips.loc["Male", "No"]

tip    3.113402
Name: (Male, No), dtype: float64

In [16]:
# reset_index() moves every index level back into ordinary columns and
# leaves a default integer index behind
ri_tips = mi_tips.reset_index()
ri_tips

Unnamed: 0,sex,smoker,tip
0,Male,Yes,3.051167
1,Male,No,3.113402
2,Female,Yes,2.931515
3,Female,No,2.773519


In [17]:
# The same selection of male non-smokers, now done with boolean masks on the flat frame
ri_tips[(ri_tips["smoker"] == "No") & (ri_tips["sex"] == "Male")]

Unnamed: 0,sex,smoker,tip
1,Male,No,3.113402


In [18]:
# level=0 moves only the first index level (sex) into a column; smoker stays as the index
mi_tips.reset_index(level=0)

Unnamed: 0_level_0,sex,tip
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1
Yes,Male,3.051167
No,Male,3.113402
Yes,Female,2.931515
No,Female,2.773519


In [19]:
# move smoker (level 1) into a column, then pick every row labelled "Male"
# in the remaining sex index
mi_tips.reset_index(level=1).loc["Male"]

Unnamed: 0_level_0,smoker,tip
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,Yes,3.051167
Male,No,3.113402


In [20]:
# with sex moved into a column the index holds only smoker,
# so .loc["Yes"] returns every smoker row
ri0_tips = mi_tips.reset_index(level=0)
ri0_tips.loc["Yes"]

Unnamed: 0_level_0,sex,tip
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1
Yes,Male,3.051167
Yes,Female,2.931515


In [21]:
# set_index turns a column into the index; sex labels repeat, so this index is not unique
ri_tips.set_index(["sex"])

Unnamed: 0_level_0,smoker,tip
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,Yes,3.051167
Male,No,3.113402
Female,Yes,2.931515
Female,No,2.773519


In [22]:
# passing several columns rebuilds the (sex, smoker) MultiIndex
ri_tips.set_index(["sex", "smoker"])

Unnamed: 0_level_0,Unnamed: 1_level_0,tip
sex,smoker,Unnamed: 2_level_1
Male,Yes,3.051167
Male,No,3.113402
Female,Yes,2.931515
Female,No,2.773519


In [23]:
# append=True keeps the existing integer index and adds sex as a second level
ri_tips.set_index("sex", append=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,smoker,tip
Unnamed: 0_level_1,sex,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Male,Yes,3.051167
1,Male,No,3.113402
2,Female,Yes,2.931515
3,Female,No,2.773519


# GETTING SINGLE VALUES

In [24]:
# first three rows, shown before the single-value assignment
tips.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3


In [25]:
# .at reads or writes a single cell addressed by row label and column label;
# for scalar access it is much faster than .loc.
# NOTE: this assignment modifies `tips` in place, so every later cell sees 9000.0.
tips.at[0, "total_bill"] = 9000
tips.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,9000.0,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3


In [26]:
# .iat is the positional counterpart of .at: row 0, column 0 (the cell overwritten with 9000)
tips.iat[0, 0]

9000.0

# WHERE, MASKS, QUERIES

In [27]:
# These approaches are faster than the ones described previously

In [28]:
# 5x5 frame of standard-normal samples (unseeded, so the values differ on every run)
df = pd.DataFrame(np.random.randn(5, 5))
df.head()

Unnamed: 0,0,1,2,3,4
0,0.522885,-1.131639,1.422782,0.435936,1.359763
1,2.138247,1.516577,-0.622137,1.349703,-1.198687
2,-2.051206,0.55567,0.208896,0.520529,-0.982192
3,-0.94658,-0.421818,0.48855,0.901477,0.57271
4,0.377493,1.435266,0.698575,-0.271132,1.840246


In [29]:
# where() keeps the values for which the condition holds and puts NaN elsewhere;
# the result is a new frame
df.where(df > 0)

Unnamed: 0,0,1,2,3,4
0,0.522885,,1.422782,0.435936,1.359763
1,2.138247,1.516577,,1.349703,
2,,0.55567,0.208896,0.520529,
3,,,0.48855,0.901477,0.57271
4,0.377493,1.435266,0.698575,,1.840246


In [30]:
# Boolean-mask assignment: overwrite every negative entry of df with NaN, in place.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df[df < 0] = np.nan
df

Unnamed: 0,0,1,2,3,4
0,0.522885,,1.422782,0.435936,1.359763
1,2.138247,1.516577,,1.349703,
2,,0.55567,0.208896,0.520529,
3,,,0.48855,0.901477,0.57271
4,0.377493,1.435266,0.698575,,1.840246
