In [1]:
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
# Load seaborn's bundled "tips" example dataset and preview its first five rows.
tips = sns.load_dataset('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


# SAMPLE

Lets you draw random samples from a DataFrame in a flexible way (by count or by fraction, with or without replacement, optionally weighted).

In [3]:
# Draw 5 random rows. No random_state is passed, so a different set of rows
# comes back on every run.
tips.sample(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
65,20.08,3.15,Male,No,Sat,Dinner,3
59,48.27,6.73,Male,No,Sat,Dinner,4
198,13.0,2.0,Female,Yes,Thur,Lunch,2
87,18.28,4.0,Male,No,Thur,Lunch,2
142,41.19,5.0,Male,No,Thur,Lunch,5


In [4]:
tips.sample?

# ISIN

`isin` is applied to an entire column and returns a boolean mask, which is very useful for selecting specific rows.

In [5]:
# Boolean mask: True for rows whose day is Saturday or Sunday.
weekend_days = ["Sat", "Sun"]
is_weekend = tips["day"].isin(weekend_days)
is_weekend

0       True
1       True
2       True
3       True
4       True
       ...  
239     True
240     True
241     True
242     True
243    False
Name: day, Length: 244, dtype: bool

In [6]:
# Keep only the rows flagged by the weekend mask, then show 5 of them at random.
tips.loc[is_weekend].sample(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
108,18.24,3.76,Male,No,Sat,Dinner,2
208,24.27,2.03,Male,Yes,Sat,Dinner,2
168,10.59,1.61,Female,Yes,Sat,Dinner,2
24,19.82,3.18,Male,No,Sat,Dinner,2
111,7.25,1.0,Female,No,Sat,Dinner,1


In [7]:
tips.isin?

# DROP_DUPLICATES

Works on more than one column

In [8]:
# Distinct (time, day) combinations; keep="first" retains the earliest row of each.
time_day_pairs = tips[["time", "day"]]
time_day_pairs.drop_duplicates(keep="first")

Unnamed: 0,time,day
0,Dinner,Sun
19,Dinner,Sat
77,Lunch,Thur
90,Dinner,Fri
220,Lunch,Fri
243,Dinner,Thur


# CUT

This will cut your numeric data into equal-width buckets (each spanning the same range of values, not holding the same number of rows) and then assign each value the label of the bucket it falls into.

In [9]:
# Split total_bill into 3 equal-width bins, labelled in ascending order.
bill_level = pd.cut(tips["total_bill"], bins=3, labels=["low", "mid", "high"])
bill_level.head(15)

0      low
1      low
2      mid
3      mid
4      mid
5      mid
6      low
7      mid
8      low
9      low
10     low
11    high
12     low
13     low
14     low
Name: total_bill, dtype: category
Categories (3, object): ['low' < 'mid' < 'high']

# STR

In [10]:
# The .str accessor applies string methods element-wise to the whole column.
tips["sex"].str.lower().head()

0    female
1      male
2      male
3      male
4    female
Name: sex, dtype: object

# NaNs

In [11]:
# Count the missing values in each column (isna() gives a boolean frame; sum() adds up the Trues).
tips.isna().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [12]:
# Fill missing tips with 0 and assign the result back to the frame.
# Calling fillna(..., inplace=True) on `tips.tip` is chained assignment: it
# may only modify a temporary Series, emits a FutureWarning in pandas 2.x and
# leaves `tips` unchanged under copy-on-write.
tips["tip"] = tips["tip"].fillna(0)

In [13]:
# axis=1 drops every *column* that contains at least one missing value
# (how="any"), not rows. The result is rebound to `tips` instead of using
# inplace=True, which gives no benefit and prevents method chaining.
tips = tips.dropna(axis=1, how="any")

# CORR

In [14]:
# Pairwise correlation of the numeric columns only. `tips` also holds
# non-numeric columns (sex, smoker, day, time), and since pandas 2.0 `corr()`
# raises on such data unless numeric_only=True is passed.
tips.corr(numeric_only=True)

Unnamed: 0,total_bill,tip,size
total_bill,1.0,0.675734,0.598315
tip,0.675734,1.0,0.489299
size,0.598315,0.489299,1.0


In [15]:
# Correlation restricted to the bill and tip columns.
tips.loc[:, ["total_bill", "tip"]].corr()

Unnamed: 0,total_bill,tip
total_bill,1.0,0.675734
tip,0.675734,1.0


# RANK

This computes the rank of each entry within its column (1 = smallest value; tied values share the average of their ranks).

In [16]:
# Rank every tip within the column (ascending, ties averaged by default).
tips["tip"].rank().head()

0      5.0
1     33.0
2    177.0
3    165.0
4    185.0
Name: tip, dtype: float64

# RENAME

In [17]:
# Rename `total_bill` to `bill`, rebinding the result rather than mutating
# in place. Cells that reference "total_bill" must run before this one.
tips = tips.rename(columns={"total_bill": "bill"})

In [18]:
# Display the frame to confirm the column is now called `bill`.
tips

Unnamed: 0,bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


# ITERTUPLES

Can be used for small dataframes

In [19]:
# Each row is yielded as a namedtuple called `Pandas`, with the index as its
# first field followed by one field per column. This prints every row.
for tup in tips.itertuples():
    print(tup)

Pandas(Index=0, bill=16.99, tip=1.01, sex='Female', smoker='No', day='Sun', time='Dinner', size=2)
Pandas(Index=1, bill=10.34, tip=1.66, sex='Male', smoker='No', day='Sun', time='Dinner', size=3)
Pandas(Index=2, bill=21.01, tip=3.5, sex='Male', smoker='No', day='Sun', time='Dinner', size=3)
Pandas(Index=3, bill=23.68, tip=3.31, sex='Male', smoker='No', day='Sun', time='Dinner', size=2)
Pandas(Index=4, bill=24.59, tip=3.61, sex='Female', smoker='No', day='Sun', time='Dinner', size=4)
Pandas(Index=5, bill=25.29, tip=4.71, sex='Male', smoker='No', day='Sun', time='Dinner', size=4)
Pandas(Index=6, bill=8.77, tip=2.0, sex='Male', smoker='No', day='Sun', time='Dinner', size=2)
Pandas(Index=7, bill=26.88, tip=3.12, sex='Male', smoker='No', day='Sun', time='Dinner', size=4)
Pandas(Index=8, bill=15.04, tip=1.96, sex='Male', smoker='No', day='Sun', time='Dinner', size=2)
Pandas(Index=9, bill=14.78, tip=3.23, sex='Male', smoker='No', day='Sun', time='Dinner', size=2)
Pandas(Index=10, bill=10.27, 