In [1]:
import pandas as pd
import numpy as np

### <center>Useful methods</center>

#### .apply() method

In [2]:
# Absolute, machine-specific location of the tips dataset.
TIPS_CSV = "C:/Users/Lenovo/Desktop/Python/Data Science/Pandas/Extra files/tips.csv"
df = pd.read_csv(TIPS_CSV)
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.50,Male,No,Sun,Dinner,3,7.00,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251
...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,9.68,Michael Avila,5296068606052842,Sat2657
240,27.18,2.00,Female,Yes,Sat,Dinner,2,13.59,Monica Sanders,3506806155565404,Sat1766
241,22.67,2.00,Male,Yes,Sat,Dinner,2,11.34,Keith Wong,6011891618747196,Sat3880
242,17.82,1.75,Male,No,Sat,Dinner,2,8.91,Dennis Dixon,4375220550950,Sat17


##### Get the last 4 digits of CC Number.

In [3]:
# The column is stored as int64. Integers cannot be sliced, so each value
# has to be converted to str before its last characters can be taken.
cc_column = df["CC Number"]
cc_column.dtype

dtype('int64')

In [4]:
def last4(num):
    """Return the trailing four digits of ``num`` as an int.

    ``num`` is rendered with ``str`` first because integers cannot be
    sliced. The tail is converted back to ``int``, so leading zeros in the
    last four digits are not preserved (a tail of "0950" comes back as 950).
    """
    digits = str(num)
    tail = digits[-4:]
    return int(tail)


last4(12345567731233431)

3431

In [5]:
# Run last4 on every credit-card number, one element at a time.
cc_numbers = df["CC Number"]
cc_numbers.apply(last4)

0      3410
1      9230
2      1322
3      5994
4      7221
       ... 
239    2842
240    5404
241    7196
242     950
243    8139
Name: CC Number, Length: 244, dtype: int64

##### Total bill classifier

In [6]:
# Average of the total_bill column.
df.total_bill.mean()

19.785942622950824

In [7]:
def yelp(price):
    """Map a bill amount to a price-tier label.

    Returns "$" for prices below 10, "$$" for prices from 10 to 30 (both
    ends included), and "$$$" for everything else.
    """
    if price < 10:
        return "$"
    if 10 <= price <= 30:
        return "$$"
    return "$$$"

In [8]:
# Label every bill with its price tier.
bills = df["total_bill"]
bills.apply(yelp)

0      $$
1      $$
2      $$
3      $$
4      $$
       ..
239    $$
240    $$
241    $$
242    $$
243    $$
Name: total_bill, Length: 244, dtype: object

##### Working with more than 1 column

In [9]:
def qual(total_bill, tip):
    """Classify a tip as "Generous" or "Other".

    Args:
        total_bill: Bill amount the tip is measured against.
        tip: Tip amount.

    Returns:
        "Generous" when the tip is more than 25% of the bill, otherwise
        "Other". A bill of zero has no meaningful tip ratio and is
        reported as "Other".
    """
    # Guard the division: plain Python numbers raise ZeroDivisionError on a
    # zero bill, while NumPy scalars produce inf/nan with a warning, so the
    # two calling styles would otherwise disagree.
    if total_bill == 0:
        return "Other"
    return "Generous" if tip / total_bill > 0.25 else "Other"

# Evaluate qual row by row (axis=1), passing it each row's bill and tip.
bill_and_tip = df[["total_bill", "tip"]]
bill_and_tip.apply(lambda row: qual(row["total_bill"], row["tip"]), axis=1)

0      Other
1      Other
2      Other
3      Other
4      Other
       ...  
239    Other
240    Other
241    Other
242    Other
243    Other
Length: 244, dtype: object

Another way to do it, using `np.vectorize`:

In [10]:
# np.vectorize wraps qual so it can be called on whole columns at once;
# the result is a NumPy array rather than a Series.
qual_vectorized = np.vectorize(qual)
qual_vectorized(df["total_bill"], df["tip"])

array(['Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other',
       'Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other',
       'Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other',
       'Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other',
       'Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other',
       'Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other',
       'Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other',
       'Other', 'Other', 'Generous', 'Other', 'Other', 'Other', 'Other',
       'Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other',
       'Other', 'Other', 'Other', 'Other', 'Generous', 'Other', 'Other',
       'Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other',
       'Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other',
       'Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other',
       'Other', 'Other', 'Generous', 'Other', 'Other', 'Other', 'Other',
       'Oth

#### Sorting and statistical methods

In [11]:
# Sort by several columns: the keys are applied in list order, and each
# entry of `ascending` pairs with the column at the same position.
df.sort_values(by=["tip", "size"], ascending=[True, False])

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
92,5.75,1.00,Female,Yes,Fri,Dinner,2,2.88,Leah Ramirez,3508911676966392,Fri3780
236,12.60,1.00,Male,Yes,Sat,Dinner,2,6.30,Matthew Myers,3543676378973965,Sat5032
67,3.07,1.00,Female,Yes,Sat,Dinner,1,3.07,Tiffany Brock,4359488526995267,Sat3455
111,7.25,1.00,Female,No,Sat,Dinner,1,7.25,Terri Jones,3559221007826887,Sat4801
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
...,...,...,...,...,...,...,...,...,...,...,...
141,34.30,6.70,Male,No,Thur,Lunch,6,5.72,Steven Carlson,3526515703718508,Thur1025
59,48.27,6.73,Male,No,Sat,Dinner,4,12.07,Brian Ortiz,6596453823950595,Sat8139
23,39.42,7.58,Male,No,Sat,Dinner,4,9.86,Lance Peterson,3542584061609808,Sat239
212,48.33,9.00,Male,No,Sat,Dinner,4,12.08,Alex Williamson,676218815212,Sat4590


In [12]:
# Extremes of the total_bill column

bills = df["total_bill"]
print(bills.max())     # largest bill
print(bills.idxmin())  # index label of the smallest bill

50.81
67


In [13]:
# Correlation matrix (Pearson is the default method). Correlation is only
# defined for numeric values, so restrict the frame to its numeric columns
# first: pandas 2.x no longer drops text columns silently and raises instead.

np.round(df.select_dtypes(include="number").corr(), 3)

Unnamed: 0,total_bill,tip,size,price_per_person,CC Number
total_bill,1.0,0.676,0.598,0.648,0.105
tip,0.676,1.0,0.489,0.347,0.111
size,0.598,0.489,1.0,-0.175,-0.03
price_per_person,0.648,0.347,-0.175,1.0,0.135
CC Number,0.105,0.111,-0.03,0.135,1.0


In [14]:
# Number of rows for each distinct value of the sex column

df.sex.value_counts()

Male      157
Female     87
Name: sex, dtype: int64

In [15]:
# Distinct values of the day column, followed by how many there are
days = df["day"]
print(days.unique())
print(days.nunique())

['Sun' 'Sat' 'Thur' 'Fri']
4


#### .replace() and .map() methods

In [16]:
# Swap each label for its abbreviation; the two lists pair up by position.
df["sex"].replace(to_replace=["Female", "Male"], value=["F", "M"])

0      F
1      M
2      M
3      M
4      F
      ..
239    M
240    F
241    M
242    M
243    F
Name: sex, Length: 244, dtype: object

In [17]:
# Lookup table from full label to abbreviation, applied with .map().
mymap = dict(Female="F", Male="M")
df["sex"].map(mymap)

0      F
1      M
2      M
3      M
4      F
      ..
239    M
240    F
241    M
242    M
243    F
Name: sex, Length: 244, dtype: object

#### Duplicates method

In [18]:
# Flags rows that repeat an earlier row. The author notes this dataset has
# none, so every value is expected to be False.
df.duplicated(keep="first")

0      False
1      False
2      False
3      False
4      False
       ...  
239    False
240    False
241    False
242    False
243    False
Length: 244, dtype: bool

In [19]:
# Tiny frame with one repeated value, used to demonstrate duplicate handling.
simple_df = pd.DataFrame(data=[1, 2, 2], index=["a", "b", "c"])
simple_df

Unnamed: 0,0
a,1
b,2
c,2


In [20]:
# The first occurrence of a row stays False; later repeats are marked True.
simple_df.duplicated(keep="first")

a    False
b    False
c     True
dtype: bool

In [22]:
# Drop the repeated row. The result is reassigned instead of using
# inplace=True, which keeps the step a plain expression that can be chained.
simple_df = simple_df.drop_duplicates()
simple_df

Unnamed: 0,0
a,1
b,2


#### .between() method

In [24]:
# inclusive="both" keeps values equal to either boundary (10 or 20).
df["total_bill"].between(10, 20, inclusive="both")

0       True
1       True
2      False
3      False
4      False
       ...  
239    False
240    False
241    False
242     True
243     True
Name: total_bill, Length: 244, dtype: bool

#### .nlargest() and .nsmallest() methods

In [25]:
# The five rows with the highest total_bill.
df.nlargest(n=5, columns="total_bill")

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
170,50.81,10.0,Male,Yes,Sat,Dinner,3,16.94,Gregory Clark,5473850968388236,Sat1954
212,48.33,9.0,Male,No,Sat,Dinner,4,12.08,Alex Williamson,676218815212,Sat4590
59,48.27,6.73,Male,No,Sat,Dinner,4,12.07,Brian Ortiz,6596453823950595,Sat8139
156,48.17,5.0,Male,No,Sun,Dinner,6,8.03,Ryan Gonzales,3523151482063321,Sun7518
182,45.35,3.5,Male,Yes,Sun,Dinner,3,15.12,Jose Parsons,4112207559459910,Sun2337


In [26]:
# The five rows with the lowest tip.
df.nsmallest(n=5, columns="tip")

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
67,3.07,1.0,Female,Yes,Sat,Dinner,1,3.07,Tiffany Brock,4359488526995267,Sat3455
92,5.75,1.0,Female,Yes,Fri,Dinner,2,2.88,Leah Ramirez,3508911676966392,Fri3780
111,7.25,1.0,Female,No,Sat,Dinner,1,7.25,Terri Jones,3559221007826887,Sat4801
236,12.6,1.0,Male,Yes,Sat,Dinner,2,6.3,Matthew Myers,3543676378973965,Sat5032
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959


#### .sample() method
Takes a random sample; the size can be given either as a number of rows or as a fraction of the data frame.

In [27]:
# Draw 5 random rows. A fixed random_state makes the draw repeatable, so
# re-running the notebook shows the same rows.
df.sample(n=5, random_state=42)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
203,16.4,2.5,Female,Yes,Thur,Lunch,2,8.2,Toni Brooks,3582289985920239,Thur7770
67,3.07,1.0,Female,Yes,Sat,Dinner,1,3.07,Tiffany Brock,4359488526995267,Sat3455
24,19.82,3.18,Male,No,Sat,Dinner,2,9.91,Christopher Ross,36739148167928,Sat6236
87,18.28,4.0,Male,No,Thur,Lunch,2,9.14,Donald Williams,5363745772301404,Thur3636
53,9.94,1.56,Male,No,Sun,Dinner,2,4.97,Curtis Morgan,4628628020417301,Sun4561


In [28]:
# Draw a random 25% of the rows. A fixed random_state makes the draw
# repeatable, so re-running the notebook shows the same rows.
df.sample(frac=0.25, random_state=42)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
20,17.92,4.08,Male,No,Sat,Dinner,2,8.96,Thomas Rice,4403296224639756,Sat1709
151,13.13,2.00,Male,No,Sun,Dinner,2,6.56,Jason Arnold,3571825125296106,Sun2127
47,32.40,6.00,Male,No,Sun,Dinner,4,8.10,James Barnes,3552002592874186,Sun9677
213,13.27,2.50,Female,Yes,Sat,Dinner,2,6.64,Robin Andersen,580140531089,Sat1374
81,16.66,3.40,Male,No,Thur,Lunch,2,8.33,William Martin,4550549048402707,Thur8232
...,...,...,...,...,...,...,...,...,...,...,...
43,9.68,1.32,Male,No,Sun,Dinner,2,4.84,Christopher Spears,4387671121369212,Sun3279
176,17.89,2.00,Male,Yes,Sun,Dinner,2,8.94,Walter Simmons,6011481578696110,Sun5961
225,16.27,2.50,Female,Yes,Fri,Lunch,2,8.14,Whitney Arnold,3579111947217428,Fri6665
146,18.64,1.36,Female,No,Thur,Lunch,3,6.21,Kelly Estrada,60463302327,Thur3941
