In [2]:
import pandas as pd
import numpy as np
import sqlite3
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/LinkedInLearning/faster-pandas-2832038/main/Ch02/02_02/cart.csv')
df

Unnamed: 0,Customer,Item,Amount,Item Price
0,Rick,Wine,20,103.2
1,Morty,Almond Milk,1,10.04
2,Summer,Ice Cream,1,8.32
3,Beth,Comb,1,7.3
4,Jerry,Tequila,2,20.34


In [6]:
mask = df['Item Price'] > 10

In [8]:
df[mask]

Unnamed: 0,Customer,Item,Amount,Item Price
0,Rick,Wine,20,103.2
1,Morty,Almond Milk,1,10.04
4,Jerry,Tequila,2,20.34


In [11]:
conn = sqlite3.connect('logs.db', detect_types=sqlite3.PARSE_DECLTYPES)
sql1 = 'SELECT * FROM logs'

In [12]:
dff = pd.read_sql(sql1, conn)
dff

Unnamed: 0,time,origin,method,path,status_code,size
0,1995-08-01 00:00:01,in24.inetnebr.com,GET,/shuttle/missions/sts-68/news/sts-68-mcc-05.txt,200,1839
1,1995-08-01 00:00:07,uplherc.upl.com,GET,/,304,0
2,1995-08-01 00:00:08,uplherc.upl.com,GET,/images/ksclogo-medium.gif,304,0
3,1995-08-01 00:00:08,uplherc.upl.com,GET,/images/MOSAIC-logosmall.gif,304,0
4,1995-08-01 00:00:08,uplherc.upl.com,GET,/images/USA-logosmall.gif,304,0
...,...,...,...,...,...,...
9995,1995-08-01 07:54:40,ppp-14.flashnet.it,GET,/images/USA-logosmall.gif,200,234
9996,1995-08-01 07:54:41,204.238.216.51,GET,/images/ksclogo-medium.gif,304,0
9997,1995-08-01 07:54:42,204.238.216.51,GET,/images/USA-logosmall.gif,304,0
9998,1995-08-01 07:54:42,204.238.216.51,GET,/images/MOSAIC-logosmall.gif,304,0


In [16]:
%%timeit
tot = 0
for x,row in dff.iterrows():
    if row['status_code'] >=400:
        tot+= 1


2.77 s ± 459 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
%timeit dff[dff['status_code'] >=400] #boolean indexing

1.27 ms ± 319 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


## Universal Functions

In [19]:
s =pd.Series(range(1000))
%timeit max(s)

283 µs ± 69.9 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [20]:
%timeit s.max() #pandas max function faster

108 µs ± 20.6 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


## limitations of appending

In [2]:
def parse_time(ts):
    """Parse a bracketed access-log timestamp into a naive ``datetime``.

    ``ts`` is expected to look like ``[02/Jul/1995:16:30:08 -0400]``.
    The UTC offset is parsed and then dropped without shifting the clock
    time, so the result keeps the wall-clock value exactly as logged.
    """
    log_format = '[%d/%b/%Y:%H:%M:%S %z]'
    aware = datetime.strptime(ts, log_format)
    # Strip the tzinfo only; the date/time fields are left untouched.
    return aware.replace(tzinfo=None)

def parse_line(line):
    """Turn one whitespace-separated access-log line into a record dict.

    The returned dict has the keys ``origin``, ``time``, ``method``,
    ``path``, ``status_code`` and ``size``. The last field is the size
    (a ``'-'`` placeholder becomes 0) and the second-to-last field is the
    status code; both are converted to ``int``.
    """
    parts = line.split()

    raw_size = parts[-1]
    if raw_size == '-':
        size = 0
    else:
        size = int(raw_size)

    return dict(
        origin=parts[0],
        # The timestamp spans two whitespace-separated fields; rejoin them.
        time=parse_time(parts[3] + ' ' + parts[4]),
        method=parts[5][1:],  # drop the first character (the opening quote)
        path=parts[6],
        status_code=int(parts[-2]),
        size=size,
    )

In [23]:
with open ("log.txt", "r") as fp:
    lines = fp.readlines()

In [27]:
%%timeit
df = pd.DataFrame()
for line in lines:
    df = df.append(parse_line(line), ignore_index= True)

1.82 s ± 14.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [29]:
# Growing a DataFrame one row at a time is slow; collecting plain Python records first is fast.
# DataFrame.from_records builds the whole frame in one step from an iterable of records (here, dicts).
%timeit df = pd.DataFrame.from_records(parse_line(line) for line in lines) 

25.8 ms ± 707 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [31]:
df = pd.DataFrame.from_records(parse_line(line) for line in lines)
df

Unnamed: 0,origin,time,method,path,status_code,size
0,in24.inetnebr.com,1995-08-01 00:00:01,GET,/shuttle/missions/sts-68/news/sts-68-mcc-05.txt,200,1839
1,uplherc.upl.com,1995-08-01 00:00:07,GET,/,304,0
2,uplherc.upl.com,1995-08-01 00:00:08,GET,/images/ksclogo-medium.gif,304,0
3,uplherc.upl.com,1995-08-01 00:00:08,GET,/images/MOSAIC-logosmall.gif,304,0
4,uplherc.upl.com,1995-08-01 00:00:08,GET,/images/USA-logosmall.gif,304,0
...,...,...,...,...,...,...
995,www-d1.proxy.aol.com,1995-08-01 00:36:25,GET,/elv/vidpicp.htm,200,4251
996,pme609.onramp.awinc.com,1995-08-01 00:36:26,GET,/shuttle/resources/orbiters/discovery-logo.gif,200,4179
997,marimo.kushiro-ct.ac.jp,1995-08-01 00:36:27,GET,/shuttle/countdown/count70.gif,200,46573
998,in24.inetnebr.com,1995-08-01 00:36:28,GET,/shuttle/missions/sts-59/news/,200,6122


In [4]:
size = 50000
# Seeded generator so the benchmark frame is identical on every run
# (Restart & Run All reproduces the same data).
rng = np.random.default_rng(0)
df = pd.DataFrame({
    # Integers in [1, 1000), the same range np.random.randint(1, 1000) gave.
    column: rng.integers(1, 1000, size)
    for column in ('a', 'b', 'c')
})

In [5]:
%%timeit
total = 0
for _, row in df.iterrows():
    total+= row.max()

17.3 s ± 2.31 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
%timeit df.apply(np.max, axis =1, raw = True).sum() #raw=True hands each row to np.max as a plain NumPy array instead of a Series; .sum() then adds up the per-row maxima

1.15 s ± 265 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
conn = sqlite3.connect('data3/logs.db', detect_types=sqlite3.PARSE_DECLTYPES)
sql1 = 'SELECT * FROM logs'

dff = pd.read_sql(sql1, conn)
dff

Unnamed: 0,time,origin,method,path,status_code,size
0,1995-08-01 00:00:01,in24.inetnebr.com,GET,/shuttle/missions/sts-68/news/sts-68-mcc-05.txt,200,1839
1,1995-08-01 00:00:07,uplherc.upl.com,GET,/,304,0
2,1995-08-01 00:00:08,uplherc.upl.com,GET,/images/ksclogo-medium.gif,304,0
3,1995-08-01 00:00:08,uplherc.upl.com,GET,/images/MOSAIC-logosmall.gif,304,0
4,1995-08-01 00:00:08,uplherc.upl.com,GET,/images/USA-logosmall.gif,304,0
...,...,...,...,...,...,...
9995,1995-08-01 07:54:40,ppp-14.flashnet.it,GET,/images/USA-logosmall.gif,200,234
9996,1995-08-01 07:54:41,204.238.216.51,GET,/images/ksclogo-medium.gif,304,0
9997,1995-08-01 07:54:42,204.238.216.51,GET,/images/USA-logosmall.gif,304,0
9998,1995-08-01 07:54:42,204.238.216.51,GET,/images/MOSAIC-logosmall.gif,304,0


In [9]:
with open('data3/ips.txt', 'r') as fp:
    ips = [line.strip() for line in fp]

In [18]:
%timeit dff['origin'].isin(ips)

41.6 ms ± 7.15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [19]:
ips_set = set(ips)
%timeit dff['origin'].isin(ips_set)

101 ms ± 25.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [20]:
%timeit dff['origin'].apply(lambda v: v in ips_set)

6.25 ms ± 1.76 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [22]:
is_bad = ips_set.__contains__
%timeit dff['origin'].apply(is_bad)

3.48 ms ± 1.04 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [31]:
df = pd.read_csv('logs.csv', parse_dates= ['time'])
df

Unnamed: 0,origin,time,method,path,status_code,size
0,uplherc.upl.com,1995-08-01 00:00:10,GET,/images/WORLD-logosmall.gif,304,0
1,133.43.96.45,1995-08-01 00:00:25,GET,/history/apollo/images/apollo-logo1.gif,200,1173
2,133.68.18.180,1995-08-01 00:01:13,GET,/persons/nasa-cm/jmd-sm.gif,200,3660
3,www-d3.proxy.aol.com,1995-08-01 00:01:28,GET,/images/NASA-logosmall.gif,200,786
4,haraway.ucet.ufl.edu,1995-08-01 00:04:47,GET,"/cgi-bin/imagemap/countdown70?199,165",302,97
...,...,...,...,...,...,...
49995,alpc6.mpimf-heidelberg.mpg.de,1995-08-31 23:53:54,GET,/htbin/cdt_main.pl,200,3873
49996,cindy.yamato.ibm.co.jp,1995-08-31 23:54:40,GET,/images/kscmap-tiny.gif,200,2537
49997,cys-cap-9.wyoming.com,1995-08-31 23:55:01,GET,/shuttle/missions/sts-71/movies/movies.html,200,3381
49998,203.243.250.7,1995-08-31 23:55:10,GET,/facilities/vab.html,200,4045


In [29]:
df['time'][32]

def is_morning(ts):
    """Return True when ``ts`` falls between 06:00 (inclusive) and 12:00 (exclusive).

    ``ts`` is converted with ``pd.to_datetime`` on every call before its
    hour is inspected.
    """
    hour = pd.to_datetime(ts).hour
    return 6 <= hour < 12

In [30]:
%timeit df[df['time'].apply(is_morning)] #slow: apply calls is_morning, and therefore pd.to_datetime, once per row
#prefer parsing the column once with parse_dates in pd.read_csv and filtering with the vectorized .dt accessor

14.3 s ± 1.13 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [32]:
%timeit df[(df['time'].dt.hour >=6) & (df['time'].dt.hour <12)]

38.2 ms ± 5.97 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


Find how many rides in 2016 took place on the afternoon of a weekend or holiday.
- Afternoon: from noon until 6 pm (hours 12 through 17)
- Weekend: Saturday or Sunday
- Holiday: any date listed in `holidays_2016` below

In [85]:
# 2016 public holidays

from calendar import SATURDAY, SUNDAY
# Dates, as ISO ``YYYY-MM-DD`` strings, that vacation_rides() treats as
# holidays in addition to Saturdays and Sundays.
holidays_2016 = [
    '2016-01-01',  # New Year's Day
    '2016-01-18',  # Martin Luther King Jr. Day
    '2016-05-30',  # Memorial Day
    '2016-07-04',  # Independence Day
    '2016-09-05',  # Labor Day
    '2016-11-11',  # Veterans Day
    '2016-11-24',  # Thanksgiving
    '2016-12-26',  # Christmas (listed as the 26th; presumably the observed date, confirm)
]


def load_df(file_name):
    """Load the rides CSV and merge its checkout columns into one ``time`` column.

    Parameters
    ----------
    file_name : str, path object or file-like
        Anything ``pd.read_csv`` accepts. The data must contain the columns
        ``'Checkout Date'`` and ``'Checkout Time'``.

    Returns
    -------
    pd.DataFrame
        The CSV rows with a leading datetime ``'time'`` column replacing the
        two checkout columns; all other columns keep their original order.
    """
    date_col, time_col = 'Checkout Date', 'Checkout Time'

    # Read both parts as text so they can be joined verbatim before parsing.
    raw = pd.read_csv(file_name, dtype={date_col: str, time_col: str})

    # The dict form of parse_dates (combining several columns) is deprecated
    # in recent pandas, so the columns are combined explicitly instead.
    stamps = pd.to_datetime(raw[date_col] + ' ' + raw[time_col])

    rides = raw.drop(columns=[date_col, time_col])
    # Keep the layout the old parse_dates call produced: 'time' comes first.
    rides.insert(0, 'time', stamps)
    return rides


def vacation_rides(df, holidays=None):
    """Return the 2016 rides that started on a weekend or holiday afternoon.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a datetime ``'time'`` column (as produced by ``load_df``).
    holidays : iterable of date-like, optional
        Dates counted as holidays. Defaults to the module-level
        ``holidays_2016`` list.

    Returns
    -------
    pd.DataFrame
        The rows of ``df`` whose ``time`` is in 2016, falls on a Saturday,
        a Sunday or one of ``holidays``, and has an hour from 12 up to
        (but not including) 18.
    """
    if holidays is None:
        holidays = holidays_2016
    # Convert up front so datetimes are compared with datetimes rather than
    # relying on isin() to cast strings implicitly.
    holiday_days = pd.to_datetime(list(holidays))

    when = df['time'].dt
    mask_2016 = when.year == 2016

    holiday_mask = (
        when.floor('d').isin(holiday_days) |
        when.weekday.isin([SATURDAY, SUNDAY])
    )

    # Afternoon is noon to 6 pm, i.e. hours 12..17. The upper bound must be 18:
    # a bound of 6 can never hold together with `hour >= 12`, which made the
    # mask always False and the result always empty.
    hour = when.hour
    afternoon_mask = (hour >= 12) & (hour < 18)

    return df[mask_2016 & holiday_mask & afternoon_mask]

In [66]:
"""df = pd.read_csv('data3/austin-bikes.csv')
df['time'] = pd.to_datetime(df['Checkout Date'] + ' ' + df['Checkout Time'])
df.drop(['Checkout Date', 'Checkout Time'], axis = 1, inplace = True)
df.head(2)"""

"df = pd.read_csv('data3/austin-bikes.csv')\ndf['time'] = pd.to_datetime(df['Checkout Date'] + ' ' + df['Checkout Time'])\ndf.drop(['Checkout Date', 'Checkout Time'], axis = 1, inplace = True)\ndf.head(2)"

In [87]:
df = load_df('data3/austin-bikes.csv')

In [88]:
%timeit vacation_rides(df)

24 ms ± 2.81 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Pandas Performance

In [89]:
df = pd.read_csv('data3/austin-bikes.csv')
df.head(2)

Unnamed: 0,Trip ID,Membership Type,Bicycle ID,Checkout Date,Checkout Time,Checkout Kiosk ID,Checkout Kiosk,Return Kiosk ID,Return Kiosk,Trip Duration Minutes,Month,Year
0,9900334132,24-Hour Kiosk (Austin B-cycle),839,12/22/2013,13:12:00,2495.0,4th & Congress,2495,4th & Congress,53,12.0,2013.0
1,9900334089,24-Hour Kiosk (Austin B-cycle),283,12/22/2013,10:12:00,2501.0,5th & Bowie,2495,4th & Congress,9,12.0,2013.0
