In [96]:
import pandas as pd
import numpy as np

In [97]:
rentals = pd.read_csv("dvd_rentals.rental.csv")
inventory = pd.read_csv("dvd_rentals.inventory.csv")
film = pd.read_csv("dvd_rentals.film.csv")
actor = pd.read_csv("dvd_rentals.actor.csv")
film_category = pd.read_csv("dvd_rentals.film_category.csv")
category = pd.read_csv("dvd_rentals.category.csv")
film_actor = pd.read_csv("dvd_rentals.film_actor.csv")

In [98]:
# check out last notebook on joining multipile tables for instructions how to get here
Total_movies = rentals[["customer_id","inventory_id","rental_date"]].sort_values("customer_id")
Total_movies = Total_movies.merge(inventory[["inventory_id","film_id"]],\
                                                how = "left",left_on = "inventory_id", right_on ="inventory_id",)
Total_movies = Total_movies.merge(film[["film_id","title"]], how = "left", left_on = "film_id", right_on = \
                                               "film_id") 
Total_movies = Total_movies.merge(film_category[["film_id","category_id"]],how = "left",\
                                               left_on = "film_id", right_on = "film_id")
Total_movies = Total_movies.merge(category[["category_id","name"]], how = "left", left_on = "category_id", \
                                                right_on = "category_id")

Total_movies = Total_movies[["customer_id","title","name","rental_date"]]
Total_movies["rental_date"] = Total_movies["rental_date"].astype('M8[D]')
Total_movies

Unnamed: 0,customer_id,title,name,rental_date
0,1,CONFIDENTIAL INTERVIEW,Music,2005-07-09
1,1,CLOSER BANG,Comedy,2005-06-16
2,1,MINDS TRUMAN,Action,2005-08-17
3,1,AMISTAD MIDSUMMER,New,2005-07-29
4,1,MUSKETEERS WAIT,Classics,2005-06-15
...,...,...,...,...
16039,599,DAISY MENAGERIE,Sci-Fi,2005-05-31
16040,599,LOLA AGENT,Horror,2005-07-30
16041,599,HURRICANE AFFAIR,Comedy,2005-08-17
16042,599,TEMPLE ATTRACTION,Horror,2005-08-23


In [99]:
# Core Calculated Fields
rentbycat = Total_movies.groupby(["customer_id","name"]).agg({"title" : 'count', "rental_date" : "max"}).reset_index().\
rename(columns = {"title" : "rental_count","name" : "category_name"}).sort_values(["customer_id","rental_count"\
                                                                                   ,"rental_date"]\
                                                                                  ,ascending = [True,False,False])

rentbycat["Rank"] = rentbycat.groupby("customer_id")["rental_count"].rank("first", ascending = False)
rentbycat


Unnamed: 0,customer_id,category_name,rental_count,rental_date,Rank
2,1,Classics,6,2005-08-19,1.0
3,1,Comedy,5,2005-08-22,2.0
5,1,Drama,4,2005-08-18,3.0
1,1,Animation,2,2005-08-22,4.0
11,1,Sci-Fi,2,2005-08-21,5.0
...,...,...,...,...,...
7731,599,Classics,1,2005-08-21,7.0
7736,599,Games,1,2005-08-21,8.0
7739,599,Sports,1,2005-07-31,9.0
7733,599,Drama,1,2005-07-12,10.0


In [100]:
# Average rental count per category
averagerentbycat = Total_movies.rename(columns = {"name" : "category_name"})
averagerentbycat = averagerentbycat.groupby(["customer_id","category_name"]).\
agg(count = ("category_name", "count")).reset_index()
averagerentbycat = averagerentbycat.groupby("category_name").agg(average = ("count", "mean")).reset_index()
averagerentbycat["average"] = np.floor(averagerentbycat["average"])
averagerentbycat

Unnamed: 0,category_name,average
0,Action,2.0
1,Animation,2.0
2,Children,1.0
3,Classics,2.0
4,Comedy,1.0
5,Documentary,2.0
6,Drama,2.0
7,Family,2.0
8,Foreign,2.0
9,Games,2.0


In [101]:
# total customer rentals
totalcustrent = Total_movies.groupby("customer_id").agg(total_rental_count = ("customer_id", "count")).reset_index()
totalcustrent

Unnamed: 0,customer_id,total_rental_count
0,1,32
1,2,27
2,3,26
3,4,22
4,5,38
...,...,...
594,595,30
595,596,28
596,597,25
597,598,22


In [102]:
# calculating percentile with rank pct = true, somewhat different than results on postgres but very close
# see next cell for the tests i did on the result
rentbycat['percentile'] = 1 - rentbycat.groupby(['category_name'])['rental_count']\
                        .rank(method = 'first', pct = True)

rentbycat

Unnamed: 0,customer_id,category_name,rental_count,rental_date,Rank,percentile
2,1,Classics,6,2005-08-19,1.0,0.002137
3,1,Comedy,5,2005-08-22,2.0,0.020202
5,1,Drama,4,2005-08-18,3.0,0.137725
1,1,Animation,2,2005-08-22,4.0,0.666000
11,1,Sci-Fi,2,2005-08-21,5.0,0.637081
...,...,...,...,...,...,...
7731,599,Classics,1,2005-08-21,7.0,0.589744
7736,599,Games,1,2005-08-21,8.0,0.599156
7739,599,Sports,1,2005-07-31,9.0,0.684008
7733,599,Drama,1,2005-07-12,10.0,0.614770


In [103]:
# checking the results
mask1 = rentbycat['rental_count'] >= 6
mask2 = rentbycat['category_name'] == 'Classics'
lenaboveorequal = len(rentbycat[mask2 & mask1])
print(lenaboveorequal)

mask1 = rentbycat['rental_count'] < 6
mask2 = rentbycat['category_name'] == 'Classics'
lenbelow = len(rentbycat[mask2 & mask1])
print(lenbelow)

1 - lenbelow / (lenbelow + lenaboveorequal) # 0.004273504273504258 in comparison to 0.002137

2
466


0.004273504273504258

In [104]:
# checking the results
mask1 = rentbycat['rental_count'] >= 5
mask2 = rentbycat['category_name'] == 'Comedy'
lenaboveorequal = len(rentbycat[mask2 & mask1])
print(lenaboveorequal)

mask1 = rentbycat['rental_count'] < 5
mask2 = rentbycat['category_name'] == 'Comedy'
lenbelow = len(rentbycat[mask2 & mask1])
print(lenbelow)

1- lenbelow/(lenbelow + lenaboveorequal) # 0.022222222222222254 in comparison to 0.020202

11
484


0.022222222222222254

In [105]:
# checking the results
mask1 = rentbycat['rental_count'] >= 4
mask2 = rentbycat['category_name'] == 'Drama'
lenaboveorequal = len(rentbycat[mask2 & mask1])
print(lenaboveorequal)

mask1 = rentbycat['rental_count'] < 4
mask2 = rentbycat['category_name'] == 'Drama'
lenbelow = len(rentbycat[mask2 & mask1])
print(lenbelow)

1- lenbelow/(lenbelow + lenaboveorequal) # 0.13972055888223556 in comparison to 0.137725

70
431


0.13972055888223556

In [106]:
# rounding up the percentile 
rentbycat['percentile_rounded'] = np.ceil(rentbycat['percentile'] * 100)
top_percentile = rentbycat[["customer_id","category_name","percentile_rounded"]]

In [107]:
top_percentile

Unnamed: 0,customer_id,category_name,percentile_rounded
2,1,Classics,1.0
3,1,Comedy,3.0
5,1,Drama,14.0
1,1,Animation,67.0
11,1,Sci-Fi,64.0
...,...,...,...
7731,599,Classics,59.0
7736,599,Games,60.0
7739,599,Sports,69.0
7733,599,Drama,62.0


In [108]:
# thats it, we're done. it's time to tie everything up.
firstjoin = rentbycat[["customer_id","category_name","rental_count","Rank"]].merge(totalcustrent[["customer_id","total_rental_count"]]\
                                                                            , how = "left", left_on = "customer_id", \
                                                                           right_on = "customer_id")
firstjoin                                                                            

Unnamed: 0,customer_id,category_name,rental_count,Rank,total_rental_count
0,1,Classics,6,1.0,32
1,1,Comedy,5,2.0,32
2,1,Drama,4,3.0,32
3,1,Animation,2,4.0,32
4,1,Sci-Fi,2,5.0,32
...,...,...,...,...,...
7736,599,Classics,1,7.0,19
7737,599,Games,1,8.0,19
7738,599,Sports,1,9.0,19
7739,599,Drama,1,10.0,19


In [109]:
secondjoin = firstjoin.merge(averagerentbycat, how = "left", left_on = "category_name", \
                             right_on = "category_name")
secondjoin

Unnamed: 0,customer_id,category_name,rental_count,Rank,total_rental_count,average
0,1,Classics,6,1.0,32,2.0
1,1,Comedy,5,2.0,32,1.0
2,1,Drama,4,3.0,32,2.0
3,1,Animation,2,4.0,32,2.0
4,1,Sci-Fi,2,5.0,32,2.0
...,...,...,...,...,...,...
7736,599,Classics,1,7.0,19,2.0
7737,599,Games,1,8.0,19,2.0
7738,599,Sports,1,9.0,19,2.0
7739,599,Drama,1,10.0,19,2.0


In [110]:
thirdjoin = secondjoin.merge(top_percentile, how = "left", \
                            left_on = ["customer_id","category_name"],\
                            right_on = ["customer_id","category_name"])

In [111]:
# all ready for the marketing team.
thirdjoin["average_comparison"] = thirdjoin["rental_count"] - thirdjoin["average"] 
thirdjoin[thirdjoin["Rank"] <= 2]

Unnamed: 0,customer_id,category_name,rental_count,Rank,total_rental_count,average,percentile_rounded,average_comparison
0,1,Classics,6,1.0,32,2.0,1.0,4.0
1,1,Comedy,5,2.0,32,1.0,3.0,4.0
14,2,Sports,5,1.0,27,2.0,7.0,3.0
15,2,Classics,4,2.0,27,2.0,11.0,2.0
27,3,Action,4,1.0,26,2.0,14.0,2.0
...,...,...,...,...,...,...,...,...
7705,597,Children,3,2.0,25,1.0,9.0,2.0
7717,598,Foreign,3,1.0,22,2.0,12.0,1.0
7718,598,Drama,3,2.0,22,2.0,14.0,1.0
7730,599,Horror,3,1.0,19,1.0,8.0,2.0
