In [2]:
# Importing Dependencies
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


In [3]:
# Location of the Kushy products export (CSV snapshot; date is part of the filename)
kushy_prod_data_path = "products-kushy_api.2017-11-14.csv"

# Load the whole file in one pass (low_memory=False) so each column's dtype is
# inferred from all rows rather than chunk by chunk
kushy_prod_df = pd.read_csv(filepath_or_buffer=kushy_prod_data_path, low_memory=False)

# Preview the first ten rows to get a feel for the data
kushy_prod_df.head(n=10)

Unnamed: 0,id,name,slug,brand,category,strain,thc,cbd,lab_test
0,1,Pre-Roll Package - Pre-roll,the-humboldt-cure-pre-roll-package-pre-roll,The Humboldt Cure,"Flowers,Pre-Roll",,,,
1,2,ounce of cannabis,the-humboldt-cure-ounce-of-cannabis,The Humboldt Cure,,,,,
2,3,1/2 ounce of cannabis,the-humboldt-cure-1-2-ounce-of-cannabis,The Humboldt Cure,,,,,
3,4,1/4 ounce of cannabis,the-humboldt-cure-1-4-ounce-of-cannabis,The Humboldt Cure,,,,,
4,5,1/8,the-humboldt-cure-1-8,The Humboldt Cure,,,,,
5,6,Nectar Box-The Humboldt Cure,the-humboldt-cure-nectar-box-the-humboldt-cure,The Humboldt Cure,,,,,
6,7,The Humboldt Cure - Pre-roll,the-humboldt-cure-the-humboldt-cure-pre-roll,The Humboldt Cure,"Flowers,Pre-Roll",,,,
7,8,CBD-Rich Recovery Salve - 5ml Sample,turtle-bud-cbd-rich-recovery-salve-5ml-sample,Turtle Bud,"Topical,",,5mg,35mg,
8,9,CBD-Rich Recovery Salve,turtle-bud-cbd-rich-recovery-salve,Turtle Bud,"Topical,",,30mg,210mg,
9,10,Lift Tickets Gucci OG Rosin,lift-tickets-lift-tickets-gucci-og-rosin,Lift Tickets,,,,,


In [4]:
# Preliminary descriptive statistics for every column in the df.
# include="all" reports count/unique/top/freq for the text columns alongside
# the numeric summary (mean, std, quartiles) for the numeric ones.
kushy_prod_df.describe(include="all")


Unnamed: 0,id,name,slug,brand,category,strain,thc,cbd,lab_test
count,17233.0,17232,17233,17232,16062,17021,4328,785,236
unique,,11976,17233,1297,23,2582,462,265,193
top,,Sour Diesel - Shatter,edipure-edipuffs-100mg,Unknown Producer,"Concentrate,Shatter,",ACDC (AC/DC),100mg,10mg,https://www.cannabisreports.com/lab-tests/5/58...
freq,,74,1,2793,2801,1270,503,49,8
mean,8617.0,,,,,,,,
std,4974.88293,,,,,,,,
min,1.0,,,,,,,,
25%,4309.0,,,,,,,,
50%,8617.0,,,,,,,,
75%,12925.0,,,,,,,,


In [4]:
# Minimum number of products a strain must have to be kept in the filtered frame.
# 60 keeps about 51% of the strain-tagged products; 16 would keep about 68%.
STRAIN_MIN_COUNT = 60

# Number of products per strain, most common first (rows without a strain are not counted)
strain_count = kushy_prod_df["strain"].value_counts()

# Intermediate steps: Series -> one-column frame -> frame with a numeric index
# and the strain names moved into a regular column
strain_count_df1 = strain_count.to_frame()
strain_count_df2 = strain_count_df1.reset_index()

# Label the two columns by position instead of renaming by their old names:
# the names produced by value_counts()/reset_index() differ between pandas
# versions ("index"/"strain" before 2.0, "strain"/"count" from 2.0 on), so a
# name-based rename silently mislabels the columns on newer versions.
strain_count_df3 = strain_count_df2.copy()
strain_count_df3.columns = ["Strain", "Count"]

# Keep only the strains that reach the cut-off
strain_count_df4 = strain_count_df3.loc[strain_count_df3["Count"] >= STRAIN_MIN_COUNT, :]

# Summary statistics of the counts that survived the cut-off
strain_count_df4.describe().round(2)

Unnamed: 0,Count
count,44.0
mean,195.41
std,224.88
min,60.0
25%,69.0
50%,104.5
75%,207.75
max,1270.0


In [5]:
# Percentage of rows (distinct strains) dropped when going from df3 to df4
strains_before = len(strain_count_df3)
strains_after = len(strain_count_df4)
(1 - strains_after / strains_before) * 100


98.29589465530596

In [6]:

# Share of the total product count (summed over all strains in df3) that the
# strains kept in df4 account for
strain_kept_total = strain_count_df4["Count"].sum()
strain_all_total = strain_count_df3["Count"].sum()
(strain_kept_total / strain_all_total).round(2)


0.51

In [7]:
# Number of rows (distinct strains) left in the filtered frame
strain_count_df4.shape[0]


44

In [25]:
# Notebook Setup
%matplotlib notebook

# Dependencies
import seaborn as sns
sns.set(color_codes=True)

import matplotlib.pyplot as plt

# sns.kdeplot(strain_count_df3["Count"],
#             bw="silverman",
#             shade=True,
#             #cut=2,
#             label="Bandwidth = Silverman",
            
#            );

# sns.rugplot(strain_count_df3["Count"]);

g = sns.barplot(x="Strain", y="Count", data=strain_count_df4,)
g.set_xticklabels(g.get_xticklabels(), rotation=90,)
plt.gcf().subplots_adjust(bottom=0.60)

plt.savefig('strains.png', dpi=400)

<IPython.core.display.Javascript object>

In [195]:
# Show the strain/count table that remains after the cut-off filter
strain_count_df4

Unnamed: 0,Strain,Count
0,ACDC (AC/DC),1270
1,OG Kush,703
2,Blue Dream,692
3,Girl Scout Cookies,542
4,Harlequin,302
5,Sour Tsunami,297
6,Sour Diesel,286
7,Granddaddy Purple (GDP),284
8,Cali Orange,277
9,Jack Herer,274


In [186]:
# # Alternate cut (cuts bottom AND top outliers to reach 51% differently)
# strain_count_df5 = strain_count_df3.loc[strain_count_df3["Count"] >= 30 , :]
# strain_count_df6 = strain_count_df5.loc[strain_count_df5["Count"] <= 800 , :]


# strain_count_df6.describe().round(2)

In [48]:
# Number of products per brand, most common first (rows with no brand are not counted)
kushy_prod_df["brand"].value_counts()

Unknown Producer                       2793
Moxie Seeds & Extracts                  180
Golden XTRX                             167
Baked                                   162
PopNaturals                             145
Rumpelstiltskin Extracts (Rump Wax)     143
IncrediMeds                             131
Glowing Buddha                          124
Natural Cannabis Company                116
O.pen VAPE                              113
Gold Drop                               111
Nectars 710                             107
Suspiciously Delicious                  101
Canna Magic                              97
Heavenly Sweet                           96
Delta 11                                 95
DankMan                                  92
Bhang                                    92
Waxology                                 89
EdiPure                                  88
The CO2 Company                          86
Om Extracts                              84
HGH Extractions                 

In [42]:
# Looking into the most popular category:
# number of products per category label, most common first.
# (The original cell also evaluated this once more and discarded the result;
# that redundant computation is removed.)
category_count = kushy_prod_df["category"].value_counts()

# Convert the counts to a one-column frame indexed by category label.
# The column is named explicitly, and the index name cleared, because the
# default Series/index names returned by value_counts() changed in pandas 2.0
# ("category"/unnamed before, "count"/"category" after); later cells look the
# counts up as category_count_df1["category"].
category_count_df1 = category_count.rename_axis(None).to_frame(name="category")

# Optional export of the counts:
# category_count_df1.to_csv("category_counts.csv")

category_count_df1

Unnamed: 0,category
"Concentrate,Shatter,",2801
"Concentrate,Wax,",2151
"Concentrate,Oil,",1664
"Edibles,Candy,",1443
"Vapes,Vape Cartidge,",1300
"Edibles,",890
"Edibles,Snack,",856
"Concentrate,Bubble Hash,",756
"Edibles,Chocolate,",723
"Concentrate,Crumble,Wax,",684


In [47]:
# Cut-off: keep only the categories that have at least 800 products
category_is_common = category_count_df1["category"] >= 800
category_count_df2 = category_count_df1[category_is_common]


In [48]:
# How representative the 2nd df is of the 1st: ratio of their summed counts
category_kept_total = category_count_df2["category"].sum()
category_all_total = category_count_df1["category"].sum()
(category_kept_total / category_all_total).round(2)


0.69

In [49]:
# Show the categories that passed the cut-off
category_count_df2

Unnamed: 0,category
"Concentrate,Shatter,",2801
"Concentrate,Wax,",2151
"Concentrate,Oil,",1664
"Edibles,Candy,",1443
"Vapes,Vape Cartidge,",1300
"Edibles,",890
"Edibles,Snack,",856


In [4]:
# Number of products per distinct THC label, most common first
# (rows without a THC value are not counted)
thc_count = kushy_prod_df["thc"].value_counts()

# Intermediate steps: Series -> one-column frame -> frame with a numeric index
# and the THC labels moved into a regular column
thc_count_df1 = thc_count.to_frame()
thc_count_df2 = thc_count_df1.reset_index()

# Label the two columns by position instead of renaming by their old names:
# value_counts()/reset_index() yield "index"/"thc" before pandas 2.0 but
# "thc"/"count" from 2.0 on, so a name-based rename mislabels the columns
# on newer versions.
thc_count_df3 = thc_count_df2.copy()
thc_count_df3.columns = ["THC", "Count"]

thc_count_df3

# TODO: optional bar plot of the THC label counts, e.g.
# sns.barplot(x="THC", y="Count", data=thc_count_df3,)

Unnamed: 0,THC,Count
0,100mg,503
1,50mg,272
2,200mg,251
3,150mg,187
4,60mg,177
5,180mg,174
6,300mg,165
7,120mg,142
8,250mg,137
9,25mg,119


In [28]:
# Keep only the THC labels that occur at least 25 times
thc_is_common = thc_count_df3["Count"] >= 25
thc_count_df4 = thc_count_df3[thc_is_common]

In [29]:
# Share of the total count (summed over all THC labels in df3) covered by the
# labels kept in df4
thc_kept_total = thc_count_df4["Count"].sum()
thc_all_total = thc_count_df3["Count"].sum()
(thc_kept_total / thc_all_total).round(2)

0.76

In [188]:
# THC labels expressed as a percentage, i.e. containing a literal "%".
# regex=False matches the character literally; na=False makes any label that
# is not a string count as "no match" instead of yielding NaN in the mask
# (this replaces the former `== True` comparison).
thc_is_percent = thc_count_df3["THC"].str.contains("%", regex=False, na=False)
thc_count_df_percents = thc_count_df3.loc[thc_is_percent, :]

thc_count_df_percents.head()

Unnamed: 0,THC,Count
64,60%,8
70,50%,7
96,67%,4
105,30%,3
106,70%,3


In [187]:
# THC labels expressed in milligrams, i.e. containing the literal text "mg".
# regex=False matches the text literally; na=False makes any label that is
# not a string count as "no match" instead of yielding NaN in the mask
# (this replaces the former `== True` comparison).
thc_is_mg = thc_count_df3["THC"].str.contains("mg", regex=False, na=False)
thc_count_df_mgs = thc_count_df3.loc[thc_is_mg, :]

thc_count_df_mgs.head()

Unnamed: 0,THC,Count
0,100mg,503
1,50mg,272
2,200mg,251
3,150mg,187
4,60mg,177
