# Dictionary Method


In [1]:
import pandas as pd
import numpy as np

from gensim.parsing import remove_stopwords, strip_numeric, strip_punctuation, strip_multiple_whitespaces
from gensim.parsing.porter import PorterStemmer

## Applying to strain name + description

### Keep stop words and no stemming

In [2]:
dictionary = pd.read_excel('data/dictionary.xlsx')
dictionary['word'] = dictionary['word'].astype(str).str.lower()
dict_words_by_label = dictionary.groupby('dictionary_label')['word'].apply(list)
dict_words_by_label

ImportError: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.

In [3]:
full_dataset = pd.read_csv('data/full_dataset.csv')
full_dataset['wmsite'] = full_dataset['wmsite'].astype(str).str.lower()
full_dataset['scrape'] = full_dataset['scrape'].astype(str).str.lower()
full_dataset.head()

  full_dataset = pd.read_csv('data/full_dataset.csv')


Unnamed: 0,v1,address,city,description,email,price_ounce,product_id,published,scrape_number,slug,...,thc,test_expires,has_photo,photo_filename,pageviews,ratecnt,dateupdated,delivery,rec,v39
0,2651.0,"17517 15th Ave NE Unit B,Shoreline,Washington ...",Shoreline,20.9%-24.9% THC Shake.,recreational365@yahoo.com,65.0,31511407,True,448,365-recreational-cannabis,...,,,,,,,,,,
1,2652.0,"17517 15th Ave NE Unit B,Shoreline,Washington ...",Shoreline,20.4%-24.4% THC,recreational365@yahoo.com,0.0,31457256,True,448,365-recreational-cannabis,...,,,,,,,,,,
2,2653.0,"17517 15th Ave NE Unit B,Shoreline,Washington ...",Shoreline,18.0%-22.0% THC,recreational365@yahoo.com,0.0,30833338,True,448,365-recreational-cannabis,...,,,,,,,,,,
3,2654.0,"17517 15th Ave NE Unit B,Shoreline,Washington ...",Shoreline,14.0%-18.0% THC,recreational365@yahoo.com,210.0,31453769,True,448,365-recreational-cannabis,...,,,,,,,,,,
4,2655.0,"17517 15th Ave NE Unit B,Shoreline,Washington ...",Shoreline,18.0%-22.0% THC,recreational365@yahoo.com,0.0,28199355,True,448,365-recreational-cannabis,...,,,,,,,,,,


In [4]:
full_dataset['slug'].unique()

array(['365-recreational-cannabis', '420-friendly', '420-spot-shop',
       'have-a-heart-belltown', 'have-a-heart-bothell',
       'have-a-heart-cafe', 'have-a-heart-ocean-shores-2',
       'have-a-heart-skyway', 'always-greener-downtown', 'american-mary',
       'botany-bay', 'diamond-green', 'emerald-haze-cannabis-emporium',
       'tacoma-greenthumb-port-orchard', 'fillabong',
       'freedom-market-cathlamet', 'iwaco-freedom-market',
       'freedom-market-recreational',
       'longview-freedom-market-recreational', 'green-token-cannabis',
       'growers-outlet', 'gypsy-greens', 'gypsy-greens-3',
       'higher-leaf-3', 'indras-planet-11',
       'kush21-burien-s-first-pot-shop', 'kushman', 'kushman-s',
       'kushman-s-mukilteo', 'kushmart-south-everett', 'lucid-auburn',
       'lucid-puyallup', 'magu-cbd', 'marijuana-mart-2',
       'marley-420-recreational-marijuana', 'wm-demo-dispensary',
       'wm-demo-delivery', 'westside-420-recreational',
       'uncle-ando-s-wurld-of-

In [5]:
def count_word_frequency_using_set(sentence, dictionary, sep=" "):
    words = sentence.split(sep)  # Split the string into a list of words
    freq_sum = 0

    for word in dictionary:

        # Count the occurrences of the word in the original sentence
        freq_sum += words.count(word)
    
    return freq_sum

def count_total_word(sentence):
    words = sentence.split()
    return len(words)

In [6]:
full_dataset['straindescription'] = '"' + full_dataset['strain'].astype(str) + '" -- '+ full_dataset['description'].astype(str)
full_dataset['straindescription'] = full_dataset['straindescription'].astype(str).str.lower().apply(strip_multiple_whitespaces)
full_dataset['intx_dict_count'] = full_dataset['straindescription'].apply(count_word_frequency_using_set, dictionary=dict_words_by_label['intx_dict'])
full_dataset['medx_dict_count'] = full_dataset['straindescription'].apply(count_word_frequency_using_set, dictionary=dict_words_by_label['medx_dict'])
full_dataset['wellx_dict_count'] = full_dataset['straindescription'].apply(count_word_frequency_using_set, dictionary=dict_words_by_label['wellx_dict'])
full_dataset['bad_cann_dict_count'] = full_dataset['straindescription'].apply(count_word_frequency_using_set, dictionary=dict_words_by_label['bad_cann_dict'])
full_dataset['good_cann_dict_count'] = full_dataset['straindescription'].apply(count_word_frequency_using_set, dictionary=dict_words_by_label['good_cann_dict'])
full_dataset['total_word_count'] = full_dataset['straindescription'].apply(count_total_word)

In [7]:
full_dataset['intx_dict_count_prop'] = full_dataset['intx_dict_count'] / full_dataset['total_word_count']
full_dataset['medx_dict_count_prop'] = full_dataset['medx_dict_count'] / full_dataset['total_word_count']
full_dataset['wellx_dict_count_prop'] = full_dataset['wellx_dict_count'] / full_dataset['total_word_count']
full_dataset['bad_cann_dict_count_prop'] = full_dataset['bad_cann_dict_count'] / full_dataset['total_word_count']
full_dataset['good_cann_dict_count_prop'] = full_dataset['good_cann_dict_count'] / full_dataset['total_word_count']

In [8]:
full_dataset['is_commoditization'] = ((full_dataset['total_word_count'] < 15) & 
                                      (full_dataset['intx_dict_count'] == 0) & 
                                      (full_dataset['medx_dict_count'] == 0) &
                                      (full_dataset['wellx_dict_count'] == 0) &
                                    #   (full_dataset['bad_cann_dict_count'] == 0) &
                                    #   (full_dataset['good_cann_dict_count'] == 0) & 
                                      full_dataset['straindescription'].str.contains(r"(thc|cbd|%)"))
                                    #   full_dataset['straindescription'].str.contains(r"\d+")) 

full_dataset['is_empty'] = ((full_dataset['total_word_count'] < 5) &
                            (full_dataset['intx_dict_count'] == 0) &
                            (full_dataset['medx_dict_count'] == 0) &
                            (full_dataset['wellx_dict_count'] == 0) &
                            (~ full_dataset['straindescription'].str.contains(r"(thc|cbd|%)")))

  full_dataset['straindescription'].str.contains(r"(thc|cbd|%)"))
  (~ full_dataset['straindescription'].str.contains(r"(thc|cbd|%)")))


In [9]:
full_dataset.head()

Unnamed: 0,v1,address,city,description,email,price_ounce,product_id,published,scrape_number,slug,...,bad_cann_dict_count,good_cann_dict_count,total_word_count,intx_dict_count_prop,medx_dict_count_prop,wellx_dict_count_prop,bad_cann_dict_count_prop,good_cann_dict_count_prop,is_commoditization,is_empty
0,2651.0,"17517 15th Ave NE Unit B,Shoreline,Washington ...",Shoreline,20.9%-24.9% THC Shake.,recreational365@yahoo.com,65.0,31511407,True,448,365-recreational-cannabis,...,0,0,8,0.0,0.0,0.0,0.0,0.0,True,False
1,2652.0,"17517 15th Ave NE Unit B,Shoreline,Washington ...",Shoreline,20.4%-24.4% THC,recreational365@yahoo.com,0.0,31457256,True,448,365-recreational-cannabis,...,0,0,8,0.0,0.0,0.0,0.0,0.0,True,False
2,2653.0,"17517 15th Ave NE Unit B,Shoreline,Washington ...",Shoreline,18.0%-22.0% THC,recreational365@yahoo.com,0.0,30833338,True,448,365-recreational-cannabis,...,0,0,8,0.0,0.0,0.0,0.0,0.0,True,False
3,2654.0,"17517 15th Ave NE Unit B,Shoreline,Washington ...",Shoreline,14.0%-18.0% THC,recreational365@yahoo.com,210.0,31453769,True,448,365-recreational-cannabis,...,0,0,8,0.0,0.0,0.0,0.0,0.0,True,False
4,2655.0,"17517 15th Ave NE Unit B,Shoreline,Washington ...",Shoreline,18.0%-22.0% THC,recreational365@yahoo.com,0.0,28199355,True,448,365-recreational-cannabis,...,0,0,9,0.0,0.0,0.0,0.0,0.0,True,False


In [10]:
label_obs = full_dataset.groupby(['wmsite', 'scrape'])['straindescription'].count()

label_total = full_dataset.groupby(['wmsite', 'scrape'])[['intx_dict_count', 'medx_dict_count', 'wellx_dict_count', 
                                                          'bad_cann_dict_count', 'good_cann_dict_count', 'total_word_count', 
                                                          'is_commoditization', 'is_empty']].sum()
label_total.columns = list(map(lambda x: x + "_total", label_total.columns))
label_total['intx_dict_count_total_prop'] = label_total['intx_dict_count_total'] / label_total['total_word_count_total']
label_total['medx_dict_count_total_prop'] = label_total['medx_dict_count_total'] / label_total['total_word_count_total']
label_total['wellx_dict_count_total_prop'] = label_total['wellx_dict_count_total'] / label_total['total_word_count_total']
label_total['bad_cann_dict_count_total_prop'] = label_total['bad_cann_dict_count_total'] / label_total['total_word_count_total']
label_total['good_cann_dict_count_total_prop'] = label_total['good_cann_dict_count_total'] / label_total['total_word_count_total']

label_avg = full_dataset.groupby(['wmsite', 'scrape'])[['intx_dict_count_prop', 'medx_dict_count_prop', 'wellx_dict_count_prop', 
                                                        'bad_cann_dict_count_prop', 'good_cann_dict_count_prop']].mean()
label_avg.columns = list(map(lambda x: x + "_avg", label_avg.columns))

In [11]:
final = label_total.join(label_avg).join(label_obs)
final = final.rename(columns={'straindescription': 'num_obs_total'})
final['is_commoditization_prop'] = final['is_commoditization_total'] / final['num_obs_total']
final['is_empty_prop'] = final['is_empty_total'] / final['num_obs_total']
# final = final.reset_index()
final.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,intx_dict_count_total,medx_dict_count_total,wellx_dict_count_total,bad_cann_dict_count_total,good_cann_dict_count_total,total_word_count_total,is_commoditization_total,is_empty_total,intx_dict_count_total_prop,medx_dict_count_total_prop,...,bad_cann_dict_count_total_prop,good_cann_dict_count_total_prop,intx_dict_count_prop_avg,medx_dict_count_prop_avg,wellx_dict_count_prop_avg,bad_cann_dict_count_prop_avg,good_cann_dict_count_prop_avg,num_obs_total,is_commoditization_prop,is_empty_prop
wmsite,scrape,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
http://weedmaps.com/deliveries/all-time-high,101,14,7,25,1,4,1162,0,0,0.012048,0.006024,...,0.000861,0.003442,0.010953,0.004926,0.018264,0.000722,0.002713,18,0.0,0.0
http://weedmaps.com/deliveries/all-time-high,127,17,8,29,2,6,1588,0,0,0.010705,0.005038,...,0.001259,0.003778,0.009351,0.003862,0.015604,0.000869,0.003059,26,0.0,0.0
http://weedmaps.com/deliveries/bad-boy-buds-2,101,52,31,48,9,15,3406,12,0,0.015267,0.009102,...,0.002642,0.004404,0.015803,0.004346,0.010339,0.001018,0.001856,62,0.193548,0.0
http://weedmaps.com/deliveries/blue-wings-delivery,101,50,23,31,3,4,3461,0,0,0.014447,0.006645,...,0.000867,0.001156,0.015963,0.007463,0.008978,0.001047,0.001334,43,0.0,0.0
http://weedmaps.com/deliveries/blue-wings-delivery,127,64,19,94,9,4,4164,0,0,0.01537,0.004563,...,0.002161,0.000961,0.015915,0.004626,0.023034,0.002174,0.000966,68,0.0,0.0


In [12]:
# final.to_csv("data/new_dictionary_method_output.csv")

# pipeline_output = pd.read_csv('../team2_f20_wa/Processed_Data/pipeline_final_output.csv')
# pipeline_output['wmsite'] = pipeline_output['wmsite'].astype(str).str.lower()
# pipeline_output['scrape'] = pipeline_output['scrape'].fillna(-999).astype(int)
# final['scrape'] = final['scrape'].astype(int)
# pipeline_output = pd.merge(pipeline_output, final, left_on=['wmsite', 'scrape'], right_on=['wmsite', 'scrape'], how='left')
# # pipeline_output = pipeline_output.join(final, on=['wmsite', 'scrape'], how='left')

# pipeline_output.to_csv('../team2_f20_wa/Processed_Data/new_pipeline_final_output_with_logits_dictionary_method.csv', index=False)

### Remove stop words and do stemming

In [13]:
dictionary = pd.read_excel('data/dictionary.xlsx')
dictionary['word'] = dictionary['word'].astype(str).str.lower()

p = PorterStemmer()

# dict_words_by_label = dictionary.groupby('dictionary_label')['word'].apply(list)
# dict_words_by_label

In [14]:
dictionary['word'] = dictionary['word'].apply(p.stem)

In [15]:
dictionary['word']

0            high
1            fire
2           crack
3            tree
4        euphoria
          ...    
339    sleepinduc
340       cannabi
341     marijuana
342           pot
343          weed
Name: word, Length: 344, dtype: object

In [16]:
full_dataset = pd.read_csv('data/full_dataset.csv')
full_dataset['wmsite'] = full_dataset['wmsite'].astype(str).str.lower()
full_dataset['scrape'] = full_dataset['scrape'].astype(str).str.lower()
full_dataset.head()

  full_dataset = pd.read_csv('data/full_dataset.csv')


Unnamed: 0,v1,address,city,description,email,price_ounce,product_id,published,scrape_number,slug,...,thc,test_expires,has_photo,photo_filename,pageviews,ratecnt,dateupdated,delivery,rec,v39
0,2651.0,"17517 15th Ave NE Unit B,Shoreline,Washington ...",Shoreline,20.9%-24.9% THC Shake.,recreational365@yahoo.com,65.0,31511407,True,448,365-recreational-cannabis,...,,,,,,,,,,
1,2652.0,"17517 15th Ave NE Unit B,Shoreline,Washington ...",Shoreline,20.4%-24.4% THC,recreational365@yahoo.com,0.0,31457256,True,448,365-recreational-cannabis,...,,,,,,,,,,
2,2653.0,"17517 15th Ave NE Unit B,Shoreline,Washington ...",Shoreline,18.0%-22.0% THC,recreational365@yahoo.com,0.0,30833338,True,448,365-recreational-cannabis,...,,,,,,,,,,
3,2654.0,"17517 15th Ave NE Unit B,Shoreline,Washington ...",Shoreline,14.0%-18.0% THC,recreational365@yahoo.com,210.0,31453769,True,448,365-recreational-cannabis,...,,,,,,,,,,
4,2655.0,"17517 15th Ave NE Unit B,Shoreline,Washington ...",Shoreline,18.0%-22.0% THC,recreational365@yahoo.com,0.0,28199355,True,448,365-recreational-cannabis,...,,,,,,,,,,


In [17]:
def count_word_frequency_using_set(sentence, dictionary):
    words = sentence.split()  # Split the string into a list of words
    freq_sum = 0

    for word in dictionary:

        # Count the occurrences of the word in the original sentence
        freq_sum += words.count(word)
    
    return freq_sum

def count_total_word(sentence):
    words = sentence.split()
    return len(words)

In [18]:
full_dataset['straindescription'] = '"' + full_dataset['strain'].astype(str) + '" -- '+ full_dataset['description'].astype(str)
full_dataset['straindescription'] = full_dataset['straindescription'].astype(str).str.lower().apply(strip_multiple_whitespaces)
full_dataset['straindescription'] = full_dataset['straindescription'].apply(remove_stopwords)
full_dataset['straindescription'] = full_dataset['straindescription'].apply(p.stem_sentence)

full_dataset['intx_dict_count'] = full_dataset['straindescription'].apply(count_word_frequency_using_set, dictionary=dict_words_by_label['intx_dict'])
full_dataset['medx_dict_count'] = full_dataset['straindescription'].apply(count_word_frequency_using_set, dictionary=dict_words_by_label['medx_dict'])
full_dataset['wellx_dict_count'] = full_dataset['straindescription'].apply(count_word_frequency_using_set, dictionary=dict_words_by_label['wellx_dict'])
full_dataset['bad_cann_dict_count'] = full_dataset['straindescription'].apply(count_word_frequency_using_set, dictionary=dict_words_by_label['bad_cann_dict'])
full_dataset['good_cann_dict_count'] = full_dataset['straindescription'].apply(count_word_frequency_using_set, dictionary=dict_words_by_label['good_cann_dict'])
full_dataset['total_word_count'] = full_dataset['straindescription'].apply(count_total_word)

In [19]:
full_dataset['intx_dict_count_prop'] = full_dataset['intx_dict_count'] / full_dataset['total_word_count']
full_dataset['medx_dict_count_prop'] = full_dataset['medx_dict_count'] / full_dataset['total_word_count']
full_dataset['wellx_dict_count_prop'] = full_dataset['wellx_dict_count'] / full_dataset['total_word_count']
full_dataset['bad_cann_dict_count_prop'] = full_dataset['bad_cann_dict_count'] / full_dataset['total_word_count']
full_dataset['good_cann_dict_count_prop'] = full_dataset['good_cann_dict_count'] / full_dataset['total_word_count']

In [20]:
full_dataset['is_commoditization'] = ((full_dataset['total_word_count'] < 15) & 
                                      (full_dataset['intx_dict_count'] == 0) & 
                                      (full_dataset['medx_dict_count'] == 0) &
                                      (full_dataset['wellx_dict_count'] == 0) &
                                    #   (full_dataset['bad_cann_dict_count'] == 0) &
                                    #   (full_dataset['good_cann_dict_count'] == 0) & 
                                      full_dataset['straindescription'].str.contains(r"(thc|cbd|%)"))
                                    #   full_dataset['straindescription'].str.contains(r"\d+")) 

full_dataset['is_empty'] = ((full_dataset['total_word_count'] < 5) &
                            (full_dataset['intx_dict_count'] == 0) &
                            (full_dataset['medx_dict_count'] == 0) &
                            (full_dataset['wellx_dict_count'] == 0) &
                            (~ full_dataset['straindescription'].str.contains(r"(thc|cbd|%)")))

  full_dataset['straindescription'].str.contains(r"(thc|cbd|%)"))
  (~ full_dataset['straindescription'].str.contains(r"(thc|cbd|%)")))


In [21]:
full_dataset.head()

Unnamed: 0,v1,address,city,description,email,price_ounce,product_id,published,scrape_number,slug,...,bad_cann_dict_count,good_cann_dict_count,total_word_count,intx_dict_count_prop,medx_dict_count_prop,wellx_dict_count_prop,bad_cann_dict_count_prop,good_cann_dict_count_prop,is_commoditization,is_empty
0,2651.0,"17517 15th Ave NE Unit B,Shoreline,Washington ...",Shoreline,20.9%-24.9% THC Shake.,recreational365@yahoo.com,65.0,31511407,True,448,365-recreational-cannabis,...,0,0,7,0.0,0.0,0.0,0.0,0.0,True,False
1,2652.0,"17517 15th Ave NE Unit B,Shoreline,Washington ...",Shoreline,20.4%-24.4% THC,recreational365@yahoo.com,0.0,31457256,True,448,365-recreational-cannabis,...,0,0,7,0.0,0.0,0.0,0.0,0.0,True,False
2,2653.0,"17517 15th Ave NE Unit B,Shoreline,Washington ...",Shoreline,18.0%-22.0% THC,recreational365@yahoo.com,0.0,30833338,True,448,365-recreational-cannabis,...,0,0,7,0.0,0.0,0.0,0.0,0.0,True,False
3,2654.0,"17517 15th Ave NE Unit B,Shoreline,Washington ...",Shoreline,14.0%-18.0% THC,recreational365@yahoo.com,210.0,31453769,True,448,365-recreational-cannabis,...,0,0,6,0.0,0.0,0.0,0.0,0.0,True,False
4,2655.0,"17517 15th Ave NE Unit B,Shoreline,Washington ...",Shoreline,18.0%-22.0% THC,recreational365@yahoo.com,0.0,28199355,True,448,365-recreational-cannabis,...,0,0,8,0.0,0.0,0.0,0.0,0.0,True,False


In [22]:
label_obs = full_dataset.groupby(['wmsite', 'scrape'])['straindescription'].count()

label_total = full_dataset.groupby(['wmsite', 'scrape'])[['intx_dict_count', 'medx_dict_count', 'wellx_dict_count', 
                                                          'bad_cann_dict_count', 'good_cann_dict_count', 'total_word_count', 
                                                          'is_commoditization', 'is_empty']].sum()
label_total.columns = list(map(lambda x: x + "_total_st", label_total.columns))
label_total['intx_dict_count_total_prop_st'] = label_total['intx_dict_count_total_st'] / label_total['total_word_count_total_st']
label_total['medx_dict_count_total_prop_st'] = label_total['medx_dict_count_total_st'] / label_total['total_word_count_total_st']
label_total['wellx_dict_count_total_prop_st'] = label_total['wellx_dict_count_total_st'] / label_total['total_word_count_total_st']
label_total['bad_cann_dict_count_total_prop_st'] = label_total['bad_cann_dict_count_total_st'] / label_total['total_word_count_total_st']
label_total['good_cann_dict_count_total_prop_st'] = label_total['good_cann_dict_count_total_st'] / label_total['total_word_count_total_st']

label_avg = full_dataset.groupby(['wmsite', 'scrape'])[['intx_dict_count_prop', 'medx_dict_count_prop', 'wellx_dict_count_prop', 
                                                        'bad_cann_dict_count_prop', 'good_cann_dict_count_prop']].mean()
label_avg.columns = list(map(lambda x: x + "_avg_st", label_avg.columns))

In [23]:
final_st = label_total.join(label_avg).join(label_obs)
final_st = final_st.rename(columns={'straindescription': 'num_obs_total_st'})
final_st['is_commoditization_prop_st'] = final_st['is_commoditization_total_st'] / final_st['num_obs_total_st']
final_st['is_empty_prop_st'] = final_st['is_empty_total_st'] / final_st['num_obs_total_st']
# final_st = final_st.reset_index()
final_st.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,intx_dict_count_total_st,medx_dict_count_total_st,wellx_dict_count_total_st,bad_cann_dict_count_total_st,good_cann_dict_count_total_st,total_word_count_total_st,is_commoditization_total_st,is_empty_total_st,intx_dict_count_total_prop_st,medx_dict_count_total_prop_st,...,bad_cann_dict_count_total_prop_st,good_cann_dict_count_total_prop_st,intx_dict_count_prop_avg_st,medx_dict_count_prop_avg_st,wellx_dict_count_prop_avg_st,bad_cann_dict_count_prop_avg_st,good_cann_dict_count_prop_avg_st,num_obs_total_st,is_commoditization_prop_st,is_empty_prop_st
wmsite,scrape,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
http://weedmaps.com/deliveries/all-time-high,101,14,4,11,1,0,698,0,0,0.020057,0.005731,...,0.001433,0.0,0.020138,0.004437,0.013165,0.001292,0.0,18,0.0,0.0
http://weedmaps.com/deliveries/all-time-high,127,17,5,14,2,0,962,0,0,0.017672,0.005198,...,0.002079,0.0,0.018867,0.003798,0.011291,0.00162,0.0,26,0.0,0.0
http://weedmaps.com/deliveries/bad-boy-buds-2,101,46,17,22,9,0,2234,16,0,0.020591,0.00761,...,0.004029,0.0,0.018126,0.003577,0.004172,0.001666,0.0,62,0.258065,0.0
http://weedmaps.com/deliveries/blue-wings-delivery,101,25,10,11,3,0,2179,0,0,0.011473,0.004589,...,0.001377,0.0,0.012558,0.004702,0.00563,0.00182,0.0,43,0.0,0.0
http://weedmaps.com/deliveries/blue-wings-delivery,127,42,7,41,9,0,2786,0,0,0.015075,0.002513,...,0.00323,0.0,0.015434,0.002889,0.0152,0.003696,0.0,68,0.0,0.0


#### Join two methods

In [24]:
two_methods = final.join(final_st).reset_index()
two_methods.head()

Unnamed: 0,wmsite,scrape,intx_dict_count_total,medx_dict_count_total,wellx_dict_count_total,bad_cann_dict_count_total,good_cann_dict_count_total,total_word_count_total,is_commoditization_total,is_empty_total,...,bad_cann_dict_count_total_prop_st,good_cann_dict_count_total_prop_st,intx_dict_count_prop_avg_st,medx_dict_count_prop_avg_st,wellx_dict_count_prop_avg_st,bad_cann_dict_count_prop_avg_st,good_cann_dict_count_prop_avg_st,num_obs_total_st,is_commoditization_prop_st,is_empty_prop_st
0,http://weedmaps.com/deliveries/all-time-high,101,14,7,25,1,4,1162,0,0,...,0.001433,0.0,0.020138,0.004437,0.013165,0.001292,0.0,18,0.0,0.0
1,http://weedmaps.com/deliveries/all-time-high,127,17,8,29,2,6,1588,0,0,...,0.002079,0.0,0.018867,0.003798,0.011291,0.00162,0.0,26,0.0,0.0
2,http://weedmaps.com/deliveries/bad-boy-buds-2,101,52,31,48,9,15,3406,12,0,...,0.004029,0.0,0.018126,0.003577,0.004172,0.001666,0.0,62,0.258065,0.0
3,http://weedmaps.com/deliveries/blue-wings-deli...,101,50,23,31,3,4,3461,0,0,...,0.001377,0.0,0.012558,0.004702,0.00563,0.00182,0.0,43,0.0,0.0
4,http://weedmaps.com/deliveries/blue-wings-deli...,127,64,19,94,9,4,4164,0,0,...,0.00323,0.0,0.015434,0.002889,0.0152,0.003696,0.0,68,0.0,0.0


In [25]:
two_methods.to_csv("data/dictionary_method_output_st.csv")

In [26]:
pipeline_output = pd.read_csv('../team2_f20_wa/Processed_Data/pipeline_final_output.csv')
pipeline_output['wmsite'] = pipeline_output['wmsite'].astype(str).str.lower()
pipeline_output['scrape'] = pipeline_output['scrape'].fillna(-999).astype(int)
two_methods['scrape'] = two_methods['scrape'].astype(int)
pipeline_output = pd.merge(pipeline_output, two_methods, left_on=['wmsite', 'scrape'], right_on=['wmsite', 'scrape'], how='left')
# pipeline_output = pipeline_output.join(final, on=['wmsite', 'scrape'], how='left')


  pipeline_output = pd.read_csv('../team2_f20_wa/Processed_Data/pipeline_final_output.csv')


In [27]:
pipeline_output.to_csv('data/pipeline_final_output_with_dictionary_method.csv', index=False)

## Applying to dispensary name

### Keep stop words and no stemming

In [28]:
dictionary = pd.read_excel('data/dictionary.xlsx')
dictionary['word'] = dictionary['word'].astype(str).str.lower()
dict_words_by_label = dictionary.groupby('dictionary_label')['word'].apply(list)

full_dataset = pd.read_csv('data/full_dataset.csv')
full_dataset['wmsite'] = full_dataset['wmsite'].astype(str).str.lower()
full_dataset['scrape'] = full_dataset['scrape'].astype(str).str.lower()
full_dataset.head()

def count_word_frequency_using_set(sentence, dictionary):
    if sentence == "":
        return 0
    words = sentence.split("-")  # Split the string into a list of words
    freq_sum = 0

    for word in dictionary:

        # Count the occurrences of the word in the original sentence
        freq_sum += words.count(word)
    
    return freq_sum

def count_total_word(sentence):
    if sentence == "":
        return 0
    words = sentence.split("-")
    return len(words)

  full_dataset = pd.read_csv('data/full_dataset.csv')


In [29]:
full_dataset['slug'] = full_dataset['slug'].fillna("").astype(str).str.lower().apply(strip_multiple_whitespaces)
full_dataset['intx_dict_count'] = full_dataset['slug'].apply(count_word_frequency_using_set, dictionary=dict_words_by_label['intx_dict'])
full_dataset['medx_dict_count'] = full_dataset['slug'].apply(count_word_frequency_using_set, dictionary=dict_words_by_label['medx_dict'])
full_dataset['wellx_dict_count'] = full_dataset['slug'].apply(count_word_frequency_using_set, dictionary=dict_words_by_label['wellx_dict'])
full_dataset['bad_cann_dict_count'] = full_dataset['slug'].apply(count_word_frequency_using_set, dictionary=dict_words_by_label['bad_cann_dict'])
full_dataset['good_cann_dict_count'] = full_dataset['slug'].apply(count_word_frequency_using_set, dictionary=dict_words_by_label['good_cann_dict'])
full_dataset['total_word_count'] = full_dataset['slug'].apply(count_total_word)

full_dataset['intx_dict_count_prop'] = full_dataset['intx_dict_count'] / full_dataset['total_word_count']
full_dataset['medx_dict_count_prop'] = full_dataset['medx_dict_count'] / full_dataset['total_word_count']
full_dataset['wellx_dict_count_prop'] = full_dataset['wellx_dict_count'] / full_dataset['total_word_count']
full_dataset['bad_cann_dict_count_prop'] = full_dataset['bad_cann_dict_count'] / full_dataset['total_word_count']
full_dataset['good_cann_dict_count_prop'] = full_dataset['good_cann_dict_count'] / full_dataset['total_word_count']

full_dataset['is_commoditization'] = ((full_dataset['total_word_count'] < 15) & 
                                      (full_dataset['intx_dict_count'] == 0) & 
                                      (full_dataset['medx_dict_count'] == 0) &
                                      (full_dataset['wellx_dict_count'] == 0) &
                                    #   (full_dataset['bad_cann_dict_count'] == 0) &
                                    #   (full_dataset['good_cann_dict_count'] == 0) & 
                                      full_dataset['slug'].str.contains(r"(thc|cbd|%)"))
                                    #   full_dataset['straindescription'].str.contains(r"\d+")) 

full_dataset['is_empty'] = ((full_dataset['total_word_count'] < 5) &
                            (full_dataset['intx_dict_count'] == 0) &
                            (full_dataset['medx_dict_count'] == 0) &
                            (full_dataset['wellx_dict_count'] == 0) &
                            (~ full_dataset['slug'].str.contains(r"(thc|cbd|%)")))

label_obs = full_dataset.groupby(['wmsite', 'scrape'])['slug'].count()

label_group = full_dataset.groupby(['wmsite', 'scrape'])[['intx_dict_count_prop', 'medx_dict_count_prop', 'wellx_dict_count_prop', 
                                                          'bad_cann_dict_count_prop', 'good_cann_dict_count_prop', 'total_word_count', 
                                                          'is_commoditization', 'is_empty']].first()

  full_dataset['slug'].str.contains(r"(thc|cbd|%)"))
  (~ full_dataset['slug'].str.contains(r"(thc|cbd|%)")))


In [30]:
final = label_group.join(label_obs)
final = final.rename(columns={'slug': 'num_obs_total'})
final

Unnamed: 0_level_0,Unnamed: 1_level_0,intx_dict_count_prop,medx_dict_count_prop,wellx_dict_count_prop,bad_cann_dict_count_prop,good_cann_dict_count_prop,total_word_count,is_commoditization,is_empty,num_obs_total
wmsite,scrape,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
http://weedmaps.com/deliveries/all-time-high,101,,,,,,0,False,True,18
http://weedmaps.com/deliveries/all-time-high,127,,,,,,0,False,True,26
http://weedmaps.com/deliveries/bad-boy-buds-2,101,,,,,,0,False,True,62
http://weedmaps.com/deliveries/blue-wings-delivery,101,,,,,,0,False,True,43
http://weedmaps.com/deliveries/blue-wings-delivery,127,,,,,,0,False,True,68
...,...,...,...,...,...,...,...,...,...,...
https://weedmaps.com/dispensaries/xanders-tacoma,215,,,,,,0,False,True,46
https://weedmaps.com/dispensaries/yakima-weed-company,215,,,,,,0,False,True,413
https://weedmaps.com/dispensaries/yakima-weed-company,227,0.333333,0.0,0.0,0.333333,0.0,3,False,False,361
https://weedmaps.com/dispensaries/yakima-weed-company,242,0.333333,0.0,0.0,0.333333,0.0,3,False,False,379


### Remove stop words and do stemming

In [31]:
dictionary = pd.read_excel('data/dictionary.xlsx')
dictionary['word'] = dictionary['word'].astype(str).str.lower()

p = PorterStemmer()

dictionary['word'] = dictionary['word'].apply(p.stem)

full_dataset = pd.read_csv('data/full_dataset.csv')
full_dataset['wmsite'] = full_dataset['wmsite'].astype(str).str.lower()
full_dataset['scrape'] = full_dataset['scrape'].astype(str).str.lower()

  full_dataset = pd.read_csv('data/full_dataset.csv')


In [32]:
def count_word_frequency_using_set(sentence, dictionary):
    if sentence == "":
        return 0
    words = sentence.split("-")  # Split the string into a list of words
    freq_sum = 0

    for word in dictionary:

        # Count the occurrences of the word in the original sentence
        freq_sum += words.count(word)
    
    return freq_sum

def count_total_word(sentence):
    if sentence == "":
        return 0
    words = sentence.split("-")
    return len(words)

In [33]:
full_dataset['slug'] = full_dataset['slug'].fillna("").astype(str).str.lower().apply(strip_multiple_whitespaces)
full_dataset['slug'] = full_dataset['slug'].apply(remove_stopwords).apply(p.stem_sentence)
full_dataset['intx_dict_count_st'] = full_dataset['slug'].apply(count_word_frequency_using_set, dictionary=dict_words_by_label['intx_dict'])
full_dataset['medx_dict_count_st'] = full_dataset['slug'].apply(count_word_frequency_using_set, dictionary=dict_words_by_label['medx_dict'])
full_dataset['wellx_dict_count_st'] = full_dataset['slug'].apply(count_word_frequency_using_set, dictionary=dict_words_by_label['wellx_dict'])
full_dataset['bad_cann_dict_count_st'] = full_dataset['slug'].apply(count_word_frequency_using_set, dictionary=dict_words_by_label['bad_cann_dict'])
full_dataset['good_cann_dict_count_st'] = full_dataset['slug'].apply(count_word_frequency_using_set, dictionary=dict_words_by_label['good_cann_dict'])
full_dataset['total_word_count_st'] = full_dataset['slug'].apply(count_total_word)

full_dataset['intx_dict_count_prop_st'] = full_dataset['intx_dict_count_st'] / full_dataset['total_word_count_st']
full_dataset['medx_dict_count_prop_st'] = full_dataset['medx_dict_count_st'] / full_dataset['total_word_count_st']
full_dataset['wellx_dict_count_prop_st'] = full_dataset['wellx_dict_count_st'] / full_dataset['total_word_count_st']
full_dataset['bad_cann_dict_count_prop_st'] = full_dataset['bad_cann_dict_count_st'] / full_dataset['total_word_count_st']
full_dataset['good_cann_dict_count_prop_st'] = full_dataset['good_cann_dict_count_st'] / full_dataset['total_word_count_st']

full_dataset['is_commoditization_st'] = ((full_dataset['total_word_count_st'] < 15) & 
                                      (full_dataset['intx_dict_count_st'] == 0) & 
                                      (full_dataset['medx_dict_count_st'] == 0) &
                                      (full_dataset['wellx_dict_count_st'] == 0) &
                                    #   (full_dataset['bad_cann_dict_count'] == 0) &
                                    #   (full_dataset['good_cann_dict_count'] == 0) & 
                                      full_dataset['slug'].str.contains(r"(thc|cbd|%)"))
                                    #   full_dataset['straindescription'].str.contains(r"\d+")) 

full_dataset['is_empty_st'] = ((full_dataset['total_word_count_st'] < 5) &
                            (full_dataset['intx_dict_count_st'] == 0) &
                            (full_dataset['medx_dict_count_st'] == 0) &
                            (full_dataset['wellx_dict_count_st'] == 0) &
                            (~ full_dataset['slug'].str.contains(r"(thc|cbd|%)")))

label_obs = full_dataset.groupby(['wmsite', 'scrape'])['slug'].count()

label_group = full_dataset.groupby(['wmsite', 'scrape'])[['intx_dict_count_prop_st', 'medx_dict_count_prop_st', 'wellx_dict_count_prop_st', 
                                                          'bad_cann_dict_count_prop_st', 'good_cann_dict_count_prop_st', 'total_word_count_st', 
                                                          'is_commoditization_st', 'is_empty_st']].first()

final_st = label_group.join(label_obs)
final_st = final_st.rename(columns={'slug': 'num_obs_total_st'})
final_st

  full_dataset['slug'].str.contains(r"(thc|cbd|%)"))
  (~ full_dataset['slug'].str.contains(r"(thc|cbd|%)")))


Unnamed: 0_level_0,Unnamed: 1_level_0,intx_dict_count_prop_st,medx_dict_count_prop_st,wellx_dict_count_prop_st,bad_cann_dict_count_prop_st,good_cann_dict_count_prop_st,total_word_count_st,is_commoditization_st,is_empty_st,num_obs_total_st
wmsite,scrape,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
http://weedmaps.com/deliveries/all-time-high,101,,,,,,0,False,True,18
http://weedmaps.com/deliveries/all-time-high,127,,,,,,0,False,True,26
http://weedmaps.com/deliveries/bad-boy-buds-2,101,,,,,,0,False,True,62
http://weedmaps.com/deliveries/blue-wings-delivery,101,,,,,,0,False,True,43
http://weedmaps.com/deliveries/blue-wings-delivery,127,,,,,,0,False,True,68
...,...,...,...,...,...,...,...,...,...,...
https://weedmaps.com/dispensaries/xanders-tacoma,215,,,,,,0,False,True,46
https://weedmaps.com/dispensaries/yakima-weed-company,215,,,,,,0,False,True,413
https://weedmaps.com/dispensaries/yakima-weed-company,227,0.333333,0.0,0.0,0.333333,0.0,3,False,False,361
https://weedmaps.com/dispensaries/yakima-weed-company,242,0.333333,0.0,0.0,0.333333,0.0,3,False,False,379


#### Join two methods

In [34]:
two_methods = final.join(final_st).reset_index()
two_methods.to_csv("data/dispensary_name_dictionary_method_output_st.csv")
pipeline_output = pd.read_csv('../team2_f20_wa/Processed_Data/pipeline_final_output.csv')
pipeline_output['wmsite'] = pipeline_output['wmsite'].astype(str).str.lower()
pipeline_output['scrape'] = pipeline_output['scrape'].fillna(-999).astype(int)
two_methods['scrape'] = two_methods['scrape'].astype(int)
pipeline_output = pd.merge(pipeline_output, two_methods, left_on=['wmsite', 'scrape'], right_on=['wmsite', 'scrape'], how='left')
pipeline_output.to_csv('data/dispensary_name_pipeline_final_output_with_dictionary_method.csv', index=False)

  pipeline_output = pd.read_csv('../team2_f20_wa/Processed_Data/pipeline_final_output.csv')


: 

## Application to create alternative specification

In [2]:
import pandas as pd
import numpy as np

from gensim.parsing import remove_stopwords, strip_numeric, strip_punctuation, strip_multiple_whitespaces
from gensim.parsing.porter import PorterStemmer

dictionary = pd.read_excel('data/dictionary.xlsx')
dictionary['word'] = dictionary['word'].astype(str).str.lower()
dict_words_by_label = dictionary.groupby('dictionary_label')['word'].apply(list)
dict_words_by_label

full_dataset = pd.read_csv('data/full_dataset_with_labels.csv')

  data = pd.read_csv('data/full_dataset_with_labels.csv')


In [3]:
def count_word_frequency_using_set(sentence, dictionary, sep=" "):
    words = sentence.split(sep)  # Split the string into a list of words
    freq_sum = 0

    for word in dictionary:

        # Count the occurrences of the word in the original sentence
        freq_sum += words.count(word)
    
    return freq_sum

def count_total_word(sentence):
    words = sentence.split()
    return len(words)

In [None]:
full_dataset['straindescription'] = '"' + full_dataset['strain'].astype(str) + '" -- '+ full_dataset['description'].astype(str)
full_dataset['straindescription'] = full_dataset['straindescription'].astype(str).str.lower().apply(strip_multiple_whitespaces)
full_dataset['intx_dict_count'] = full_dataset['straindescription'].apply(count_word_frequency_using_set, dictionary=dict_words_by_label['intx_dict'])
full_dataset['medx_dict_count'] = full_dataset['straindescription'].apply(count_word_frequency_using_set, dictionary=dict_words_by_label['medx_dict'])
full_dataset['wellx_dict_count'] = full_dataset['straindescription'].apply(count_word_frequency_using_set, dictionary=dict_words_by_label['wellx_dict'])
full_dataset['total_word_count'] = full_dataset['straindescription'].apply(count_total_word)

full_dataset['is_empty'] = ((full_dataset['total_word_count'] < 5) &
                            (full_dataset['intx_dict_count'] == 0) &
                            (full_dataset['medx_dict_count'] == 0) &
                            (full_dataset['wellx_dict_count'] == 0) &
                            (~ full_dataset['straindescription'].str.contains(r"(thc|cbd|%)")))