### Importing Necessary Packages

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

import time
from tqdm import tqdm_notebook as tqdm

import matplotlib.pyplot as plt
%matplotlib inline  
import seaborn as sns

from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.filterwarnings('ignore')

In [2]:
#setting options to display entire dataframes

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
#None means "never truncate cell text"; the older -1 sentinel is deprecated
#and is rejected by recent pandas releases
pd.set_option('display.max_colwidth', None)

### Retrieving Data

I need to scrape the www.leafly.com/explore/sort-alpha page to build 
the urls for each strain's webpage on leafly.com. After some exploration,
I discovered that www.leafly.com/explore/sort-alpha was masking its 
page numbers. Realizing this made it easier to build all of the 
strain's urls.

1) Scrape for strain names and build url df - 

In [None]:
#https://www.leafly.com/explore/page-61/sort-alpha is the last listing page,
#so page numbers 0 through 61 are requested

pages = 61
strain_names = []
websites = []

for i in range(pages+1):
    page_url = "https://www.leafly.com/explore/page-" + str(i) + "/sort-alpha"
    soup = BeautifulSoup(requests.get(page_url).text, 'html.parser')

    #every strain tile is an anchor whose href is the strain's relative path
    for tile in soup.findAll("a", {"class": "ga_Explore_Strain_Tile"}):
        href = tile.get('href')
        strain_names.append(href)
        websites.append("https://www.leafly.com" + href)

#relative strain paths -> dataframe -> csv
strains_df = pd.DataFrame(strain_names, columns=['strain_name'])
strains_df.to_csv('strains.csv')

#absolute strain urls -> dataframe -> csv
url_df = pd.DataFrame(websites, columns=['url'])
url_df.to_csv('urls.csv')

I need to start building a master dataframe with the strain name,
type, attributes and flavors. Here I start by extracting the 
strain name and strain type from the html scrape. I frequently saved 
my dataframes to csvs since there is always a chance of losing 
your data and I did not want to have to repeatedly scrape. 

In [None]:
#parse strain_name: split every path into its word tokens
#(runs of letters, digits, underscores and apostrophes)
parsed = [re.findall(r"[\w']+", path) for path in strains_df['strain_name']]

In [None]:
#create name and strain type df: the first token goes to 'strain',
#the following tokens fill name1 .. name6
name_piece_columns = ['name' + str(k) for k in range(1, 7)]
strain_type_df = pd.DataFrame(parsed, columns=['strain'] + name_piece_columns)

In [None]:
#drop the last 5 rows - just fillers
#(tail(5).index selects rows of strain_type_df, not columns)
strain_type_df.drop(strain_type_df.tail(5).index, inplace=True)

In [None]:
#merge name columns: join the non-null pieces of each row (every column
#after the first) into one space-separated 'name'
piece_columns = strain_type_df.columns[1:]
strain_type_df['name'] = strain_type_df[piece_columns].apply(
    lambda row: ' '.join(row.dropna().astype(str)), axis=1)

In [None]:
#drop unnecessary columns: name1 .. name6 are redundant once 'name' exists
strain_type_df.drop(columns=['name' + str(k) for k in range(1, 7)], inplace=True)

In [None]:
#save the strain/name dataframe to strain_type.csv so the listing pages
#do not have to be scraped again
strain_type_df.to_csv('strain_type.csv')

In [None]:
#number of duplicated strain names in strain_type_df
strain_type_df['name'].duplicated().sum()

In [None]:
#number of urls that repeat an earlier url
url_df['url'].duplicated().sum()

In [None]:
#remove the repeated url rows in place, keeping the first occurrence
url_df.drop_duplicates(inplace=True)

In [None]:
#renumber the rows 0..n-1 after the duplicates were dropped; the result is
#assigned back because reset_index() returns a new frame instead of
#modifying url_df
url_df = url_df.reset_index(drop=True)

In [None]:
#number of strain urls left after de-duplication
len(url_df)

2) Scrape each strain url - 

In [None]:
#scraping for html code of all of the leafly strains
#(the de-duplicated url frame built above is named url_df)
urls_total = url_df['url']

#scrape individual strain pages, keeping one parsed soup per url
soups_total = []

for url in urls_total:
    html = requests.get(url).text
    soup = BeautifulSoup(html,'html.parser')
    soups_total.append(soup)
    #pause one second between requests
    time.sleep(1)

### Get Features From Tags

In addition to the strain name and strain type, my final dataframe 
will consist of the positive effects, medical purposes, negative 
effects and flavors. I also scraped for parental lineage but did not 
end up using this data.

In [None]:
#collect positive effects

effect = []
filename1 = "effect.csv"

for soup in tqdm(soups_total):
    #the "active" link's href is split on '/'; its third segment is the
    #strain key used for this page
    link = soup.find("a", {"class": "active"})
    try:
        strain = link.get('href').split('/')
        key = strain[2]
    except (AttributeError, IndexError):
        #no usable strain link: skip the page instead of filing it under
        #the key left over from a previous page
        continue

    div = soup.find('div', {'id': 'effects-tab-content'})
    labels = div.find_all('div', {'class': "histogram-label"}) if div is not None else []

    #keep the label texts when one to five were found, otherwise record None
    values = [label.text for label in labels] if 1 <= len(labels) <= 5 else None
    effect.append({key: values})

#create df and save to csv
df_effect = pd.DataFrame(effect)
df_effect.to_csv(filename1)

#collect medical attributes

med = []
filename2 = "med.csv"

for soup in tqdm(soups_total):
    #the "active" link's href is split on '/'; its third segment is the
    #strain key used for this page
    link = soup.find("a", {"class": "active"})
    try:
        strain = link.get('href').split('/')
        key = strain[2]
    except (AttributeError, IndexError):
        #no usable strain link: skip the page instead of filing it under
        #the key left over from a previous page
        continue

    div = soup.find('div', {'id': 'medical-tab-content'})
    labels = div.find_all('div', {'class': "histogram-label"}) if div is not None else []

    #keep the label texts when one to five were found, otherwise record None
    values = [label.text for label in labels] if 1 <= len(labels) <= 5 else None
    med.append({key: values})

#create df and save to csv
df_med = pd.DataFrame(med)
df_med.to_csv(filename2)

#collect negative attributes

neg = []
filename3 = "neg.csv"

for soup in tqdm(soups_total):
    #the "active" link's href is split on '/'; its third segment is the
    #strain key used for this page
    link = soup.find("a", {"class": "active"})
    try:
        strain = link.get('href').split('/')
        key = strain[2]
    except (AttributeError, IndexError):
        #no usable strain link: skip the page instead of filing it under
        #the key left over from a previous page
        continue

    div = soup.find('div', {'id': 'negatives-tab-content'})
    labels = div.find_all('div', {'class': "histogram-label"}) if div is not None else []

    #keep the label texts when one to five were found, otherwise record None
    values = [label.text for label in labels] if 1 <= len(labels) <= 5 else None
    neg.append({key: values})

#create df and save to csv
df_neg = pd.DataFrame(neg)
df_neg.to_csv(filename3)

#collect flavors

flavors = []
filename4 = "flavors.csv"

for soup in tqdm(soups_total):
    #the "active" link's href is split on '/'; its third segment is the
    #strain key used for this page
    link = soup.find("a", {"class": "active"})
    try:
        strain = link.get('href').split('/')
        key = strain[2]
    except (AttributeError, IndexError):
        #no usable strain link: skip the page instead of filing it under
        #the key left over from a previous page
        continue

    flavor_tags = soup.find_all('div', attrs={"class": "flavor-name"})

    #keep the flavor texts when one to three were found, otherwise record None
    values = [tag.text for tag in flavor_tags] if 1 <= len(flavor_tags) <= 3 else None
    flavors.append({key: values})

#create df and save to csv
df_flavors = pd.DataFrame(flavors)
df_flavors.to_csv(filename4)

#collect parents of strains

parents = []

filename5 = 'parents.csv'

for soup in tqdm(soups_total):
    children = soup.find_all('div', attrs={"class": "strain-tile-footer"})
    if not children:
        #no strain tiles on this page: record nothing rather than
        #re-appending the entry built for an earlier page
        continue

    #the first tile's text is the key; the second and third tiles, when
    #exactly that many are present, are stored as the parent value(s)
    key = children[0].text
    if len(children) == 3:
        values = [children[1].text, children[2].text]
    elif len(children) == 2:
        values = children[1].text
    else:
        values = None
    parents.append({key: values})

#create df and save to csv
df_parents = pd.DataFrame(parents)
df_parents.to_csv(filename5)

### Clean and Engineer Individual Dataframes
#### Import CSVs

In [None]:
#import categories - hybrid, sativa, indica
#strain_type.csv from above was renamed as 
#strain_category.csv in pages
cat_df = pd.read_csv('strain_category.csv')

#import the scraped attribute tables, in this order:
#flavors, parents, effects, medical, negative
flav_df, par_df, eff_df, med_df, neg_df = [
    pd.read_csv(csv_name)
    for csv_name in ('flavors.csv', 'parents.csv', 'effect.csv',
                     'med.csv', 'neg.csv')
]

#### Pivot Tables/Table Formatting

In [None]:
#helper: drop the unnecessary first column
def drop_column(df):
    """Remove the 'Unnamed: 0' column from ``df`` in place.

    Returns None. Raises KeyError when the column is not present.
    """
    df.drop(columns=['Unnamed: 0'], inplace=True)

In [None]:
#drop first column on all dfs
for frame in (flav_df, par_df, eff_df, med_df, neg_df, cat_df):
    drop_column(frame)

In [None]:
#pivot tables so that column names (strains) are now indices
flav_series, par_series, eff_series, med_series, neg_series = [
    frame.stack() for frame in (flav_df, par_df, eff_df, med_df, neg_df)
]

#convert each stacked series into a one-column dataframe
flav_df = flav_series.to_frame(name='flavor')
par_df = par_series.to_frame(name='parent')
eff_df = eff_series.to_frame(name='effect')
med_df = med_series.to_frame(name='medical')
neg_df = neg_series.to_frame(name='negative')

#### Cleaning Data

In [None]:
# helper: turn the index levels into ordinary columns
def reset_index(df):
    """Flatten the index of ``df`` into columns, in place.

    The 'level_0' column created by the reset is deleted and 'level_1'
    is renamed to 'strain'. Returns None.
    """
    df.reset_index(inplace=True)
    del df['level_0']
    df.rename(columns={'level_1': 'strain'}, inplace=True)

In [None]:
#reset index on all dfs
for frame in (flav_df, par_df, eff_df, med_df, neg_df):
    reset_index(frame)

In [None]:
#clean/standardize all dfs 

#cat_df.reset_index('name', inplace=True)
#in the category table 'strain' becomes 'category' and 'name' becomes 'strain'
cat_df.rename(columns={'strain': 'category', 'name': 'strain'}, inplace=True)

#remove $ from parents table
par_df['strain'] = par_df['strain'].map(lambda label: label.strip('$'))

#make all entries lowercase (every column is cast to str first)
as_lower = lambda col: col.astype(str).str.lower()
flav_df = flav_df.apply(as_lower)
par_df = par_df.apply(as_lower)
eff_df = eff_df.apply(as_lower)
med_df = med_df.apply(as_lower)
neg_df = neg_df.apply(as_lower)
cat_df = cat_df.apply(as_lower)

#replace - with space in all tables
for frame in (flav_df, par_df, eff_df, med_df, neg_df, cat_df):
    frame['strain'] = frame['strain'].str.replace('-', ' ')

# Functions
# #make all data lowercase
# def lowercase(df):
#     df = df.apply(lambda x: x.astype(str).str.lower())
#     return
    
# #replace dash with space in all tables
# def replace_dash(df, column):
#     df[column] = df[column].str.replace('-',' ')
#     return

In [None]:
#more cleaning/standardizing all dfs 

#function to remove [,], and ' characters
def remove_char(df, column):
    """Strip list punctuation from the string column ``column`` of ``df``, in place.

    '[' and ']' are each replaced by a single space and "'" is removed.
    ``regex=False`` is passed so the brackets are always matched literally
    and never interpreted as regular-expression syntax.
    Returns None.
    """
    for char, replacement in (('[', ' '), (']', ' '), ("'", '')):
        df[column] = df[column].str.replace(char, replacement, regex=False)
    
#function to split positive, medical and negative attributes
#from being aggregated in 1 column to each their own column
def new_columns_5(df, column):
    """Split the comma-separated strings in ``column`` into five columns, in place.

    Adds ``<column>_1`` .. ``<column>_5`` and drops ``column``. Anything
    after the fourth comma stays together in ``<column>_5``; rows with
    fewer than five values get missing values in the unused columns.
    Returns None.
    """
    # One split with at most four cuts yields up to five pieces. reindex
    # guarantees all five piece columns exist even when no row holds five
    # values, so the assignments below cannot raise KeyError.
    pieces = (df[column].str.split(',', n=4, expand=True)
              .reindex(columns=range(5)))
    for position in range(5):
        df[str(column) + '_' + str(position + 1)] = pieces[position]

    #dropping old name columns
    df.drop(columns=[column], inplace=True)

#function to split flavor attributes from being aggregated in 1 column 
#to each their own column
def new_columns_3(df, column):
    """Split the comma-separated strings in ``column`` into three columns, in place.

    Adds ``<column>_1`` .. ``<column>_3`` and drops ``column``. Anything
    after the third comma is discarded; rows with fewer than three values
    get missing values in the unused columns. Returns None.
    """
    # Split with at most three cuts and keep only the first three pieces.
    # reindex guarantees those three piece columns exist even when no row
    # holds three values, so the assignments below cannot raise KeyError.
    pieces = (df[column].str.split(',', n=3, expand=True)
              .reindex(columns=range(3)))
    for position in range(3):
        df[str(column) + '_' + str(position + 1)] = pieces[position]

    #dropping old name columns
    df.drop(columns=[column], inplace=True)
    
#function to split parent from being aggregated in 1 column 
#to each their own column
def new_columns_2(df, column):
    """Split the strings in ``column`` at the first comma into two columns, in place.

    Adds ``<column>_1`` (text before the first comma) and ``<column>_2``
    (everything after it, or a missing value when there is no comma) and
    drops ``column``. Returns None.
    """
    # reindex guarantees both piece columns exist even when no row
    # contains a comma, so the assignments below cannot raise KeyError.
    pieces = (df[column].str.split(',', n=1, expand=True)
              .reindex(columns=range(2)))
    df[str(column) + '_1'] = pieces[0]
    df[str(column) + '_2'] = pieces[1]

    #dropping old name columns
    df.drop(columns=[column], inplace=True)

In [None]:
#applying functions 

#strip the list punctuation from every attribute column
for frame, attribute in ((flav_df, 'flavor'), (par_df, 'parent'),
                         (eff_df, 'effect'), (med_df, 'medical'),
                         (neg_df, 'negative'), (cat_df, 'category')):
    remove_char(frame, attribute)

#effects, medical and negative attributes are split into five columns each
for frame, attribute in ((eff_df, 'effect'), (med_df, 'medical'),
                         (neg_df, 'negative')):
    new_columns_5(frame, attribute)

#flavors are split into three columns, parents into two
new_columns_3(flav_df, 'flavor')
new_columns_2(par_df, 'parent')

In [None]:
#more cleanup: remove any apostrophes from the strain names in the parents table
par_df['strain'] = par_df['strain'].str.replace("'",'')

In [None]:
#final cleaned and formatted dfs and csvs

(flav_final_df, par_final_df, eff_final_df,
 med_final_df, neg_final_df, cat_final_df) = [
    pd.DataFrame(frame)
    for frame in (flav_df, par_df, eff_df, med_df, neg_df, cat_df)
]

#write each final table to its own csv
for final_frame, csv_name in ((flav_final_df, 'flav_final_df.csv'),
                              (par_final_df, 'par_final_df.csv'),
                              (eff_final_df, 'eff_final_df.csv'),
                              (med_final_df, 'med_final_df.csv'),
                              (neg_final_df, 'neg_final_df.csv'),
                              (cat_final_df, 'cat_final_df.csv')):
    final_frame.to_csv(csv_name)

### Create Final Merged Dataframe From Individual Dataframes

In [None]:
#import final csvs (flavors, parents, effects, medical, negative, category)

flav_df, par_df, eff_df, med_df, neg_df, cat_df = [
    pd.read_csv(csv_name)
    for csv_name in ('flav_final_df.csv', 'par_final_df.csv',
                     'eff_final_df.csv', 'med_final_df.csv',
                     'neg_final_df.csv', 'cat_final_df.csv')
]

In [None]:
#create master dataframe

#drop unnecessary first column
for frame in (flav_df, par_df, eff_df, med_df, neg_df, cat_df):
    drop_column(frame)

#merge df to create master df: outer-merge each attribute table onto the
#category table in turn, removing duplicate rows after every merge
final_df = cat_df
for attribute_df in (eff_df, med_df, neg_df, flav_df, par_df):
    final_df = pd.merge(final_df, attribute_df, how='outer')
    final_df.drop_duplicates(inplace=True)

#final_df.to_csv('final_df.csv')

### Analysis Using CountVectorizer and Cosine Similarity

Some strains didn't have any values for Effects, Medical and Negative 
Attributes and I chose to remove those strains from the dataframe.
I did this in pages and saved the csv as master_final_copy.

I opted to drop the parent and review data, per my coach's instruction. 

In [3]:
master_df = pd.read_csv('master_final_copy.csv')
#master_df.drop('Unnamed: 0', axis=1, inplace=True)
master_df.fillna('none', inplace=True)

#strip leading numbers ("1. ", "2. ", "3. ") from flavors columns.
#An anchored regex is used because str.strip('1. ') treats its argument as
#a set of characters to trim from both ends, not as a prefix to remove.
for flavor_column in ('flavor_1', 'flavor_2', 'flavor_3'):
    master_df[flavor_column] = (master_df[flavor_column]
                                .str.replace(r'^\s*\d+\.\s*', '', regex=True)
                                .str.strip())


to_drop = ['review_total', 'parent_1', 'parent_2', 
           'parent_3', 'parent_4', 'parent_5', 
           'parent_6', 'parent_7']

master_df.drop(to_drop, axis=1, inplace=True)

In [4]:
def clean(df, column):
    """Delete every double quote, '#' and space from the string column ``column``.

    The column of ``df`` is overwritten in place; returns None.
    """
    for unwanted in ('"', '#', ' '):
        df[column] = df[column].str.replace(unwanted, '')

In [5]:
#clean every numbered attribute column: effect/medical/negative 1-5, flavor 1-3
for prefix, how_many in (('effect', 5), ('medical', 5),
                         ('negative', 5), ('flavor', 3)):
    for position in range(1, how_many + 1):
        clean(master_df, prefix + '_' + str(position))

In [6]:
#number of rows that exactly repeat an earlier row
master_df.duplicated().sum()

24

In [7]:
#remove the exact duplicate rows in place, keeping the first occurrence
master_df.drop_duplicates(inplace=True)

In [8]:
#checking for any duplicates: show every row whose strain name occurs more than once
ids = master_df['strain']
master_df[ids.duplicated(keep=False)]

Unnamed: 0,category,strain,effect_1,effect_2,effect_3,effect_4,effect_5,medical_1,medical_2,medical_3,medical_4,medical_5,negative_1,negative_2,negative_3,negative_4,negative_5,flavor_1,flavor_2,flavor_3


In [9]:
#renumber the rows 0..n-1; the previous index is kept as a new 'index' column
master_df.reset_index(inplace=True)

In [10]:
#discard the 'index' column that holds the old row labels
master_df.drop('index', axis=1, inplace=True)

In [11]:
#number of rows left in the master table
len(master_df)

2621

#### Vectorization and Cosine Similarity

I used Natural Language Processing (NLP) with scikit-learn’s CountVectorizer to convert the attributes into vectors
and then used scikit-learn’s cosine_similarity to build a recommendation engine that takes in a strain and returns the top 5 recommended strains.

In [12]:
#move category column to end
category_column = master_df.pop('category')
master_df['category'] = category_column

#combine all attributes (every column after the first) into one
#comma-separated string per row, ready to be vectorized
attribute_columns = master_df.columns[1:]
master_df['combined'] = master_df[attribute_columns].apply(
    lambda row: ', '.join(row), axis=1)

#create final master for recs csv
master_df.to_csv('master_for_recs.csv', index=False)

#token-count matrix of the combined strings (English stop words removed)
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(master_df['combined'])

#pairwise cosine similarity of every row with every other row
cosine_sim = cosine_similarity(count_matrix)

### Query and Get Recommendations Based on Cosine Similarity

In [13]:
#create colorgrid for recommendation table

#cell value -> CSS colour used as that cell's background
_CELL_COLORS = {
    #effect colors:
    'happy': '#5F9F9F',
    'relaxed': '#C0D9D9',
    'euphoric': '#79CDCD',
    'uplifted': '#66CCCC',
    'creative': '#37FDFC',
    'energetic': '#00CDCD',
    'focused': '#39B7CD',
    'aroused': '#9AC0CD',
    'sleepy': '#0099CC',
    'hungry': '#6996AD',
    'giggly': '#87CEFF',
    'talkative': '#74BBFB',

    #medical colors:
    'stress': '#458B00',
    'depression': '#66CD00',
    'pain': '#9CBA7F',
    'insomnia': '#659D32',
    'fatigue': '#BCED91',
    'headaches': '#CFDBC5',
    'eyepressure': '#567E3A',
    'lackofappetite': '#84BE6A',
    'inflammation': '#93DB70',
    'cramps': '#86C67C',
    'musclespasms': '#63AB62',
    'nausea': '#90EE90',
    'spasticity': '#00CD00',
    'seizures': '#F0FFF0',

    #negative colors:
    'drymouth': '#EED2EE',
    'dryeyes': '#DB70DB',
    'anxious': '#CD00CD',
    'dizzy': '#FF00FF',
    'paranoid': '#B5509C',
    'headache': '#CDB5CD',

    #flavor colors:
    'earthy': '#8B6508',
    'sweet': '#ee918d',
    'citrus': '#FFE700',
    'berry': '#9b4466',
    'diesel': '#696969',
    'lemon': '#FFF44F',
    'pine': '#01796f',
    'blueberry': '#4f86f7',
    'flowery': '#f4bfc7',
    'pungent': '#808080',
    'woody': '#554545',
    'grape': '#6f2da8',
    'spicy/herbal': '#FF0000',
    'skunk': '#808080',
    'cheese': '#FFF8DC',
    'tropical': '#ff8aa1',
    'orange': '#FFA500',
    'pineapple': '#563c0d',
    'strawberry': '#d53032',
    'apple': '#ff0800',   #a single '#': a doubled hash is not a valid CSS colour
    'chemical': '#778899',
    'mango': '#ffcd48',
    'pepper': '#2F4F4F',
    'lavender': '#E6E6FA',
    'coffee': '#6f4e37',
    'mint': '#98ff98',
    'honey': '#a98307',
    'lime': '#00FF00',
    'grapefruit': '#edadaa',
    'vanilla': '#f3e5ab',
    'sage': '#77815c',
    'butter': '#fdf6c5',
    'nutty': '#cd9141',
    'bluecheese': '#87CEEB',
    'tobacco': '#6d5843',
    'plum': '#DDA0DD',
    'pear': '#d1e231',
    'violet': '#9400D3',
    'tar': '#383838',
    'menthol': '#c1f9a2 ',
    'ammonia': '#FFFF33',
    'rose': '#ff007f ',
    'tea': '#832400',
    'peach': '#ffe5b4',
    'apricot': '#fbceb1',
    'chestnut': '#954535',
    'treefruit': '#8db600',

    #type colors:
    'sativa': '#cc5500',
    'indica': '#800080',
    'hybrid': '#758b72',

    #none color:
    'none': 'black',
}


def color(x):
    """Return the CSS ``background-color`` declaration for one table cell.

    Parameters
    ----------
    x : the cell value. Known effect, medical, negative, flavor and type
        names, plus 'none', each map to their own colour.

    Returns
    -------
    str
        'background-color: <colour>'. Any value without an entry in the
        table (strain names, numbers, other objects) gets white.
    """
    #only strings can match an entry; this also keeps unhashable values
    #away from the dictionary lookup
    if not isinstance(x, str):
        return 'background-color: white'
    return 'background-color: ' + _CELL_COLORS.get(x, 'white')

In [14]:
#the rows of master_df and the rows/columns of cosine_sim share the same
#order, so a strain's row position selects its similarity scores

#defining the function that takes in strain 
#as input and returns the top 5 recommended strains
def recommended_strains(strain, cosine_sim = cosine_sim):
    """Return the names of the five strains most similar to ``strain``.

    Parameters
    ----------
    strain : str
        Value to look up in ``master_df['strain']``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row/column ``i`` must correspond to
        the ``i``-th row of ``master_df``.

    Returns
    -------
    list of str
        Strain names ordered from most to least similar (at most five).

    Raises
    ------
    IndexError
        If ``strain`` is not present in ``master_df``.
    """
    # row position (not index label) of the requested strain, because
    # positions are what line up with cosine_sim
    matches = (master_df['strain'] == strain).values.nonzero()[0]
    if len(matches) == 0:
        raise IndexError("strain not found in master_df: {!r}".format(strain))
    idx = matches[0]

    # similarity scores keyed by row position; the strain itself is removed
    # explicitly instead of assuming it sorts first, which a tie at the top
    # score could violate. mergesort keeps the ordering of ties stable.
    score_series = pd.Series(cosine_sim[idx]).drop(idx)
    score_series = score_series.sort_values(ascending = False, kind = 'mergesort')

    # row positions of the 5 most similar strains
    top_5_indexes = list(score_series.iloc[:5].index)

    # positional lookup of the matching strain names
    return master_df['strain'].iloc[top_5_indexes].tolist()

# example query: print the recommendations for 'blue dream'
print(recommended_strains('blue dream'))

# build table with inputted strain and 5 most similar

def build_comp_table_vec(df, strain):
    """Stack the rows of ``strain`` and of its recommended strains.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'strain' column to select rows from.
    strain : str
        Strain to compare against; passed to ``recommended_strains``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` matching ``strain`` first, followed by the rows
        matching each of its (up to five) recommendations, in order.
    """
    # the recommendations are computed once and reused for every lookup
    names = [strain] + list(recommended_strains(strain)[:5])
    frames = [df[df['strain'] == name] for name in names]

    # pd.concat replaces DataFrame.append, which newer pandas no longer provides
    table = pd.concat(frames)

    return table

# comparison table for 'blue dream' without its last column, displayed
# with every cell coloured by color()
top_matches = build_comp_table_vec(master_df, 'blue dream').iloc[:, :-1]
top_matches.style.applymap(color)

['cherry skunk', 'elvis', 'blueberry headband', 'white berry', 'bruce banner 3']


Unnamed: 0,strain,effect_1,effect_2,effect_3,effect_4,effect_5,medical_1,medical_2,medical_3,medical_4,medical_5,negative_1,negative_2,negative_3,negative_4,negative_5,flavor_1,flavor_2,flavor_3,category
309,blue dream,happy,relaxed,euphoric,uplifted,creative,stress,depression,pain,headaches,fatigue,drymouth,dryeyes,anxious,dizzy,paranoid,blueberry,sweet,berry,hybrid
563,cherry skunk,happy,relaxed,euphoric,creative,uplifted,pain,stress,depression,fatigue,headaches,drymouth,dryeyes,dizzy,anxious,paranoid,sweet,berry,earthy,hybrid
848,elvis,uplifted,relaxed,euphoric,happy,creative,stress,pain,depression,fatigue,headaches,anxious,dizzy,drymouth,paranoid,dryeyes,skunk,sweet,earthy,hybrid
370,blueberry headband,relaxed,happy,euphoric,uplifted,energetic,stress,pain,depression,lackofappetite,headaches,drymouth,dryeyes,dizzy,anxious,headache,blueberry,sweet,berry,hybrid
2523,white berry,relaxed,happy,sleepy,creative,euphoric,stress,pain,depression,lackofappetite,fatigue,drymouth,dryeyes,dizzy,paranoid,anxious,berry,sweet,blueberry,indica
419,bruce banner 3,happy,relaxed,euphoric,creative,uplifted,stress,depression,pain,fatigue,cramps,drymouth,dryeyes,anxious,dizzy,paranoid,sweet,diesel,earthy,hybrid


### Additional Data Retrieval

When scraping for parental lineage, I missed some parents. I needed to rescrape 
to get the additional information, and in the process my IP got blocked.
I moved to working in smaller scrape batches. I then needed to append each of my smaller
batches into a master table.

(Since I was going to be scraping again, I also decided to scrape for the review counts.)

I didn't even end up using any of this data.

In [None]:
urls = pd.read_csv('urls.csv')
urls_list = urls['url']

#batch boundaries: each consecutive pair is one slice of urls_list,
#and the final None means "through the end"
batch_edges = [0, 156, 312, 468, 624, 780, 936, 1092, 1248, 1404,
               1560, 2060, 2560, 3060, None]

(urls_to_156,            #parents1.csv
 urls_156_to_312,        #parents2.csv
 urls_312_to_468,        #parents3.csv
 urls_468_to_624,        #parents4.csv
 urls_624_to_780,        #parents5.csv
 urls_780_to_936,        #parents6.csv
 urls_936_to_1092,       #parents7.csv
 urls_1092_to_1248,      #parents8.csv
 urls_1248_to_1404,      #parents9.csv
 urls_1404_to_1560,      #parents10.csv
 urls_1560_to_2060,      #parents11.csv
 urls_2060_to_2560,      #parents12.csv
 urls_2560_to_3060,      #parents13.csv
 urls_3060_to_end,       #parents14.csv
 ) = [urls_list[start:stop]
      for start, stop in zip(batch_edges, batch_edges[1:])]

In [None]:
#scrape for review totals and parents

parents = []   # one entry per url: that page's "strain-tile-footer" divs
reviews = []   # one [strain, review_count] pair per url

# onclick attribute that identifies the "View all Reviews" link on a strain page
review_link_onclick = "window.strainHelpers.trackEvent('User Interactions', 'Strain Details', 'View all Reviews')"

for url in tqdm(urls_list):
    html = requests.get(url).text
    soup = BeautifulSoup(html,'html.parser')

    # The strain identifier is the third '/'-separated piece of the active
    # link's href. It is reset for every page so that a page without that link
    # records None instead of silently reusing the previous page's strain.
    strain_slug = None
    active_link = soup.find("a",{"class":"active"})
    if active_link is not None:
        href_parts = (active_link.get('href') or '').split('/')
        if len(href_parts) > 2:
            strain_slug = href_parts[2]

    # findAll returns an empty result when nothing matches, so every url
    # contributes exactly one entry and parents stays aligned with urls_list.
    parents.append(soup.findAll('div',attrs={"class" : "strain-tile-footer"}))

    # The review count is the text between the parentheses of the link label;
    # it stays None when the page has no such link.
    review = None
    review_link = soup.find('a',attrs={"onclick" : review_link_onclick})
    if review_link is not None:
        review_total = review_link.text
        review = review_total[review_total.find("(")+1:review_total.find(")")]

    reviews.append([strain_slug, review])

    # Pause between requests. This must sit inside the loop to throttle the scrape.
    time.sleep(1)

In [None]:
#scrape for parents
def scrape_parent_tiles(batch_urls, delay_seconds=2):
    """Fetch each url and collect its "strain-tile-footer" divs.

    Parameters
    ----------
    batch_urls : iterable of str
        Page urls to request, in order.
    delay_seconds : int or float, default 2
        Pause after every request.

    Returns
    -------
    list
        One entry per url, holding the result of findAll for that page
        (empty when the page has no matching div).
    """
    rows = []
    for url in tqdm(batch_urls):
        html = requests.get(url).text
        soup = BeautifulSoup(html,'html.parser')
        rows.append(soup.findAll('div',attrs={"class" : "strain-tile-footer"}))
        time.sleep(delay_seconds)
    return rows


# To scrape another batch, change the url slice and the output names below.
parents14_ = scrape_parent_tiles(urls_3060_to_end)

parents14 = pd.DataFrame(parents14_)
parents14.to_csv('parents14.csv')

In [None]:
#create review df and save to csv
# `reviews` holds two-element lists, so the frame gets positional columns 0 and 1.
review_totals = pd.DataFrame(reviews)
# Written with the default index, which comes back as an extra column on re-read.
review_totals.to_csv('review_totals.csv')

In [None]:
#create parent df and save to csv
# One row per entry of `parents`; columns are positional (0, 1, ...).
parents_final = pd.DataFrame(parents)
# Written with the default index, which comes back as an extra column on re-read.
parents_final.to_csv('parents_final.csv')

In [None]:
# Give the two positional columns of the review table readable names.
review_column_names = {0: 'strain', 1: 'review_total'}
review_totals.rename(columns=review_column_names, inplace=True)

In [None]:
#save both tables as csv, then read them back into the notebook
# to_csv writes the index as an unnamed first column, so each save/read
# round trip adds an 'Unnamed: 0'-style column to the frame that is read back.
review_totals.to_csv('review_totals.csv')
parents_final.to_csv('parents_final.csv')

review_totals = pd.read_csv('review_totals.csv')
parents_final = pd.read_csv('parents_final.csv')

In [None]:
#the review table is called total_reviews from here on
# Re-reads the same file as review_totals rather than copying the frame.
total_reviews = pd.read_csv('review_totals.csv')

In [None]:
#minor cleanup
# Remove the index columns left behind by csv round trips. How many exist
# depends on how often the file was saved and re-read ('Unnamed: 0.1' only
# appears after a second round trip), so missing ones are ignored.
total_reviews.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore', inplace=True)

In [None]:
# Swap the hyphens in the strain column for spaces.
strain_slugs = total_reviews['strain']
total_reviews['strain'] = strain_slugs.str.replace('-', ' ')

In [None]:
#save the cleaned review table to csv
# The default index is written as well, as an unnamed first column.
total_reviews.to_csv('total_reviews.csv')

In [None]:
# Reload the saved parent table. This rebinds `parents`, which the scraping
# cell above used as a plain list, to a DataFrame.
parents = pd.read_csv('parents_final.csv')

In [None]:
# Load the saved final table.
# NOTE(review): the later cells in this section operate on `final_df`, not
# `final` -- confirm whether this name is still needed.
final = pd.read_csv('final_df.csv')

In [None]:
# Load the csvs produced by the small-batch scrapes (parents1.csv through
# parents14.csv); they are appended into a master table afterwards.
(
    parent1a, parent2a, parent3a, parent4a, parent5a, parent6a, parent7a,
    parent8a, parent9a, parent10a, parent11a, parent12a, parent13a, parent14a,
) = [pd.read_csv('parents{}.csv'.format(batch_no)) for batch_no in range(1, 15)]

In [None]:
#parent list to append to master dataframe
append_list = [parent2a, parent3a, parent4a, parent5a, parent6a, parent7a,
              parent8a, parent9a, parent10a, parent11a, parent12a, parent13a,
              parent14a]

#create master parent df
# pd.concat stacks the batches in order with a fresh 0..n-1 index. It replaces
# DataFrame.append, which is deprecated and removed in pandas 2.0.
master_df = pd.concat([parent1a] + append_list, ignore_index=True)

In [None]:
#save to csv
# master_df.to_csv('master_df.csv')

In [None]:
#read in master parent df
# Relies on master_df.csv already being on disk; the cell that would write it
# is commented out above.
master_df = pd.read_csv('master_df.csv')

In [None]:
#extract parent strain name from html
def split_string(string):
    """Return the text inside every "strain-tile-footer" div found in `string`.

    Parameters
    ----------
    string : str
        Markup to search. Non-string values (e.g. NaN from an empty csv cell)
        are tolerated.

    Returns
    -------
    list of str
        One item per matching div, empty when nothing matches.
    str
        The placeholder 'n' when `string` is not a string.
    """
    try:
        # Non-greedy, so several footers in one string yield separate matches
        # instead of one match spanning all of them.
        return re.findall('<div class="strain-tile-footer">(.+?)</div>', string)
    except TypeError:
        # re.findall raises TypeError for non-string input.
        return 'n'

In [None]:
#clean each column in df
# Parse the tile-footer markup held in column '0' and store the result in column '1'.
master_df['1'] = master_df['0'].apply(split_string)
# DataFrame.replace returns a new frame; the result has to be assigned back,
# otherwise the call has no effect.
master_df = master_df.replace('N, o, n, e', 'n')
# Collapse each list of matches (or the 'n' placeholder) into a single string.
master_df['1'] = master_df['1'].apply(', '.join)
master_df['1'] = master_df['1'].str.strip('$')
master_df['1'] = master_df['1'].str.lower()

In [None]:
def format_values(df, column):
    """Clean one column of raw tile-footer markup in place.

    The column is parsed with split_string, joined into a comma-separated
    string, stripped of leading/trailing '$', lower-cased, and has the
    characters ' . and # removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; `df[column]` is overwritten.
    column : str
        Name of the column to clean.

    Returns
    -------
    None

    Notes
    -----
    Not idempotent: a second call on an already cleaned column finds no markup
    and leaves empty strings.
    """
    cleaned = df[column].apply(split_string).apply(', '.join)
    cleaned = cleaned.str.strip('$').str.lower()
    # regex=False makes the removals literal on every pandas version. Without
    # it, "." may be read as the regex wildcard and wipe the whole value.
    for unwanted in ("'", ".", "#"):
        cleaned = cleaned.str.replace(unwanted, '', regex=False)
    df[column] = cleaned

In [None]:
# NOTE(review): the calls for columns '0'-'6' are commented out -- presumably
# they were run one at a time in earlier executions. Confirm before re-enabling
# them: format_values re-parses the column, so running it on an already cleaned
# column leaves empty strings.
# format_values(master_df, '0')
# format_values(master_df, '1')
# format_values(master_df, '2')
# format_values(master_df, '3')
# format_values(master_df, '4')
# format_values(master_df, '5')
# format_values(master_df, '6')
format_values(master_df, '7')

In [None]:
# Name the columns: '0' is the strain, '1'..'7' become parent_1..parent_7.
parent_column_names = {'0': 'strain'}
for position in range(1, 8):
    parent_column_names[str(position)] = 'parent_' + str(position)

master_df.rename(columns=parent_column_names, inplace=True)

In [None]:
# Discard the leftover 'Unnamed: 0.1' column.
master_df.drop(columns='Unnamed: 0.1', inplace=True)

In [None]:
#save to csv
# master_df.to_csv('master_parent.csv')

In [None]:
# Outer-join the review totals onto the final table. No key is given, so
# pandas joins on the columns the two frames have in common.
final_df = final_df.merge(total_reviews, how='outer')

In [None]:
# Discard the leftover 'Unnamed: 0' column.
final_df.drop(columns='Unnamed: 0', inplace=True)

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each.
final_df.drop_duplicates(subset=None, keep='first', inplace=True)

In [None]:
#save final df to csv
# final_df.to_csv('final_df.csv')

In [None]:
#read in final df
# Loads whatever final_df.csv is on disk; the cell that would write it is
# commented out above, so the in-memory final_df is replaced by the file contents.
final_df = pd.read_csv('final_df.csv')

In [None]:
# Remove the old parent columns, one at a time and in this order.
for stale_column in ('parent_1', 'parent_2'):
    final_df.drop(columns=stale_column, inplace=True)

In [None]:
#read in master parent df to merge with final df
# Relies on master_parent.csv already being on disk; the cell that would write
# it is commented out above.
master_parent = pd.read_csv('master_parent.csv')

In [None]:
# Join the parent table onto the final table by strain (default inner join).
final_df = final_df.merge(master_parent, on='strain')

In [None]:
# Remove the suffixed index columns produced by the merge, then drop
# duplicated rows.
for leftover_column in ('Unnamed: 0_y', 'Unnamed: 0_x'):
    final_df.drop(columns=leftover_column, inplace=True)

final_df.drop_duplicates(inplace=True)

In [None]:
#save final df to csv
# final_df.to_csv('master_final.csv')