# Cosine Similarity Model

#### Alternatve model for medicine cabinet project using Cosine Similarity to predict cannabis strain recommendations based on user input.

In [43]:
# Imports

import pandas as pd
import numpy as np

In [44]:
# Loading preprocessed data - Courtesy Curdt Million

df = pd.read_csv("lemmatized_strains.csv")

In [45]:
df.head()

Unnamed: 0.1,Unnamed: 0,Strain,Type,Effects,Flavor,Description,name,ailment,all_text,lemmas
0,0,100-Og,hybrid,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...,,,"100-Og hybrid Creative,Energetic,Tingly,Euphor...",100-Og hybrid Creative energetic Tingly Euphor...
1,1,98-White-Widow,hybrid,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...,,,"98-White-Widow hybrid Relaxed,Aroused,Creative...",98-white widow hybrid Relaxed arouse Creative ...
2,2,1024,sativa,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...,1024.0,"Stress, Pain, Depression, Inflammation","1024 sativa Uplifted,Happy,Relaxed,Energetic,C...",1024 sativa Uplifted happy relaxed energetic C...
3,3,13-Dawgs,hybrid,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...,,,"13-Dawgs hybrid Tingly,Creative,Hungry,Relaxed...",13-dawgs hybrid Tingly creative hungry Relaxed...
4,4,24K-Gold,hybrid,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%...",,,"24K-Gold hybrid Happy,Relaxed,Euphoric,Uplifte...",24k gold hybrid Happy Relaxed Euphoric Uplifte...


In [46]:
# Dropping unnamed columns & resetting index

df = df.drop('Unnamed: 0', axis=1).reset_index()

In [47]:
df.head()

Unnamed: 0,index,Strain,Type,Effects,Flavor,Description,name,ailment,all_text,lemmas
0,0,100-Og,hybrid,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...,,,"100-Og hybrid Creative,Energetic,Tingly,Euphor...",100-Og hybrid Creative energetic Tingly Euphor...
1,1,98-White-Widow,hybrid,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...,,,"98-White-Widow hybrid Relaxed,Aroused,Creative...",98-white widow hybrid Relaxed arouse Creative ...
2,2,1024,sativa,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...,1024.0,"Stress, Pain, Depression, Inflammation","1024 sativa Uplifted,Happy,Relaxed,Energetic,C...",1024 sativa Uplifted happy relaxed energetic C...
3,3,13-Dawgs,hybrid,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...,,,"13-Dawgs hybrid Tingly,Creative,Hungry,Relaxed...",13-dawgs hybrid Tingly creative hungry Relaxed...
4,4,24K-Gold,hybrid,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%...",,,"24K-Gold hybrid Happy,Relaxed,Euphoric,Uplifte...",24k gold hybrid Happy Relaxed Euphoric Uplifte...


## Vectorizing Words with TF/IDF Vectorizer to Create Document Term Matrix (DTM)

In [48]:
# Instantiating TfidfVectorizer

tfidf = TfidfVectorizer(max_features=500)

In [49]:
# Creating DTM

dtm = tfidf.fit_transform(df['lemmas'])

In [50]:
# Loading DTM to DataFrame

dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

In [51]:
dtm.head()

Unnamed: 0,10,11,1st,20,2014,47,50,60,70,80,...,west,white,widow,win,wonder,woody,work,world,wreck,yield
0,0.0,0.0,0.0,0.0,0.0,0.0,0.43701,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.119256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.42906,0.546006,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.143672,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.167912,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13948,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
dtm.shape

(2280, 500)

## Applying Cosine Similarity

In [53]:
# Calculating distance of vectors

from sklearn.metrics.pairwise import cosine_similarity

dist_matrix  = cosine_similarity(dtm)

In [54]:
dist_matrix.shape

(2280, 2280)

In [55]:
dist_matrix

array([[1.        , 0.05662412, 0.10575905, ..., 0.03010507, 0.22571093,
        0.03071383],
       [0.05662412, 1.        , 0.07116044, ..., 0.08763689, 0.03961352,
        0.0304609 ],
       [0.10575905, 0.07116044, 1.        , ..., 0.12922749, 0.03859133,
        0.05931638],
       ...,
       [0.03010507, 0.08763689, 0.12922749, ..., 1.        , 0.13990653,
        0.05551409],
       [0.22571093, 0.03961352, 0.03859133, ..., 0.13990653, 1.        ,
        0.09627927],
       [0.03071383, 0.0304609 , 0.05931638, ..., 0.05551409, 0.09627927,
        1.        ]])

In [56]:
# Incorporating distance matrix into dataframe

df_cos = pd.DataFrame(dist_matrix)

In [57]:
df_cos.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2270,2271,2272,2273,2274,2275,2276,2277,2278,2279
0,1.0,0.056624,0.105759,0.165992,0.148234,0.288633,0.147178,0.161991,0.045953,0.149626,...,0.163225,0.08953,0.117239,0.09368,0.067265,0.311008,0.093477,0.030105,0.225711,0.030714
1,0.056624,1.0,0.07116,0.070885,0.088638,0.024583,0.073783,0.117855,0.040816,0.02304,...,0.113528,0.045887,0.011955,0.041245,0.031692,0.023827,0.049292,0.087637,0.039614,0.030461
2,0.105759,0.07116,1.0,0.235975,0.080283,0.018719,0.205385,0.08546,0.163001,0.094368,...,0.138462,0.124053,0.171028,0.164401,0.30712,0.123086,0.165001,0.129227,0.038591,0.059316
3,0.165992,0.070885,0.235975,1.0,0.110496,0.106293,0.111014,0.224242,0.20607,0.148101,...,0.203264,0.155834,0.144307,0.170092,0.110797,0.112823,0.214803,0.094309,0.087834,0.137989
4,0.148234,0.088638,0.080283,0.110496,1.0,0.074858,0.06499,0.06582,0.082988,0.103058,...,0.086995,0.113628,0.077833,0.092577,0.051688,0.041197,0.047969,0.170361,0.057181,0.038856


In [58]:
df_cos.shape

(2280, 2280)

In [59]:
df.lemmas[0]

'100-Og hybrid Creative energetic Tingly Euphoric Relaxed Earthy Sweet Citrus $ 100 og 50/50 hybrid strain pack strong punch supposedly refer strength high price start show Hollywood plant $ 100 og tend produce large dark green bud stem user report strong body effect indica pain relief alert cerebral feeling thank sativa  '

In [207]:
df_cos[df_cos[4]<1][0]

# .sort_values(ascending=False)

0       1.000000
1       0.056624
2       0.105759
3       0.165992
5       0.288633
          ...   
2275    0.311008
2276    0.093477
2277    0.030105
2278    0.225711
2279    0.030714
Name: 0, Length: 2279, dtype: float64

In [203]:
df_cos[df_cos[4] < 1 ][0].sort_values(ascending=False)[1:6]

2137    0.487445
670     0.458379
1384    0.445955
1837    0.429331
1361    0.398407
Name: 0, dtype: float64

In [148]:
# Top 5 Similar Strains

df_cos[0][1:6]

1    0.056624
2    0.105759
3    0.165992
4    0.148234
5    0.288633
Name: 0, dtype: float64

In [149]:
# Top 5 similar strains to strain index_no. 0 (100-OG)

OG100_top_5 = df_cos[df_cos[0] < 1][0].sort_values(ascending=False)[:5]

In [150]:
OG100_top_5

2137    0.487445
670     0.458379
1384    0.445955
1837    0.429331
1361    0.398407
Name: 0, dtype: float64

In [121]:
type(OG100_top_5)

pandas.core.series.Series

In [171]:
# Converting to dict

OG100_top_5 = dict(OG100_top_5)

In [172]:
type(OG100_top_5)

dict

In [173]:
OG100_top_5

{2137: 0.4874450768727173,
 670: 0.45837886308913506,
 1384: 0.445954630280887,
 1837: 0.42933082079619833,
 1361: 0.3984066125237562}

In [176]:
OG100_top_5

{2137: 0.4874450768727173,
 670: 0.45837886308913506,
 1384: 0.445954630280887,
 1837: 0.42933082079619833,
 1361: 0.3984066125237562}

In [193]:
OG100_top_5.keys()

dict_keys([2137, 670, 1384, 1837, 1361])

In [124]:
# Top 5 indexes in DataFrame

df.iloc[[i for i in OG100_top_5], 1:6]

Unnamed: 0,Strain,Type,Effects,Flavor,Description
2137,Tuna-Og,hybrid,,,Tuna OG (or OG Tuna Kush) is a 50/50 hybrid cr...
670,Donna-Og,hybrid,"Creative,Euphoric,Relaxed,Happy,Hungry","Citrus,Pungent,Sweet",Donna OG is a 50/50 hybrid cross between two O...
1384,Monster-Og,hybrid,"Hungry,Relaxed,Happy,Tingly,Sleepy","Woody,Spicy/Herbal,Earthy",Monster OG from The Green Element is a hybrid ...
1837,Skywalker-Og,hybrid,"Relaxed,Euphoric,Happy,Uplifted,Sleepy","Earthy,Pungent,Sweet",Skywalker OG is a potent indica-dominant hybri...
1361,Merlot-Og,indica,"Relaxed,Happy,Uplifted,Euphoric,Tingly","Butter,Flowery,Earthy",SoCal cannabis breeders Ocean Grown Seeds orig...


## Creating Function to Calculate Closest Related Strains to Each Corresponding Strain

In [167]:
# Creating Lookup Table DataFrame

df_names = df[['index', 'Strain']]

In [168]:
df_names.head()

Unnamed: 0,index,Strain
0,0,100-Og
1,1,98-White-Widow
2,2,1024
3,3,13-Dawgs
4,4,24K-Gold


In [208]:
def cos_sim(x):
    """Returns top 5 similar strain indexes corresponding to input index"""
    top_5 = dict(df_cos[df_cos[x] < 1][x].sort_values(ascending=False)[:5])
    
    return top_5.keys()


## Creating New Column in DataFrame for Strains with High Cosine Similarities

In [209]:
df_strains = df.copy()

In [210]:
df_strains.head()

Unnamed: 0,index,Strain,Type,Effects,Flavor,Description,name,ailment,all_text,lemmas
0,0,100-Og,hybrid,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...,,,"100-Og hybrid Creative,Energetic,Tingly,Euphor...",100-Og hybrid Creative energetic Tingly Euphor...
1,1,98-White-Widow,hybrid,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...,,,"98-White-Widow hybrid Relaxed,Aroused,Creative...",98-white widow hybrid Relaxed arouse Creative ...
2,2,1024,sativa,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...,1024.0,"Stress, Pain, Depression, Inflammation","1024 sativa Uplifted,Happy,Relaxed,Energetic,C...",1024 sativa Uplifted happy relaxed energetic C...
3,3,13-Dawgs,hybrid,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...,,,"13-Dawgs hybrid Tingly,Creative,Hungry,Relaxed...",13-dawgs hybrid Tingly creative hungry Relaxed...
4,4,24K-Gold,hybrid,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%...",,,"24K-Gold hybrid Happy,Relaxed,Euphoric,Uplifte...",24k gold hybrid Happy Relaxed Euphoric Uplifte...


In [211]:
df_strains.columns

Index(['index', 'Strain', 'Type', 'Effects', 'Flavor', 'Description', 'name',
       'ailment', 'all_text', 'lemmas'],
      dtype='object')

In [None]:
df_strains['Cos Sim Strains'] = [cos_sim(i) for i in df['index']]

In [192]:
df_strains.sample(5)

Unnamed: 0,index,Strain,Type,Effects,Flavor,Description,name,ailment,all_text,lemmas,Cos Sim Strains
1308,1308,Mammoth,hybrid,"Happy,Relaxed,Focused,Uplifted,Creative","Honey,Earthy,Woody",This hybrid from The Bank Cannabis Genetics is...,,,"Mammoth hybrid Happy,Relaxed,Focused,Uplifted,...",mammoth hybrid Happy Relaxed Focused Uplifted ...,"(0, 2137, 670, 1384, 1837)"
834,834,Garlic-Bud,indica,"Relaxed,Happy,Creative,Uplifted,Tingly","Earthy,Woody,Pepper",Garlic Bud is a classic indica strain from the...,,,"Garlic-Bud indica Relaxed,Happy,Creative,Uplif...",Garlic Bud indica Relaxed happy Creative Uplif...,"(0, 2137, 670, 1384, 1837)"
98,98,Alpine-Star,indica,"Relaxed,Uplifted,Happy,Euphoric,Tingly","Pine,Citrus,Earthy","Alpine Star, often labeled as Alpine OG, is an...",,,"Alpine-Star indica Relaxed,Uplifted,Happy,Euph...",Alpine Star indica Relaxed Uplifted happy Euph...,"(0, 2137, 670, 1384, 1837)"
1812,1812,Silver-Kush,sativa,"Happy,Relaxed,Giggly,Uplifted,Focused","Earthy,Chemical,Citrus",Silver Kush is a sativa-dominant mix of Silver...,,,"Silver-Kush sativa Happy,Relaxed,Giggly,Uplift...",silver Kush sativa Happy Relaxed giggly Uplift...,"(0, 2137, 670, 1384, 1837)"
744,744,Elephant,sativa,"Sleepy,Happy,Euphoric,Focused,Aroused","Sweet,Earthy,Tropical",Elephant is an old school bud with a lineage l...,,,"Elephant sativa Sleepy,Happy,Euphoric,Focused,...",Elephant sativa Sleepy happy Euphoric Focused ...,"(0, 2137, 670, 1384, 1837)"


## Creating Test DataBase in MongoDB

## Testing User Input on Cosine Similarity Model

In [133]:
import spacy.cli
spacy.cli.download("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [134]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [135]:
# Function for Tokenizing User Input

def tokenize(document):
    doc = nlp(document)
    return [token.lemma_.strip() for token in doc if (token.is_stop != True) and (token.is_punct != True)]

In [136]:
user_input_raw = "uplifting strain for depression"

In [137]:
# Tokenizing User Input

user_input = tokenize(user_input_raw)

In [138]:
# Converting user input into vectors for DTM

dtm_ui = tfidf.fit_transform(user_input)

In [141]:
dtm_ui = pd.DataFrame(dtm_ui.todense(), columns=tfidf.get_feature_names())

In [142]:
dtm_ui

Unnamed: 0,depression,strain,uplifting
0,0.0,0.0,1.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0


SyntaxError: invalid syntax (<ipython-input-147-40cce11ae824>, line 1)