<a href="https://colab.research.google.com/github/RMDircio/DS/blob/master/machine_learning/bw_MedCabinet_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Nearest Neighbors model for prediction the top four marijuana strains for a consumer. 

This model structure was build from the great work done by Amy Beisel and Jessica Kimbril when they first worked on this project. 

In [1]:
# cleaning
import pandas as pd

# tokenizing
!python -m spacy download en_core_web_lg
import spacy
from spacy.tokenizer import Tokenizer
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

# pickeling
import pickle

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


### Need to restart the Runtime here and rerun the imports cell above. 

# Cleaning Cannabis CSV

In [2]:
# reading data
URL = 'https://raw.githubusercontent.com/Build-Week-Med4/DS/master/machine_learning/cannabis.csv'
strains_df = pd.read_csv(URL, na_values="None")
strains_df.columns= strains_df.columns.str.lower()

In [3]:
# shape and first look at df
print(strains_df.shape)
strains_df.head()

(2351, 6)


Unnamed: 0,strain,type,rating,effects,flavor,description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [4]:
# locate all the NaNs
strains_df.loc[strains_df.isnull().any(axis=1)]

Unnamed: 0,strain,type,rating,effects,flavor,description
5,3-Bears-Og,indica,0.0,,,3 Bears OG by Mephisto Genetics is an autoflow...
16,831-Og,indica,5.0,,,"831 OG, aka Santa Cruz OG, is an indica-domina..."
32,Acid-Dough,sativa,5.0,"Talkative,Giggly,Happy,Hungry,Relaxed",,Acid Dough by Ripper Seeds is a sativa-dominan...
54,Afrikaner,sativa,0.0,,,Afrikaner by Cannabaal and master grower Waldo...
60,Als-Dream,hybrid,3.0,"Talkative,Energetic",,Al’s Dream is rapidly becoming a Colorado cann...
...,...,...,...,...,...,...
2298,White-Ryder,hybrid,0.0,,,White Ryder is Riot Seeds’ attempt to create a...
2309,White-Walrus,hybrid,2.5,"Talkative,Uplifted,Creative,Energetic,Euphoric",,"White Walrus is a phenotype of Chernobyl, grow..."
2321,Wonderwoman-Og,indica,4.0,"Relaxed,Tingly,Euphoric,Aroused,Happy","Flowery,Menthol,Citrus",
2327,X-File,hybrid,4.5,"Tingly,Euphoric,Relaxed",,Not to be confused with the similar-sounding X...


In [5]:
# drop the NaNs
strains_df = strains_df.dropna().reset_index()
print(strains_df.shape)
strains_df.head()

(2163, 7)


Unnamed: 0,index,strain,type,rating,effects,flavor,description
0,0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


# Combining Features
`effects` + `flavors` + `description`
<br>By combining these three features, we can predict with full sentence inputs or a set of statements. 

In [6]:
# make combined feature
strains_df['combined'] = strains_df['effects'] + \
                          strains_df['flavor'] + \
                          strains_df['description']

In [7]:
# 7th row in combined column
# testing new column
strains_df['combined'][6]

'Relaxed,Happy,Euphoric,Uplifted,GigglyCitrus,Pungent,EarthyThe indica-dominant 303 OG is a Colorado strain bred by crossing Pre-98 Bubba Kush and Chemdawg. Its parent strains pass on a complex flavor profile of coffee and spice with diesel undertones. Its buzz can be described as social and creative, a dreamy experience perfect for letting off steam at the end of the work day. The happy euphoria induced by 303 OG may help patients treating depression, anxiety, and stress disorders.'

In [8]:
# drop extra index column
strains_df = strains_df.drop(['index'], axis=1)
strains_df.head()

Unnamed: 0,strain,type,rating,effects,flavor,description,combined
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...,"Creative,Energetic,Tingly,Euphoric,RelaxedEart..."
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...,"Relaxed,Aroused,Creative,Happy,EnergeticFlower..."
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...,"Uplifted,Happy,Relaxed,Energetic,CreativeSpicy..."
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...,"Tingly,Creative,Hungry,Relaxed,UpliftedApricot..."
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%...","Happy,Relaxed,Euphoric,Uplifted,TalkativeCitru..."


In [9]:
# test effects column
strains_df['effects'][7]

'Uplifted,Focused,Happy,Talkative,Relaxed'

# Tokenize the Combined Column

In [10]:
# import spacy
# from spacy.tokenizer import Tokenizer
# from sklearn.neighbors import NearestNeighbors
# from sklearn.feature_extraction.text import TfidfVectorizer

# load neural network
nlp = spacy.load("en_core_web_lg")

# tokenizer
tokenizer = Tokenizer(nlp.vocab)

# Make the tokens form description
combined_tokens = []
for txt in tokenizer.pipe(strains_df['combined'], batch_size=500):
    txt_tokens = [token.text for token in txt]
    combined_tokens.append(txt_tokens)
strains_df['combined_tokens'] = combined_tokens
print(strains_df['combined_tokens'].head())

0    [Creative,Energetic,Tingly,Euphoric,RelaxedEar...
1    [Relaxed,Aroused,Creative,Happy,EnergeticFlowe...
2    [Uplifted,Happy,Relaxed,Energetic,CreativeSpic...
3    [Tingly,Creative,Hungry,Relaxed,UpliftedAprico...
4    [Happy,Relaxed,Euphoric,Uplifted,TalkativeCitr...
Name: combined_tokens, dtype: object


In [11]:
def tokenize(document):
    
    doc = nlp(document)
    
    return [token.lemma_.strip() for token in doc if (token.is_stop != True) \
            and (token.is_punct != True)]

In [12]:
# Instantiate vectorizer object
tfidf = TfidfVectorizer(stop_words = 'english',
                       ngram_range = (1,2),
                       max_features = 2000)

# Create a vocabulary and tf-idf score per document
dtm = tfidf.fit_transform(strains_df['combined'])
                         

# Get feature names to use as dataframe column headers
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())
nn = NearestNeighbors(n_neighbors=4, algorithm='kd_tree')
nn.fit(dtm)


# View Feature Matrix as DataFrame
print(dtm.shape)
dtm.head()

(2163, 2000)


Unnamed: 0,10,10 weeks,11,11 weeks,12,13,14,15,18,1980s,1st,1st hawaiian,1st place,20,20 indica,20 thc,2012,2014,2014 cannabis,2015,2016,22,23,24,25,2nd,2nd place,30,30 indica,3rd,3rd place,40,40 indica,40 sativa,45,47,50,50 50,50 hybrid,55,...,walker,want,warm,washington,washington state,way,way cross,week,week flowering,weeks,weight,weighted,west,west coast,whimsical,white,white alien,white og,white widow,wide,widow,winner,winning,won,won 1st,wonder,woody,woody spicy,work,works,world,worth,wowie,wrapped,wreck,years,yield,yielding,yields,zesty
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.286626,0.15769,0.162149,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.101565,0.132918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.291018,0.0,0.0,0.371532,0.0,0.369437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.138621,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.35166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12036,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133172,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.131099,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# the user will give a sentance of desired effects
user_input_effects = ['I would like to feel happy']

In [14]:
# Query the ideal descprition
output_results = tfidf.transform(user_input_effects)
output_results

<1x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [15]:
# get the model predictions array
nn.kneighbors(output_results.todense())

(array([[1.19841614, 1.2133686 , 1.22489587, 1.24692505]]),
 array([[ 440, 2154, 1630, 1177]]))

In [36]:
# top four predictions

'''
would like to turn this all into a for loop

def prediction_results(first,second,thrid,fourth):
  take the predictions via the parameters and prints
  out the strain and the discription
'''

wrapper = textwrap.TextWrapper(width = 75)

# first strain
first_name = strains_df['strain'][440]
first_review = strains_df['description'][440]
first_review = wrapper.fill(text=first_review)
print(colored('Strain Name: ','green'),first_name + '\n', 
      colored(first_review, 'grey'))
print()

# second strain
second_name = strains_df['strain'][2154]
second_review = strains_df['description'][2154]
second_review = wrapper.fill(text=first_review)
print(colored('Strain Name: ','green'),second_name + '\n',
      colored(second_review, 'grey'))
print()

# third strain
third_name = strains_df['strain'][1630]
third_review = strains_df['description'][1630]
third_review = wrapper.fill(text=first_review)
print(colored('Strain Name: ','green'),third_name + '\n',
      colored(third_review, 'grey'))
print()

# fourth strain
fourth_name = strains_df['strain'][1177]
fourth_review = strains_df['description'][1177]
fourth_review = wrapper.fill(text=first_review)
print(colored('Strain Name: ','green'),fourth_name + '\n',
      colored(fourth_review, 'grey'))

[32mStrain Name: [0m Cheese-Quake
 [30mCheese Quake by TGA Subcool Seeds is a 60/40 indica-dominant cross
between Cheese and Querkle.  A hint of grape is detectable, but its aroma
is led by the funky sour aroma commonly associated with Cheese hybrids.
Expect to feel tingy invigoration followed by uplifting body sensations to
help you feel relaxed and carefree.[0m

[32mStrain Name: [0m Yummy
 [30mCheese Quake by TGA Subcool Seeds is a 60/40 indica-dominant cross
between Cheese and Querkle.  A hint of grape is detectable, but its aroma
is led by the funky sour aroma commonly associated with Cheese hybrids.
Expect to feel tingy invigoration followed by uplifting body sensations to
help you feel relaxed and carefree.[0m

[32mStrain Name: [0m Redwood-Kush
 [30mCheese Quake by TGA Subcool Seeds is a 60/40 indica-dominant cross
between Cheese and Querkle.  A hint of grape is detectable, but its aroma
is led by the funky sour aroma commonly associated with Cheese hybrids.
Expect to 

# Pickle Time

In [25]:
# import pickle
# Dump the trained classifier (nn)  with Pickle
pickle_filename = 'model.pkl2'
pickled_model = open(pickle_filename, 'wb')  # Open the file to save as pkl file
pickle.dump(nn, pickled_model)
pickled_model.close() # Close the pickle instances

In [26]:
# Loading the saved model
model_pkl2 = open(pickle_filename, 'rb')
NN_model2 = pickle.load(model_pkl2)
print ("Loaded model :: ", NN_model2)  # print to verify

Loaded model ::  NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=4, p=2,
                 radius=1.0)


In [27]:
# Dump the trained classifier (tfidf)  with Pickle
pickle_filename_1 = 'tfidf.pkl2'
pickled_model_1 = open(pickle_filename_1, 'wb')  # Open the file to save as pkl file
pickle.dump(tfidf, pickled_model_1)
pickled_model_1.close() # Close the pickle instances

In [28]:
# Loading the saved model
model_pkl_1 = open(pickle_filename_1, 'rb')
tfidf_model2 = pickle.load(model_pkl_1)
print ("Loaded model :: ", tfidf_model2)  # print to verify

Loaded model ::  TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=2000,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)


In [29]:
user_input_effects2 = ["happy,Sleepy,Apricot,Citrus,Grapefruit"]

In [31]:
output_results2 = tfidf_model2.transform(user_input_effects2)
output_results2

<1x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [32]:
# get the model predictions array
NN_model2.kneighbors(output_results2.todense())

(array([[1.09504845, 1.14089606, 1.15844355, 1.18178543]]),
 array([[1911,  398,  863, 1497]]))

In [33]:
# get predictions into JSON format for web team

import json
def recommend(user_input):
    temp_df = NN_model2.kneighbors(tfidf_model2.transform([user_input]).todense())[1]
    

    #print(temp_df)
    
    for i in range(4):
        info = strains_df.loc[temp_df[0][i]]['strain']
        info_effects = strains_df.loc[temp_df[0][i]]['effects']
        info_flavor = strains_df.loc[temp_df[0][i]]['flavor']
        info_description = strains_df.loc[temp_df[0][i]]['description']
        info_rating = strains_df.loc[temp_df[0][i]]['rating']
        
        print(json.dumps(info))
        print(json.dumps(info_effects))
        print(json.dumps(info_flavor))
        print(json.dumps(info_description))
        print(json.dumps(info_rating))
        
        #return json.dumps(info)  #for engineeers, the return does not work in jupyter lab.  Should work in vsCode.
        #return json.dumps(info_effects)
        #return json.dumps(info_flavor)
        #return json.dump(info_description)
        #return json.dumps(info_rating)

In [35]:
recommend('I want to feel happy')

"Sfv-Og"
"Happy,Relaxed,Uplifted,Euphoric,Focused"
"Earthy,Pine,Pungent"
"SFV OG by Cali Connection\u00a0is a sativa-dominant hybrid that is great for patients who need strong pain relief but don\u2019t want to be stuck on the couch. As the name indicates, this OG Kush relative\u00a0originates from California\u2019s San Fernando Valley. Although their names are barely distinguishable, SFV OG Kush is actually the Afghani-crossed\u00a0child to SFV OG.\u00a0The flowers will stay fairly small and may not look super impressive, but what it lacks in looks it makes up for in smell. The pine and lemon earthiness that SFV exudes can be a bit harsh and bitter if smoked, so this strain is one to save for the vaporizer if consuming directly. Its indica body effects take a little longer to feel than the initial head haziness, but once they do, everything will feel fine."
4.4
"Cheese-Quake"
"Euphoric,Uplifted,Relaxed,Tingly,Happy"
"Ammonia,Citrus,Rose"
"Cheese Quake by TGA Subcool Seeds is a 60/40 i

# Extra future features testing

# Medical Predictions
New dataset for a future feature, tokenize the `ailments` feature

In [None]:
# upload the medical.csv

URL = 'https://raw.githubusercontent.com/Build-Week-Med4/DS/master/machine_learning/medical.csv'
medical = pd.read_csv(URL)
medical.columns= medical.columns.str.lower()
medical = medical.rename(columns={'alments' : 'ailments'})

print(medical.shape)
medical.head()

In [None]:
# check for NaNs 
medical.isnull().sum()

In [None]:
# drop unnecessary columns
medical = medical.drop(columns=['rating', 'effects','flavor', 'description'])
medical.head()

# Tokenize the Alemnts Column

In [None]:
!python -m spacy download en_core_web_lg

In [None]:
import spacy
from spacy.tokenizer import Tokenizer
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

nlp = spacy.load("en_core_web_lg")

# The Tokenizer
tokenizer = Tokenizer(nlp.vocab)

# Make the tokens for description
ailments_tokens = []
for txt in tokenizer.pipe(medical['ailments'], batch_size=500):
    txt_tokens = [token.text for token in txt]
    ailments_tokens.append(txt_tokens)
medical['ailments_tokens'] = ailments_tokens
print(medical['ailments_tokens'].head())

In [None]:
def tokenize(document):
    
    doc = nlp(document)
    
    return [token.lemma_.strip() for token in doc if (token.is_stop != True) \
            and (token.is_punct != True)]

In [None]:
# Instantiate vectorizer object
tfidf = TfidfVectorizer(stop_words = 'english',
                       ngram_range = (1,2),
                       max_features = 2000)

# Create a vocabulary and tf-idf score per document
dtm = tfidf.fit_transform(medical['ailments'])
                         

# Get feature names to use as dataframe column headers
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())
nn = NearestNeighbors(n_neighbors=4, algorithm='kd_tree')
nn.fit(dtm)


# View Feature Matrix as DataFrame
print(dtm.shape)
dtm.head()

In [None]:
user_input = ['''seizures,stress''']

In [None]:
# Query the ideal descprition
results = tfidf.transform(ideal)
results

In [None]:
nn.kneighbors(new.todense())

In [None]:
# most ideal description,strain and flavor
medical['strain'][39]

### Pickel Medical Ailments

In [None]:
import pickle
# Dump the trained classifier (nn)  with Pickle
pickle_filename = 'ailments.pkl2'
pickled_ailments = open(pickle_filename, 'wb')  # Open the file to save as pkl file
pickle.dump(nn, pickled_ailments)
pickled_ailments.close() # Close the pickle instances

In [None]:
# Loading the saved model
ailments_pkl2 = open(pickle_filename, 'rb')
ailemnts_vector = pickle.load(ailments_pkl2)
print ("Loaded model :: ", ailemnts_vector)  # print to verify