In [46]:
"""
Import Statements
"""

# Classics
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Base
import re
import os
import gensim
from collections import Counter

# CountVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Classification
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# NLP Libraries
import spacy
from nltk.stem import PorterStemmer
from spacy.tokenizer import Tokenizer

# Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline


# Topic Modeling
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.ldamulticore import LdaMulticore

# Topic Distance Visualiztion
import pyLDAvis.gensim

nlp = spacy.load("en_core_web_lg")

In [47]:
pwd

'/Users/jorge/Med-Cabinet-2/Data'

In [48]:
ls

cannabis.csv   medcab.csv     medcab1_2.csv


In [50]:
df = pd.read_csv('/Users/jorge/Med-Cabinet-2/Data/cannabis.csv')
df.head(2)

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...


In [51]:
good_stuff = df[df['Rating'] >= 4.0]
good_stuff.shape

(2162, 6)

In [52]:
# replace blank flavor with ""
good_stuff = df.replace(np.nan, '', regex=True)

In [53]:
def clean_string(strng):
    s = strng.replace(","," ") # comma-> space
    s = s.replace("("," ") # (-> space
    s = s.replace(")"," ") # (-> space
    s = s.lower()
    return s

In [56]:
# cols = good_stuff.columns
cols = ['Type', 'Effects', 'Flavor', 'Description']

for col in cols:
    good_stuff[col] =good_stuff[col].apply(clean_string)

good_stuff['text'] = good_stuff['Type'] + " " + good_stuff['Effects'] + " " + good_stuff['Flavor']

In [60]:
good_stuff.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description,text
0,100-Og,hybrid,4.0,creative energetic tingly euphoric relaxed,earthy sweet citrus,$100 og is a 50/50 hybrid strain that packs a ...,hybrid creative energetic tingly euphoric rela...
1,98-White-Widow,hybrid,4.7,relaxed aroused creative happy energetic,flowery violet diesel,the ‘98 aloha white widow is an especially pot...,hybrid relaxed aroused creative happy energeti...
2,1024,sativa,4.4,uplifted happy relaxed energetic creative,spicy/herbal sage woody,1024 is a sativa-dominant hybrid bred in spain...,sativa uplifted happy relaxed energetic creati...
3,13-Dawgs,hybrid,4.2,tingly creative hungry relaxed uplifted,apricot citrus grapefruit,13 dawgs is a hybrid of g13 and chemdawg genet...,hybrid tingly creative hungry relaxed uplifted...
4,24K-Gold,hybrid,4.6,happy relaxed euphoric uplifted talkative,citrus earthy orange,also known as kosher tangie 24k gold is a 60%...,hybrid happy relaxed euphoric uplifted talkati...


## Tokenize Function:
<a id="#p1"></a>

In [63]:
#Tokenizer Pipe

tokens = []

# Make the tokens
for doc in nlp.pipe(good_stuff['text'], disable=['tagger', 'parser', 'ner']):
    
    doc_tokens = []
    
    for token in doc:
        if (token.is_stop==False) & (token.is_punct==False):
           doc_tokens.append(token.text)
   
    tokens.append(doc_tokens)
    
good_stuff['tokens'] = tokens

## Text Preprocessing

In [64]:
 def tokenize(doc):
    nlp_doc = nlp(doc.lower())
    doc_tokens = [token.text for token in nlp_doc
    if (token.is_stop==False) and (token.is_punct==False)
    and token.pos_!="-PRON-" and token.text != " "]
    
    return doc_tokens

In [65]:
tokenize(good_stuff['text'][0])

['hybrid',
 'creative',
 'energetic',
 'tingly',
 'euphoric',
 'relaxed',
 'earthy',
 'sweet',
 'citrus']

In [66]:
"this is a sample string with a \n newline character".replace('\n', '')

'this is a sample string with a  newline character'

In [67]:
# Lets apply the above and creat our series of tokenized texts:
good_stuff['tokens'] = good_stuff['text'].apply(lambda x: tokenize(x))
good_stuff['tokens'].head()

0    [hybrid, creative, energetic, tingly, euphoric...
1    [hybrid, relaxed, aroused, creative, happy, en...
2    [sativa, uplifted, happy, relaxed, energetic, ...
3    [hybrid, tingly, creative, hungry, relaxed, up...
4    [hybrid, happy, relaxed, euphoric, uplifted, t...
Name: tokens, dtype: object

In [73]:
STOPWORDS = set(STOPWORDS).union(set(['said', 'mr', 'mrs']))

def tokenize(text):
    return [token for token in simple_preprocess(text) if token not in STOPWORDS]

In [74]:
tokenize("Hello World! This a test of the tokenization method")

['hello', 'world', 'test', 'tokenization', 'method']

In [75]:
len(tokens)

2351

In [36]:
tokenize("Hello World! This a test of the tokenization method")

In [77]:
# Object from Base Python
from collections import Counter

# The object `Counter` takes an iterable, but you can instaniate an empty one and update it. 
word_counts = Counter()

# Update it based on a split of each of our documents
good_stuff['tokens'].apply(lambda x: word_counts.update(x))

# Print out the 10 most common words
word_counts.most_common(10)

[('happy', 1871),
 ('relaxed', 1726),
 ('euphoric', 1635),
 ('uplifted', 1507),
 ('hybrid', 1212),
 ('earthy', 1105),
 ('sweet', 1053),
 ('creative', 747),
 ('sleepy', 738),
 ('indica', 699)]

### Vector Representation

In [78]:
# instantiate vectorizer object:
tfidf = TfidfVectorizer(max_df=.97,
                        min_df=3,
                        stop_words='english')

# create a vocabulary and get word counts per document:
dtm_tfidf = tfidf.fit_transform(good_stuff['text'])

# print word counts

# get feature names to use as dataframe column headers:
tf_good_stuff = pd.DataFrame(dtm_tfidf.todense(), columns=tfidf.get_feature_names())

# view Feature Matrix as DataFrame:
tf_good_stuff.head()

Unnamed: 0,ammonia,apple,apricot,aroused,berry,blue,blueberry,butter,cheese,chemical,...,tar,tea,tingly,tobacco,tree,tropical,uplifted,vanilla,violet,woody
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.477579,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.35866,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.691871,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.1608,0.0,0.0,0.358211
3,0.0,0.0,0.645008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.290903,0.0,0.0,0.0,0.144217,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.210917,0.0,0.0,0.0


In [80]:
# fit on dtm_tfidf:
nn = NearestNeighbors(n_neighbors=5, algorithm='kd_tree')
nn.fit(tf_good_stuff)

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

In [81]:
nn.kneighbors([tf_good_stuff.iloc[0]])

(array([[0.        , 0.67125039, 0.68429225, 0.69605859, 0.70552725]]),
 array([[   0,   81, 1286,  429, 1143]]))

In [83]:
good_stuff['text'][:10]

0    hybrid creative energetic tingly euphoric rela...
1    hybrid relaxed aroused creative happy energeti...
2    sativa uplifted happy relaxed energetic creati...
3    hybrid tingly creative hungry relaxed uplifted...
4    hybrid happy relaxed euphoric uplifted talkati...
5                                     indica none none
6    hybrid relaxed euphoric happy uplifted hungry ...
7    indica relaxed happy euphoric uplifted giggly ...
8    sativa uplifted focused happy talkative relaxe...
9    indica relaxed tingly happy euphoric uplifted ...
Name: text, dtype: object

In [87]:
import re, string

def punct_cleaning(column):
    processed_data = []
    for i in column:
        text = re.sub('[%s]' % string.punctuation, '', i).lower()
        processed_data.append(text)
    return pd.DataFrame(processed_data)

In [88]:
good_stuff['text'] = punct_cleaning(good_stuff['text'])
good_stuff['text'][10]

'hybrid energetic happy talkative uplifted pine diesel'

In [89]:
text = ['hybrid energetic happy talkative uplifted pine diesel']

In [90]:
good_stuff_text = tfidf.transform(text)

In [91]:
nn.kneighbors(good_stuff_text.todense())

(array([[0.        , 0.33640584, 0.67665231, 0.69700992, 0.73110367]]),
 array([[  10,   76,  493, 2027, 1506]]))

In [93]:
for num in [  10,   76,  493, 2027, 1506]:
    print(good_stuff['text'][num])
    print('\n')

hybrid energetic happy talkative uplifted pine diesel


hybrid happy talkative euphoric uplifted energetic pine diesel earthy


hybrid uplifted happy euphoric relaxed energetic diesel pine pungent


hybrid relaxed happy uplifted energetic talkative earthy pine sweet


hybrid euphoric energetic creative relaxed happy earthy pine diesel




### Classification

In [94]:
good_stuff.dtypes

Strain          object
Type            object
Rating         float64
Effects         object
Flavor          object
Description     object
text            object
tokens          object
dtype: object

In [95]:
doc = nlp("Two bananas in pyjamas")

In [96]:
bananas_vector = doc.vector
print(len(bananas_vector))

300


In [97]:
def get_word_vectors(docs):
    return [nlp(doc).vector for doc in docs]

In [99]:
# Apply to your Dataset

param_dist = {
    
    'max_depth' : randint(3,10),
    'min_samples_leaf': randint(2,15)
}

In [104]:
# Create Pipeline Components

vect = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
rfc = RandomForestClassifier()

# Define the Pipeline
pipe = Pipeline([
                 #Vectorizer
                 ('vect', vect),
                 # Classifier
                 ('clf', rfc)
                ])

# The pipeline puts together a bunch fit then transform,fit then predict. 

In [114]:
parameters = {
    'vect__max_df': ( 0.75, 1.0),
    'vect__min_df': (.02, .05),
    'vect__max_features': (500,1000),
    'clf__n_estimators':(5, 10,),
    'clf__max_depth':(15,20)
}

grid_search = GridSearchCV(pipe,parameters, cv=5, n_jobs=-1, verbose=1)

In [134]:
# grid_search.fit(good_stuff['text'], good_stuff['Rating'])

###  Topic Modeling

In [121]:
from gensim.models import LdaMulticore
from gensim.corpora import Dictionary

In [122]:
good_stuff['tokens'].head()

0    [hybrid, creative, energetic, tingly, euphoric...
1    [hybrid, relaxed, aroused, creative, happy, en...
2    [sativa, uplifted, happy, relaxed, energetic, ...
3    [hybrid, tingly, creative, hungry, relaxed, up...
4    [hybrid, happy, relaxed, euphoric, uplifted, t...
Name: tokens, dtype: object

### Learn the vocubalary of the good_stuff data:

In [123]:
id2word = corpora.Dictionary(good_stuff['tokens'])

In [124]:
corpus = [id2word.doc2bow(text)for text in good_stuff['tokens']]

In [133]:
# lda = LdaMulticore(corpus=corpus,
#                   id2word=id2word,
#                   iterations=5,
#                   workers=4,
#                   num_topics = 10 # You can change this parameter
#                  )

In [127]:
lda.print_topics()

[(0,
  '0.095*"relaxed" + 0.080*"happy" + 0.072*"earthy" + 0.063*"uplifted" + 0.059*"indica" + 0.058*"euphoric" + 0.047*"sleepy" + 0.041*"hybrid" + 0.037*"creative" + 0.030*"energetic"'),
 (1,
  '0.077*"relaxed" + 0.075*"earthy" + 0.074*"uplifted" + 0.058*"euphoric" + 0.054*"hybrid" + 0.052*"sleepy" + 0.051*"happy" + 0.043*"sweet" + 0.042*"focused" + 0.041*"hungry"'),
 (2,
  '0.118*"happy" + 0.100*"relaxed" + 0.078*"uplifted" + 0.071*"euphoric" + 0.067*"sweet" + 0.066*"hybrid" + 0.044*"creative" + 0.043*"earthy" + 0.035*"sleepy" + 0.029*"tingly"'),
 (3,
  '0.107*"happy" + 0.091*"relaxed" + 0.091*"euphoric" + 0.076*"earthy" + 0.072*"hybrid" + 0.067*"uplifted" + 0.044*"sleepy" + 0.039*"indica" + 0.032*"creative" + 0.026*"sweet"'),
 (4,
  '0.118*"hybrid" + 0.089*"relaxed" + 0.078*"uplifted" + 0.078*"euphoric" + 0.063*"earthy" + 0.058*"sweet" + 0.050*"happy" + 0.043*"creative" + 0.038*"focused" + 0.027*"sleepy"'),
 (5,
  '0.120*"euphoric" + 0.074*"uplifted" + 0.072*"relaxed" + 0.069*"happy

In [128]:
words = [re.findall(r'"([^"]*)"',t[1]) for t in lda.print_topics()]

In [129]:
topics = [' '.join(t[0:5]) for t in words]

In [130]:
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
relaxed happy earthy uplifted indica

------ Topic 1 ------
relaxed earthy uplifted euphoric hybrid

------ Topic 2 ------
happy relaxed uplifted euphoric sweet

------ Topic 3 ------
happy relaxed euphoric earthy hybrid

------ Topic 4 ------
hybrid relaxed uplifted euphoric earthy

------ Topic 5 ------
euphoric uplifted relaxed happy hybrid

------ Topic 6 ------
uplifted relaxed euphoric happy hybrid

------ Topic 7 ------
happy euphoric uplifted relaxed sweet

------ Topic 8 ------
euphoric happy sweet relaxed hybrid

------ Topic 9 ------
happy uplifted relaxed euphoric sweet



### Topic Distance Visualization

In [131]:
pyLDAvis.enable_notebook()

In [132]:
pyLDAvis.gensim.prepare(lda, corpus, id2word)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
