In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Imports
# Natural Language Processing libraries, initiations and functions
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
import re


# Preprocessing
def preprocess(text):
    text = re.sub(r'[^a-zA-Z]',' ', text.lower())
    tokens = word_tokenize(text)
    lemmer = WordNetLemmatizer()
    stop_words = stopwords.words("english")
    return " ".join([lemmer.lemmatize(word) for word 
                     in tokens if len(word) > 1 and not word in stop_words])



# Cvec, Standard
cvec = CountVectorizer(analyzer = "word",
                       min_df = 2,
                       preprocessor = preprocess,
                       stop_words = 'english') 
# Cvec DF
#df_words = pd.DataFrame(cvec.fit_transform(df['doc_column']).todense(), 
#                        columns=cvec.get_feature_names())

pd.read_csv('./data/housing.csv').head()

pd.read_csv('./data/housing-data.csv').head()

pd.read_csv('./data/Toy_boston.csv').head()

In [3]:
df_canna = pd.read_csv('./../../cannabis.csv')
df_canna.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [4]:
#df_canna['Starting_dex'] = df_canna.index
df_canna.dropna(inplace = True)

In [5]:
df_canna.reset_index(inplace = True)

In [6]:
df_canna['Type'] = df_canna['Type'].map( lambda x: {'indica': 1,
 'hybrid': 2,
 'sativa': 3}[x])

In [7]:
df_effects = pd.DataFrame(cvec.fit_transform(df_canna['Effects']).todense(), 
                        columns=cvec.get_feature_names())

  'stop_words.' % sorted(inconsistent))


In [8]:
df_flavors = pd.DataFrame(cvec.fit_transform(df_canna['Flavor']).todense(), 
                        columns=cvec.get_feature_names())

In [9]:
dict_E = {}
for col in df_effects.columns:
    dict_E[col] = ('E_' + col)

df_effects.rename(columns=dict_E, inplace = True)

In [10]:
dict_F = {}
for col in df_flavors.columns:
    dict_F[col] = ('F_' + col)

df_flavors.rename(columns=dict_F, inplace = True)

In [34]:
df = pd.concat([df_canna.loc[:,["Type", "Rating", "Starting_dex"]], df_effects
                #, df_flavors
               ], axis = 1)

In [35]:
print(df_canna.loc[:,["Type", "Rating", "Starting_dex"]].shape)
print(df_effects.shape)
print(df_flavors.shape)

df.shape

(2277, 3)
(2277, 13)
(2277, 50)


(2277, 16)

In [36]:
test_df.isna().sum()

Type            0
Rating          0
Starting_dex    0
E_aroused       0
E_creative      0
               ..
F_tree          0
F_tropical      0
F_vanilla       0
F_violet        0
F_woody         0
Length: 66, dtype: int64

In [37]:
df

Unnamed: 0,Type,Rating,Starting_dex,E_aroused,E_creative,E_energetic,E_euphoric,E_focused,E_giggly,E_happy,E_hungry,E_relaxed,E_sleepy,E_talkative,E_tingly,E_uplifted
0,2,4.0,0,0,1,1,1,0,0,0,0,1,0,0,1,0
1,2,4.7,1,1,1,1,0,0,0,1,0,1,0,0,0,0
2,3,4.4,2,0,1,1,0,0,0,1,0,1,0,0,0,1
3,2,4.2,3,0,1,0,0,0,0,0,1,1,0,0,1,1
4,2,4.6,4,0,0,0,1,0,0,1,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2272,2,4.7,2346,0,0,1,1,0,0,1,0,1,0,0,0,1
2273,1,4.6,2347,0,0,0,1,0,0,1,0,1,1,0,0,1
2274,1,5.0,2348,0,0,0,1,0,0,1,0,1,1,1,0,0
2275,1,4.4,2349,0,0,0,1,0,0,1,1,1,1,0,0,0


In [38]:
y = df['Type']
X = df.drop(['Type', 'Starting_dex'], axis = 1)

In [39]:
from sklearn.model_selection import train_test_split, cross_val_score
import sklearn.metrics as metrics
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y) 

model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

print('Train:', model.score(X_train,y_train))
print('Test: ',  model.score(X_test,y_test))
np.average(cross_val_score(model, X, y, cv=3))

Train: 0.6362038664323374
Test:  0.6070175438596491


0.626701800614844

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y) 

model = RandomForestClassifier()
model.fit(X_train, y_train)

print('Train:', model.score(X_train,y_train))
print('Test: ',  model.score(X_test,y_test))
np.average(cross_val_score(model, X, y, cv=3))

Train: 0.8283538371411834
Test:  0.5666666666666667


0.55467720685112

In [42]:
from sklearn.neighbors import KNeighborsClassifier
X_train, X_test, y_train, y_test = train_test_split(X, y) 

model = KNeighborsClassifier()
model.fit(X_train, y_train)

print('Train:', model.score(X_train,y_train))
print('Test: ',  model.score(X_test,y_test))
np.average(cross_val_score(model, X, y, cv=3))

Train: 0.6695957820738138
Test:  0.5719298245614035


0.5432586736934563

In [43]:
y_hat = model.predict(X_test)

In [44]:
pd.Series(y_hat).value_counts()

2    340
1    177
3     53
dtype: int64

In [45]:
df_canna['Type'].value_counts()

2    1169
1     680
3     428
Name: Type, dtype: int64