# Initialization

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Imports
# Natural Language Processing libraries, initiations and functions
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
import re

# Preprocessing
# Filled on first use and then shared by every preprocess() call, so the
# stop-word corpus is not re-read (and the lemmatizer not re-created) for
# each document the vectorizer hands over.
_PREPROCESS_RESOURCES = {}


def _preprocess_resources():
    """Return the shared ``(lemmatizer, stop_words)`` pair, building it once."""
    if not _PREPROCESS_RESOURCES:
        # Build both values before storing anything, so a failed corpus
        # lookup cannot leave a half-filled cache behind.
        stop_words = frozenset(stopwords.words("english"))
        _PREPROCESS_RESOURCES.update(lemmatizer=WordNetLemmatizer(),
                                     stop_words=stop_words)
    return _PREPROCESS_RESOURCES["lemmatizer"], _PREPROCESS_RESOURCES["stop_words"]


def preprocess(text):
    """Normalize one document into a space-separated string of lemmas.

    The text is lower-cased, every character other than a-z/A-Z is replaced
    by a space, and the result is tokenized with ``word_tokenize``. Tokens
    that are a single character long or that appear in NLTK's English
    stop-word list are dropped (the check is made on the token before it is
    lemmatized); the remaining tokens are lemmatized with
    ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        Raw document text. It must support ``.lower()``.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces ("" if none survive).
    """
    text = re.sub(r'[^a-zA-Z]', ' ', text.lower())
    lemmer, stop_words = _preprocess_resources()
    return " ".join(
        lemmer.lemmatize(word)
        for word in word_tokenize(text)
        if len(word) > 1 and word not in stop_words
    )

# Shared count vectorizer: word-level analysis, each document first cleaned
# by preprocess(), scikit-learn's built-in English stop list applied, and
# min_df = 2 as the document-frequency threshold for keeping a term.
cvec = CountVectorizer(
    preprocessor=preprocess,
    analyzer="word",
    stop_words='english',
    min_df=2,
)
# Cvec DF
#df_words = pd.DataFrame(cvec.fit_transform(df['doc_column']).todense(), 
#                        columns=cvec.get_feature_names())

In [3]:
# Load the raw strain table and preview its first rows.
cannabis_csv = './../../cannabis.csv'
df_canna = pd.read_csv(cannabis_csv)
df_canna.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


## General cleaning

In [4]:
# Discard every row that has a missing value in any column.
df_canna = df_canna.dropna()

In [5]:
# Renumber the rows 0..n-1 after the dropna above so that df_canna lines up
# row-for-row with the freshly indexed count frames it is concatenated with
# later. drop=True discards the old labels instead of inserting them as an
# extra 'index' column: nothing in this notebook reads that column, and
# without drop=True every re-run of this cell inserts another leftover column.
df_canna = df_canna.reset_index(drop=True)

In [6]:
# Replace the textual strain type with an integer code. Any label that is
# not listed in the table raises KeyError.
type_codes = {'indica': 1, 'hybrid': 2, 'sativa': 3}
df_canna['Type'] = df_canna['Type'].map(lambda label: type_codes[label])

In [7]:
# Token counts for the Effects column: one row per strain, one column per
# vocabulary term. The frame gets a fresh 0..n-1 row index.
effects_counts = cvec.fit_transform(df_canna['Effects'])

# get_feature_names() was removed in scikit-learn 1.2; keep it only as a
# fallback for releases that predate get_feature_names_out().
if hasattr(cvec, 'get_feature_names_out'):
    effects_terms = cvec.get_feature_names_out()
else:
    effects_terms = cvec.get_feature_names()

# toarray() yields a plain ndarray (todense() returns an np.matrix).
df_effects = pd.DataFrame(effects_counts.toarray(), columns=effects_terms)

  'stop_words.' % sorted(inconsistent))


In [8]:
# Token counts for the Flavor column: one row per strain, one column per
# vocabulary term. Re-fitting cvec here replaces the vocabulary it learned
# from any previous fit.
flavor_counts = cvec.fit_transform(df_canna['Flavor'])

# get_feature_names() was removed in scikit-learn 1.2; keep it only as a
# fallback for releases that predate get_feature_names_out().
if hasattr(cvec, 'get_feature_names_out'):
    flavor_terms = cvec.get_feature_names_out()
else:
    flavor_terms = cvec.get_feature_names()

# toarray() yields a plain ndarray (todense() returns an np.matrix).
df_flavors = pd.DataFrame(flavor_counts.toarray(), columns=flavor_terms)

In [9]:
# Prefix every effect column with 'E_' so it stays distinguishable once the
# effect and flavor frames are placed side by side.
dict_E = {name: 'E_' + name for name in df_effects.columns}
df_effects = df_effects.rename(columns=dict_E)

In [10]:
# Prefix every flavor column with 'F_' so it stays distinguishable once the
# effect and flavor frames are placed side by side.
dict_F = {name: 'F_' + name for name in df_flavors.columns}
df_flavors = df_flavors.rename(columns=dict_F)

In [14]:
# One wide frame per strain: type code and rating, followed by the effect
# counts and then the flavor counts (joined column-wise on the row index).
strain_type_rating = df_canna.loc[:, ["Type", "Rating"]]
df = pd.concat([strain_type_rating, df_effects, df_flavors], axis=1)

In [None]:
# Export the combined per-strain frame; the row index is left out of the file.
df.to_csv("./data/cannabis_strains_cvec.csv", index= False)

## Meta DF Generation
 - Calculating value totals here rather than having Tableau do that work in-program.

In [23]:
# Column-wise totals over the rows whose type code is 1 (indica).
series_indica = df.loc[df['Type'] == 1].sum()

In [24]:
# Column-wise totals over the rows whose type code is 2 (hybrid).
series_hybrid = df.loc[df['Type'] == 2].sum()

In [25]:
# Column-wise totals over the rows whose type code is 3 (sativa).
series_sativa = df.loc[df['Type'] == 3].sum()

In [31]:
# Stack the three per-type totals as rows, then transpose so that each
# strain type is a column and each summed feature is a row.
type_totals = [series_indica, series_hybrid, series_sativa]
type_labels = ['Indica', 'Hybrid', 'Sativa']
df_meta_type = pd.DataFrame(type_totals, index=type_labels).transpose()

In [41]:
# The 'Type' row came from summing the numeric type codes, which carries no
# meaning; overwrite it with the column labels so each column names its type.
df_meta_type.loc['Type'] = df_meta_type.columns

In [45]:
# Remove the row that holds the summed ratings.
df_meta_type = df_meta_type.drop(index='Rating')

In [48]:
# Write the transposed table, i.e. one row per strain type and one column
# per feature.
df_meta_type.T.to_csv('./data/canna_meta_type.csv')

In [56]:
# Per-strain frame restricted to type code, rating and the effect counts.
strain_type_rating = df_canna.loc[:, ["Type", "Rating"]]
df_meta_effect = pd.concat([strain_type_rating, df_effects], axis=1)

In [57]:
# Column-wise totals of the per-strain frame, one Series per type code
# (1 = indica, 2 = hybrid, 3 = sativa). The generator is consumed by the
# unpacking below, while df_meta_effect still refers to the per-strain frame.
series_indica, series_hybrid, series_sativa = (
    df_meta_effect[df_meta_effect['Type'] == code].sum() for code in (1, 2, 3)
)

# Re-bind the name to the aggregate: rows are the summed columns, columns
# are the strain types.
df_meta_effect = pd.DataFrame(
    [series_indica, series_hybrid, series_sativa],
    index=['Indica', 'Hybrid', 'Sativa'],
).T

# Overwrite the summed type codes in the 'Type' row with the type labels.
df_meta_effect.loc['Type'] = df_meta_effect.columns

In [64]:
# Remove the leading 'E_' marker from the effect rows. str.strip('E_') is not
# a prefix removal: it deletes every 'E' and '_' character found at either
# end of the label, so it would also eat into labels that merely start or
# end with those characters. Labels without the prefix are left untouched.
df_meta_effect.index = [
    label[len('E_'):] if label.startswith('E_') else label
    for label in df_meta_effect.index
]

In [66]:
# Write the transposed table, i.e. one row per strain type and one column
# per summed feature.
df_meta_effect.T.to_csv('./data/canna_meta_effect.csv')

In [72]:
# Number of rows in df per type code (1 = indica, 2 = hybrid, 3 = sativa,
# as encoded earlier in this notebook).
df['Type'].value_counts()

2    1169
1     680
3     428
Name: Type, dtype: int64

In [80]:
# Number of strains per type, derived from the data instead of being copied
# by hand from the value_counts() output, and ordered like the columns of
# df_meta_effect.
type_labels = {1: 'Indica', 2: 'Hybrid', 3: 'Sativa'}
type_count = (
    df['Type']
    .map(type_labels)
    .value_counts()
    .reindex(df_meta_effect.columns)
)
type_count.name = 'Type_count'

# DataFrame.append was removed in pandas 2.0; pd.concat adds the counts as a
# 'Type_count' row. The label-only 'Type' row is dropped because it cannot
# take part in the division performed on this frame afterwards.
test_df = pd.concat([df_meta_effect, type_count.to_frame().T]).drop(index='Type')

In [102]:
# One record per strain type: every row value of that type's column divided
# by the type's 'Type_count' entry, keyed by the capitalized row label.
list_dicts = []
for strain_type in test_df.columns:
    type_total = test_df.loc['Type_count', strain_type]
    shares = {
        label.capitalize(): test_df.loc[label, strain_type] / type_total
        for label in test_df.index
    }
    # 'Type' goes first so it leads the columns of the resulting frame.
    list_dicts.append({'Type': strain_type, **shares})
df_meta_percent = pd.DataFrame(list_dicts)

In [104]:
# 'Type_count' is each count divided by itself, so it carries no information.
df_meta_percent = df_meta_percent.drop(columns='Type_count')

In [105]:
# Export the per-type share table (index=False is not passed, so the row
# index is written as well).
df_meta_percent.to_csv('./data/canna_meta_percent.csv')