# Imports and basic set up

In [11]:
# In the beginning, there is time
import time

In [12]:
#Quality of life
#Some things we do generate a lot of warnings, and it just becomes clutter.
import warnings
warnings.filterwarnings("ignore")
#I like it when my notebook helps me out
%config IPCompleter.greedy=True
# Sometimes you just need to print pretty
from pprint import pprint
#Lots of these operations take many minutes to complete. So it behooves us to pickle the outputs and just unpickle them each time we re-open the notebook
import pickle


In [13]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score
import pandas as pd
from bayes_opt import BayesianOptimization

Data is from: https://www.kaggle.com/datasets/toygarr/datasets-for-natural-language-processing

Dataset is from a collection of sentiment datasets, but we just want to play with the food one for now.

"ctweet, stweet, food" datasets are positive or negative analysis (sentiment) -> 0 negative -> 1 positive (ctweet has neutral 0, 1, 2)

We're assuming that any code we write to deal with this food data set will be extensible later by simply adding the rest of the data if we so choose. At the moment it's simply faster to work with a smaller subset of the data as we design the pipeline and its displays.

## Utilities

In [14]:
import pandas as pd
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk import ngrams
import matplotlib.pyplot as plt

## Corpus and Vectoring tools

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack
from sklearn.cluster import DBSCAN
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
import os
from sklearn.model_selection import train_test_split


# Load the data

In [16]:
# Folder holding the food sentiment CSVs. The original machine path stays the default,
# so existing runs are unchanged; set SENTIMENT_DATA_DIR to run the notebook elsewhere.
DATA_DIR = os.environ.get(
    'SENTIMENT_DATA_DIR',
    'C:/Users/Prathmun/Documents/Springboard Jupyter/Sentiment-Capstone/data/food',
)
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))

# The published train/test files are merged into one frame, train rows first.
# NOTE(review): concat keeps each file's own row labels, so df.index is expected to
# contain repeated labels - confirm before relying on label-based lookups.
frames = [train, test]
df = pd.concat(frames)

# Processing the text

### Tokenize text

In [17]:
# Break every review into NLTK word tokens; each row's token list lands in `tokenized`.
df['tokenized'] = df['text'].apply(word_tokenize)



In [18]:
# Quick look at the first two rows to confirm the new `tokenized` column holds token lists.
df.head(2)

Unnamed: 0,text,Y,tokenized
0,i was put off at first by the green powder but...,1,"[i, was, put, off, at, first, by, the, green, ..."
1,these ginger chews are too good to be true i t...,1,"[these, ginger, chews, are, too, good, to, be,..."


### Remove Stop Words


In [19]:
# NLTK's English stop-word list. The same object is handed to TfidfVectorizer as
# `stop_words` further down, in addition to being used for filtering here.
stopwords = nltk.corpus.stopwords.words("english")

In [20]:
# Drop stop words from every token list.
# Membership is tested against a set: `stopwords` itself is a list, and scanning it
# once per token over the whole corpus is needlessly slow. The list is left untouched
# because it is also passed to TfidfVectorizer later on.
stopword_set = set(stopwords)
df['nostops'] = df.tokenized.apply(
    lambda tokens: [item for item in tokens if item not in stopword_set]
)

In [21]:
# Same two rows again, now with the stop-word-free `nostops` column alongside `tokenized`.
df.head(2)

Unnamed: 0,text,Y,tokenized,nostops
0,i was put off at first by the green powder but...,1,"[i, was, put, off, at, first, by, the, green, ...","[put, first, green, powder, bad, little, grain..."
1,these ginger chews are too good to be true i t...,1,"[these, ginger, chews, are, too, good, to, be,...","[ginger, chews, good, true, try, limit, one, d..."


### Stemming 
Might do lemmatization later, but stemming is simpler and more reliable and we're looking to get something working before we refine it. Lemmatization is a step we can experiment with in our refinement stages

We chose Snowball because it is a mature, stable stemmer that builds on the older Porter algorithm and incorporates improvements drawn from real-world experience with it.
Snowball docs: https://www.nltk.org/api/nltk.stem.snowball.html
Article that informed our decision to choose Snowball: https://towardsdatascience.com/stemming-lemmatization-what-ba782b7c0bd8


It looks as if stemming can increase recall, even in short texts but can also cause problems. We're going to move forward with stemmed words for now, but again can return to this if we seek optimization tasks down the line
Source: https://stackoverflow.com/questions/47219389/compute-word-n-grams-on-original-text-or-after-lemma-stemming-process#:~:text=Computing%20word%20n%2Dgrams%20after,you%20want%20to%20do%20it.

In [22]:
# Reduce every remaining token to its Snowball stem; one stem list per review.
stemmer = SnowballStemmer("english")
df['stemmed'] = df['nostops'].apply(lambda tokens: list(map(stemmer.stem, tokens)))

In [23]:
# Spot-check the stems for the first two reviews.
df.stemmed.head(2)

0    [put, first, green, powder, bad, littl, graini...
1    [ginger, chew, good, true, tri, limit, one, da...
Name: stemmed, dtype: object

In [24]:
# (rows, columns) of the combined frame after the three text-processing columns were added.
df.shape

(363219, 5)

# Building Limited Vocabularies

In [25]:
# Flatten every negative review (Y == 0) into one long space-separated string
# built from its stop-word-free tokens.
nostops_text_neg = " ".join(
    " ".join(tokens) for tokens in df.loc[df['Y'] == 0, 'nostops']
)

In [26]:
# Every token occurrence in the negative reviews (duplicates kept, used for counting).
# Splitting on an explicit ' ' yields an empty-string token wherever two spaces are
# adjacent, e.g. where a review contributed no tokens to the joined text.
neg_list = nostops_text_neg.split(' ')
# Distinct tokens seen in negative reviews (used for the set algebra below).
neg_set = set(neg_list)

In [27]:
# Flatten every positive review (Y == 1) into one long space-separated string
# built from its stop-word-free tokens.
nostops_text_pos = " ".join(
    " ".join(tokens) for tokens in df.loc[df['Y'] == 1, 'nostops']
)

In [28]:
# Every token occurrence in the positive reviews (duplicates kept, used for counting).
# Splitting on an explicit ' ' yields an empty-string token wherever two spaces are
# adjacent, e.g. where a review contributed no tokens to the joined text.
pos_list = nostops_text_pos.split(' ')
# Distinct tokens seen in positive reviews (used for the set algebra below).
pos_set = set(pos_list)

In [29]:
# Token occurrences versus distinct tokens on the negative side.
print(f'neg list len = {len(neg_list)}')
print(f'neg set len = {len(neg_set)}')

neg list len = 2334177
neg set len = 46886


In [30]:
# Token occurrences versus distinct tokens on the positive side.
print(f'pos list len = {len(pos_list)}')
print(f'pos set len = {len(pos_set)}')

pos list len = 11470406
pos set len = 104877


## Ratio DF

## Extracting Negative Only words

In [31]:
# Words that occur in negative reviews and never in positive ones.
neg_only_set = neg_set.difference(pos_set)
len(neg_only_set)

11892

### Neg only Count DF

In [32]:
# Tally how often each negative-only word occurs in the negative reviews. `slicer`
# sorts on these tallies to pick the most frequent words for the vectorizer vocabulary.
neg_count = {}
for word in neg_list:
    if word not in neg_only_set:
        continue
    neg_count[word] = neg_count.get(word, 0) + 1

# A one-row frame (one column per word) turned on its side: the words become the
# index and the tallies sit in a single column labelled 0.
neg_count_df = pd.DataFrame(neg_count, index=[0])
neg_only_count_df = neg_count_df.transpose()

len(neg_only_set)

11892

## Extracting Positive Only Words

In [33]:
# Words that occur in positive reviews and never in negative ones.
pos_only_set = pos_set.difference(neg_set)
len(pos_only_set)

69883

### Pos Only Count Df

In [34]:
# Tally how often each positive-only word occurs in the positive reviews. `slicer`
# sorts on these tallies to pick the most frequent words for the vectorizer vocabulary.
pos_count = {}
for word in pos_list:
    if word not in pos_only_set:
        continue
    pos_count[word] = pos_count.get(word, 0) + 1

# A one-row frame (one column per word) turned on its side: the words become the
# index and the tallies sit in a single column labelled 0.
pos_count_df = pd.DataFrame(pos_count, index=[0])
pos_only_count_df = pos_count_df.transpose()

len(pos_only_set)

69883

### Extracting word ratios

Using set logic to grab the words that appear in both positive and negative polarity documents

In [35]:
# Words present on both sides: the only ones for which both counts exist.
shared_set = neg_set & pos_set
len(shared_set)

34994

In [36]:
# Index the frame by the shared words themselves, sorted so the row order is
# reproducible (set iteration order is not).
# The count frames built in the following cells are indexed by word, and pandas
# aligns column assignment on index labels. With a default 0..N-1 index every
# assigned count comes out NaN and `slicer`, which reads ratio_df.index, hands
# back row numbers instead of words.
# Column 0 still carries the word as a value, matching the previous layout.
shared_words = sorted(shared_set)
ratio_df = pd.DataFrame({0: shared_words}, index=shared_words)

### Counting the negative appearances of our set of words

In [37]:
# Occurrences in negative reviews of every word that both classes share.
# This rebinds `neg_count` / `neg_count_df`, which an earlier cell used for the
# negative-only tallies.
neg_count = {}
for word in neg_list:
    if word in shared_set:
        neg_count[word] = neg_count.get(word, 0) + 1

# One row per word (index = word), counts in the single column labelled 0.
neg_count_df = pd.DataFrame(neg_count, index=[0]).transpose()

### Counting the positive appearances of our set of words

In [38]:
# Occurrences in positive reviews of every word that both classes share.
# This rebinds `pos_count` / `pos_count_df`, which an earlier cell used for the
# positive-only tallies.
pos_count = {}
for word in pos_list:
    if word in shared_set:
        pos_count[word] = pos_count.get(word, 0) + 1

# One row per word (index = word), counts in the single column labelled 0.
pos_count_df = pd.DataFrame(pos_count, index=[0]).transpose()


In [39]:
### Stapling the pos/neg counts to the ratio df

In [40]:
# Key the frame by the word held in column 0 before attaching the counts.
# Column assignment aligns on index labels and the count frames are indexed by word,
# so a frame that is still on a 0..N-1 index would receive nothing but NaN here.
# `slicer` also reads ratio_df.index and expects words back.
# Re-keying is a no-op when the frame is already indexed by word.
ratio_df.index = ratio_df[0].to_numpy()
ratio_df['neg_count'] = neg_count_df[0]
ratio_df['pos_count'] = pos_count_df[0]

### Getting ratiod

In [41]:
# Row-wise mean of the negative and positive counts for each shared word.
# NOTE(review): despite the column name this is an average, not a quotient of the two
# counts - confirm whether a true neg/pos ratio was intended before relying on it.
ratio_df['ratio'] = ratio_df[['neg_count','pos_count']].mean(axis=1)

# Function to slice the lexicons appropriately

In [42]:
def slicer(pos_only, neg_only, high_ratio):
    """Assemble one vocabulary list from the three module-level word tables.

    Reads ``pos_only_count_df``, ``neg_only_count_df`` and ``ratio_df`` from the
    notebook namespace and prints the length of each slice as it is taken.

    Args:
        pos_only: number of rows taken from ``pos_only_count_df`` after sorting its
            column 0 in descending order.
        neg_only: number of rows taken from ``neg_only_count_df``, sorted the same way.
        high_ratio: number of rows kept from ``ratio_df`` after sorting ``'ratio'``
            in descending order and discarding the first row.

    Returns:
        list: index labels of the positive slice, then the negative slice, then the
        ratio slice.
    """
    def top_labels(frame, column, count):
        # Index labels of the `count` largest rows of `frame` by `column`.
        ranked = frame.sort_values(by=column, ascending=False)
        return list(ranked.head(count).index)

    pos_slice = top_labels(pos_only_count_df, [0], pos_only)
    print(len(pos_slice))

    neg_slice = top_labels(neg_only_count_df, [0], neg_only)
    print(len(neg_slice))

    # One extra row is fetched and the top-ranked one is thrown away.
    # NOTE(review): the reason for skipping the first entry is not visible here.
    ratio_slice = top_labels(ratio_df, 'ratio', high_ratio + 1)[1:]
    print(len(ratio_slice))

    return pos_slice + neg_slice + ratio_slice

# <span style="color:purple"> Clustering and Secondary EDA/Feature Engineering</span>

We played briefly with using DBSCAN, but it didn't converge at all, it gave us 360k categories

Thinking that instead of conventional clustering we're going to play with latent dirichlet allocation because it'll do a similar thing as K-means, but also provide us with a lot more information about the composition of those clusters.
Original Paper: https://ai.stanford.edu/~ang/papers/nips01-lda.pdf
Tutorial we used to help us get going: https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0
Docs for the gensim module we're doing the lda with: https://radimrehurek.com/gensim/models/ldamodel.html

In [43]:
stemmed_texts = list(df.stemmed)

In [44]:
#makes a dictionary mapping to unique IDs
id2word = Dictionary(stemmed_texts)


In [45]:
#makes a bag of words version of the Gensim Dictionary
corpus = [id2word.doc2bow(text) for text in stemmed_texts]


# Feature Engineering with LDA

## Topic Vectors

Documentation we're going to need to use to get at document relationships to topics: https://radimrehurek.com/gensim/models/ldamodel.html

Relevant stack overflow
https://stackoverflow.com/questions/43357247/get-document-topics-and-get-term-topics-in-gensim

In [46]:
import gensim

In [47]:
import scipy

In [48]:
def topic_model_and_dump(num_topics=100, random_state=None):
    """Fit an LDA model on the notebook corpus and pickle the document-topic table.

    Reads the module-level ``corpus``, ``id2word`` and ``df``. The result is a
    DataFrame with one row per document (sharing ``df.index``) and one column per
    topic named ``topic_<n>``; it is written to ``topic_vector<num_topics>_topics``
    in the current working directory. Progress and timings are printed.

    Args:
        num_topics: number of topics to fit; also embedded in the output file name.
        random_state: forwarded to ``LdaModel``. The default ``None`` keeps the
            previous unseeded behaviour; pass an int for repeatable topics.

    Returns:
        None.
    """
    print('Running ' + str(num_topics) + ' topic LDA')

    # Train the model on the corpus.
    start = time.time()
    lda = LdaModel(corpus, id2word=id2word, num_topics=num_topics, random_state=random_state)
    stop = time.time()
    print("training the model took " +str(stop-start) + " seconds to complete")

    start = time.time()
    # Approach taken from
    # https://stackoverflow.com/questions/46574720/python-gensim-lda-add-the-topic-to-the-document-after-getting-the-topics
    all_topics = lda.get_document_topics(corpus, minimum_probability=0.0)
    # corpus2csc lays the data out as topics x documents. num_terms pins the height
    # to num_topics instead of letting it be inferred from the ids that happen to occur.
    topics_by_doc = gensim.matutils.corpus2csc(all_topics, num_terms=num_topics)
    all_topics_df = pd.DataFrame(topics_by_doc.T.toarray())
    stop = time.time()
    print("pulling the topcs took " +str(stop-start) + " seconds to complete")

    # Line the rows up with the source frame and give the columns recognisable names.
    all_topics_df.index = df.index
    all_topics_df.columns = ['topic_' + str(each) for each in all_topics_df.columns]

    start = time.time()
    # The context manager guarantees the file is flushed and closed even if dump fails.
    with open("topic_vector" +str(num_topics) + "_topics", 'wb') as handle:
        pickle.dump(all_topics_df, handle)
    stop = time.time()
    print('Succesfully saved ' + str(num_topics) + ' dataframe' + 'it took ' +str(stop-start) + ' seoconds to save')


In [60]:
# for value in [5,10,25,50,100,150,200,300,400,500,600]:
#      topic_model_and_dump(value)

# Process to map modeling times based on dimensionality

In [133]:
def recall_calculator(confuse):
    """Recall (true-positive rate) from a 2x2 confusion matrix.

    Args:
        confuse: indexable as ``confuse[row][col]``; row 1 is read as
            ``[false negatives, true positives]``.

    Returns:
        ``tp / (tp + fn)`` when there is at least one true positive, otherwise ``0``.
    """
    false_negatives, true_positives = confuse[1][0], confuse[1][1]
    if true_positives <= 0:
        return 0
    return true_positives / (true_positives + false_negatives)

In [134]:
def grad_boosting_model(dicto, X_train, y_train, X_test, y_test, random_state=None):
    """Fit a gradient-boosting classifier and record its statistics in ``dicto``.

    The entry ``dicto['GradientBoostingClassifier']`` is created if missing and
    receives three keys: ``'training_time'`` (seconds spent in ``fit``),
    ``'confuse'`` (confusion matrix on the test split) and ``'ROC_AUC_Score'``
    (computed from the second column of ``predict_proba`` on the test split).

    Args:
        dicto: results dictionary, mutated in place.
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels.
        random_state: forwarded to the classifier. The default ``None`` keeps the
            previous unseeded behaviour; pass an int for repeatable fits.

    Returns:
        None.
    """
    # setdefault lets callers pass a dict that was not pre-seeded with this key.
    results = dicto.setdefault('GradientBoostingClassifier', {})

    grady_the_boosted = GradientBoostingClassifier(learning_rate=1.0,
                                                   n_estimators=50,
                                                   max_leaf_nodes=4,
                                                   random_state=random_state)

    # Time the training only.
    funky_time_start = time.time()
    grady_the_boosted.fit(X_train, y_train)
    funky_time_stop = time.time()
    results['training_time'] = funky_time_stop - funky_time_start

    # Confusion matrix on the held-out split.
    results['confuse'] = confusion_matrix(y_test, grady_the_boosted.predict(X_test))

    # predict_proba column 1 is used as the score for ROC AUC.
    results['ROC_AUC_Score'] = roc_auc_score(y_test, grady_the_boosted.predict_proba(X_test)[:, 1])
    

In [135]:
def model_aggregator(X_train, y_train, X_test, y_test):
    """Build an empty statistics record per model and have each model fill it in.

    Only the gradient-boosting classifier is run; ``grad_boosting_model`` writes its
    results into the record in place.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels.

    Returns:
        dict: model name -> ``{'confuse', 'training_time', 'ROC_AUC_Score'}``.
    """
    model_set = ['GradientBoostingClassifier']
    # Every model starts from the same blank record.
    model_stats = {
        name: {'confuse': [], 'training_time': 0, 'ROC_AUC_Score': 0}
        for name in model_set
    }
    grad_boosting_model(model_stats, X_train, y_train, X_test, y_test)
    return model_stats



In [127]:
def grid_maker(grid_dict, stats_prefix='model_stats', random_state=None):
    """Train and score one model per grid configuration, pickling each result.

    ``grid_dict`` holds four parallel lists under the keys ``'pos_only'``,
    ``'neg_only'``, ``'high_ratio'`` and ``'topics'``; position ``i`` of each list
    describes run ``i``. For every run the function loads the pickled topic table
    for that topic count, builds a TF-IDF matrix restricted to the lexicon returned
    by ``slicer``, stacks the two side by side, splits into train/test and passes
    the split to ``model_aggregator``. Progress and timings are printed.

    Relies on the module-level ``df``, ``stopwords``, ``slicer`` and
    ``model_aggregator``.

    NOTE(review): the vectorizer is fitted on every row before the split, and the
    lexicon tables are built from the labels of every row, so the test split is not
    fully unseen by the feature pipeline.

    Args:
        grid_dict: dict of parallel lists as described above; the number of runs is
            ``len(grid_dict['pos_only'])``.
        stats_prefix: leading part of each output file name.
        random_state: forwarded to ``train_test_split``. The default ``None`` keeps
            the previous unseeded split; pass an int for a repeatable one.

    Returns:
        None. One pickle per run is written to the current working directory.
    """
    # The labels are the same for every configuration, so fetch them once.
    y = df.Y

    for i in range(len(grid_dict['pos_only'])):
        num_topics = grid_dict['topics'][i]

        print('pulling the ' +str(num_topics) + ' topic vector')
        start = time.time()
        # pickle can execute arbitrary code on load: only read files this notebook wrote.
        with open('topic_vector' +str(num_topics) + "_topics", 'rb') as handle:
            topic_vector = pickle.load(handle)
        topic_vector = scipy.sparse.csr_matrix(topic_vector.values)
        stop = time.time()
        print(topic_vector.shape)
        print('Succesfully loaded the topic vector it took ' +str(int(stop-start)) + ' seoconds to load')

        # How deep to reach into each of the individual lexicons.
        pos_only = grid_dict['pos_only'][i]
        neg_only = grid_dict['neg_only'][i]
        high_ratio = grid_dict['high_ratio'][i]

        # Compile the lexicon that is passed to the vectorizer.
        start = time.time()
        print('pos, neg, ratio quantities')
        lexicon = slicer(pos_only, neg_only, high_ratio)

        # Vectorize the raw text, restricted to the vocabulary produced above.
        Vectorizer_Agent_mono = TfidfVectorizer(preprocessor=None, stop_words=stopwords, vocabulary=lexicon)
        vectored_no_stops_mono = Vectorizer_Agent_mono.fit_transform(df.text.values)
        stop = time.time()
        print('Succesfully loaded the token vector it took ' +str(int(stop-start)) + ' seoconds to load')

        # Join the token and topic matrices column-wise. This must be the scipy.sparse
        # hstack: numpy's hstack does not handle sparse matrices.
        start = time.time()
        print('Joining vectors')
        X = hstack((vectored_no_stops_mono, topic_vector))
        stop = time.time()
        print('Succesfully joined the vectors it took ' +str(int(stop-start)) + ' seoconds to join')

        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

        # Train / test / score through the shared aggregator.
        print('Training model')
        start = time.time()
        model_stats = model_aggregator(X_train, y_train, X_test, y_test)
        stop = time.time()
        print('Succesfully trained/tested the model it took ' +str(int(stop-start)) + ' seoconds to train/test')

        print('Saving model')
        start = time.time()
        # '_topcis_' is kept exactly as it was so files written earlier keep matching.
        stats_path = (stats_prefix + '_pos_' + str(pos_only) + '_neg_' + str(neg_only)
                      + '_ratio_' + str(high_ratio) + '_topcis_' + str(num_topics) + '.p')
        with open(stats_path, "wb") as handle:
            pickle.dump(model_stats, handle)
        stop = time.time()
        print('Succesfully saved the model results, it took ' +str(int(stop-start)) + ' seoconds to save')



In [137]:
# One row per run: (pos_only, neg_only, high_ratio, topics).
GRID_CONFIGS = [
    (100,  50, 400,  50),
    ( 50, 100, 400,  50),
    (100, 100, 300, 100),
    ( 50, 100, 300, 150),
    ( 50,  50, 300, 200),
    ( 50,  50, 250, 250),
    ( 25,  50, 275, 250),
]
# Unzip the rows into the four parallel lists the grid dictionary is built from.
pos_only_list, neg_only_list, high_ratio_list, topic_list = (
    list(column) for column in zip(*GRID_CONFIGS)
)

In [138]:
# pos_only_list= [50]
# neg_only_list= [100]
# high_ratio_list = [600-160]
# topic_list= [10]

In [139]:
# Bundle the four parallel lists under the keys grid_maker reads.
grid_space = dict(
    pos_only=pos_only_list,
    neg_only=neg_only_list,
    high_ratio=high_ratio_list,
    topics=topic_list,
)

In [140]:
# Run every configuration in grid_space; each run trains, scores and pickles its own
# statistics. Each run spent roughly five minutes in training in the captured output.
grid_maker(grid_space)

pulling the 50 topic vector
(363219, 50)
Succesfully loaded the topic vector it took 1 seoconds to load
pos, neg, ratio quantities
100
50
400
Succesfully loaded the token vector it took 16 seoconds to load
Joining vectors
Succesfully joined the vectors it took 0 seoconds to join
Training model
Succesfully trained/tested the model it took 307 seoconds to train/test
Saving model
Succesfully saved the model results, it took 0 seoconds to save
pulling the 50 topic vector
(363219, 50)
Succesfully loaded the topic vector it took 1 seoconds to load
pos, neg, ratio quantities
50
100
400
Succesfully loaded the token vector it took 16 seoconds to load
Joining vectors
Succesfully joined the vectors it took 0 seoconds to join
Training model
Succesfully trained/tested the model it took 301 seoconds to train/test
Saving model
Succesfully saved the model results, it took 0 seoconds to save
pulling the 100 topic vector
(363219, 100)
Succesfully loaded the topic vector it took 2 seoconds to load
pos, n

In [141]:
# AFTER THE FACT DUMMY MAKER

In [146]:
from sklearn.dummy import DummyClassifier


In [151]:
def classy_dummy(dicto, X_train, y_train, X_test, y_test):
    """Fit a stratified dummy classifier and record its statistics in ``dicto``.

    ``dicto['dummy']`` must already exist; it receives ``'training_time'`` (seconds
    spent in ``fit``), ``'confuse'`` (confusion matrix on the test split) and
    ``'ROC_AUC_Score'`` (from the second column of ``predict_proba``).

    Args:
        dicto: results dictionary, mutated in place.
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels.

    Returns:
        None.
    """
    print('dumbo')
    baseline = DummyClassifier(strategy="stratified")

    # Time the fit on its own.
    fit_started = time.time()
    baseline.fit(X_train, y_train)
    fit_finished = time.time()

    record = dicto['dummy']
    record['training_time'] = fit_finished - fit_started

    # Confusion matrix on the held-out split.
    record['confuse'] = confusion_matrix(y_test, baseline.predict(X_test))

    # predict_proba column 1 is used as the score for ROC AUC.
    record['ROC_AUC_Score'] = roc_auc_score(y_test, baseline.predict_proba(X_test)[:, 1])

In [152]:
def model_aggregator(X_train, y_train, X_test, y_test):
    """Build an empty statistics record for the dummy baseline and have it filled in.

    This definition replaces the gradient-boosting ``model_aggregator`` from earlier
    in the notebook; ``classy_dummy`` writes its results into the record in place.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels.

    Returns:
        dict: ``{'dummy': {'confuse', 'training_time', 'ROC_AUC_Score'}}``.
    """
    model_set = ['dummy']
    # Every model starts from the same blank record.
    model_stats = {
        name: {'confuse': [], 'training_time': 0, 'ROC_AUC_Score': 0}
        for name in model_set
    }
    classy_dummy(model_stats, X_train, y_train, X_test, y_test)
    return model_stats



In [153]:
def grid_maker(grid_dict, stats_prefix='dummy_stats', random_state=None):
    """Score the baseline once per grid configuration, pickling each result.

    This definition replaces the earlier ``grid_maker``; apart from the default
    output prefix it runs the same pipeline. ``grid_dict`` holds four parallel lists
    under the keys ``'pos_only'``, ``'neg_only'``, ``'high_ratio'`` and ``'topics'``;
    position ``i`` of each list describes run ``i``. For every run the function
    loads the pickled topic table for that topic count, builds a TF-IDF matrix
    restricted to the lexicon returned by ``slicer``, stacks the two side by side,
    splits into train/test and passes the split to ``model_aggregator``.

    Relies on the module-level ``df``, ``stopwords``, ``slicer`` and
    ``model_aggregator``.

    Args:
        grid_dict: dict of parallel lists as described above; the number of runs is
            ``len(grid_dict['pos_only'])``.
        stats_prefix: leading part of each output file name.
        random_state: forwarded to ``train_test_split``. The default ``None`` keeps
            the previous unseeded split; pass an int for a repeatable one.

    Returns:
        None. One pickle per run is written to the current working directory.
    """
    # The labels are the same for every configuration, so fetch them once.
    y = df.Y

    for i in range(len(grid_dict['pos_only'])):
        num_topics = grid_dict['topics'][i]

        print('pulling the ' +str(num_topics) + ' topic vector')
        start = time.time()
        # pickle can execute arbitrary code on load: only read files this notebook wrote.
        with open('topic_vector' +str(num_topics) + "_topics", 'rb') as handle:
            topic_vector = pickle.load(handle)
        topic_vector = scipy.sparse.csr_matrix(topic_vector.values)
        stop = time.time()
        print(topic_vector.shape)
        print('Succesfully loaded the topic vector it took ' +str(int(stop-start)) + ' seoconds to load')

        # How deep to reach into each of the individual lexicons.
        pos_only = grid_dict['pos_only'][i]
        neg_only = grid_dict['neg_only'][i]
        high_ratio = grid_dict['high_ratio'][i]

        # Compile the lexicon that is passed to the vectorizer.
        start = time.time()
        print('pos, neg, ratio quantities')
        lexicon = slicer(pos_only, neg_only, high_ratio)

        # Vectorize the raw text, restricted to the vocabulary produced above.
        Vectorizer_Agent_mono = TfidfVectorizer(preprocessor=None, stop_words=stopwords, vocabulary=lexicon)
        vectored_no_stops_mono = Vectorizer_Agent_mono.fit_transform(df.text.values)
        stop = time.time()
        print('Succesfully loaded the token vector it took ' +str(int(stop-start)) + ' seoconds to load')

        # Join the token and topic matrices column-wise. This must be the scipy.sparse
        # hstack: numpy's hstack does not handle sparse matrices.
        start = time.time()
        print('Joining vectors')
        X = hstack((vectored_no_stops_mono, topic_vector))
        stop = time.time()
        print('Succesfully joined the vectors it took ' +str(int(stop-start)) + ' seoconds to join')

        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

        # Train / test / score through the shared aggregator.
        print('Training model')
        start = time.time()
        model_stats = model_aggregator(X_train, y_train, X_test, y_test)
        stop = time.time()
        print('Succesfully trained/tested the model it took ' +str(int(stop-start)) + ' seoconds to train/test')

        print('Saving model')
        start = time.time()
        # '_topcis_' is kept exactly as it was so files written earlier keep matching.
        stats_path = (stats_prefix + '_pos_' + str(pos_only) + '_neg_' + str(neg_only)
                      + '_ratio_' + str(high_ratio) + '_topcis_' + str(num_topics) + '.p')
        with open(stats_path, "wb") as handle:
            pickle.dump(model_stats, handle)
        stop = time.time()
        print('Succesfully saved the model results, it took ' +str(int(stop-start)) + ' seoconds to save')



In [154]:
# Same grid again, this time through the grid_maker / model_aggregator pair redefined
# just above, so every configuration is scored with the dummy baseline.
grid_maker(grid_space)

pulling the 50 topic vector
(363219, 50)
Succesfully loaded the topic vector it took 1 seoconds to load
pos, neg, ratio quantities
100
50
400
Succesfully loaded the token vector it took 16 seoconds to load
Joining vectors
Succesfully joined the vectors it took 0 seoconds to join
Training model
dumbo
Succesfully trained/tested the model it took 0 seoconds to train/test
Saving model
Succesfully saved the model results, it took 0 seoconds to save
pulling the 50 topic vector
(363219, 50)
Succesfully loaded the topic vector it took 1 seoconds to load
pos, neg, ratio quantities
50
100
400
Succesfully loaded the token vector it took 16 seoconds to load
Joining vectors
Succesfully joined the vectors it took 0 seoconds to join
Training model
dumbo
Succesfully trained/tested the model it took 0 seoconds to train/test
Saving model
Succesfully saved the model results, it took 0 seoconds to save
pulling the 100 topic vector
(363219, 100)
Succesfully loaded the topic vector it took 3 seoconds to loa