In [23]:
#Importing packages and modules
import pandas as pd
from pprint import pprint

#NLP
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')
from gensim import corpora, models


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shahrzad/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
data = pd.read_csv('highest_cleaned')

In [4]:
data_text = data[['content']]
data_text['index'] = data_text.index
documents = data_text
documents.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,content,index
0,I understand sense we do these notes it hel...,0
1,"Over the course of the six weeks, I was consta...",1
2,I feel like I made my best improvements in exp...,2
3,*The knowledge checks and quizzes prepared me...,3
4,The study activity that I found the most helpf...,4


# NLP preprocessing # 

This section includes all the necessary NLP preprocessing works before topic modeling

In [5]:
def lemmatize_stemming(text):
    '''
    Create a new instance of a language specific subclass; here English
    '''
    stemmer = SnowballStemmer('english')
    #Lemmatize Single Word with the POS tag as VERB
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))


def preprocess(text):
    '''
    gensim preprocess which processed documents split by whitespace, removes stopwords, 
    strips numbers and multipel whitrspace, and short forms.
    tokens with less than 3 characters are removed.
    input: a list of tokens
    output: a list of tokens
    '''
    
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [12]:
#tpreview a documnet with index 352: 
doc_sample = documents[documents['index'] == 352].values[0][0]
doc_sample

"I think I did okay. I was hoping for an A in the class but it is really hard. I really want to try to be more orginized next 6 weeks. Doing my notes online was really helpful to me. My favorite stratagy was working with Ameila. I feel like she is a great influence on me and is really smart. I also think that she needed me. I am really happy to have helped her. I really like the way your classroom is laid out. It's really cozy and comfertable. The only thing I would recomend is at the begining of the class when we review the try it out, not spending so much time one it. I feel like sometimes I am rushed and It could help me more if we had a little more time. I recomend talking tothe people who didn't understand or want to have extra practice in a small group at the beggining of class. This is just my opinion but I think It would help me be more orinized and less overwhelmed."

In [13]:
#preview the result of preprocessing on documnet 352 
doc_sample = documents[documents['index'] == 352].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['I', 'think', 'I', 'did', 'okay.', 'I', 'was', 'hoping', 'for', 'an', 'A', 'in', 'the', 'class', 'but', 'it', 'is', 'really', 'hard.', 'I', 'really', 'want', 'to', 'try', 'to', 'be', 'more', 'orginized', 'next', '6', 'weeks.', 'Doing', 'my', 'notes', 'online', 'was', 'really', 'helpful', 'to', 'me.', 'My', 'favorite', 'stratagy', 'was', 'working', 'with', 'Ameila.', 'I', 'feel', 'like', 'she', 'is', 'a', 'great', 'influence', 'on', 'me', 'and', 'is', 'really', 'smart.', 'I', 'also', 'think', 'that', 'she', 'needed', 'me.', 'I', 'am', 'really', 'happy', 'to', 'have', 'helped', 'her.', 'I', 'really', 'like', 'the', 'way', 'your', 'classroom', 'is', 'laid', 'out.', "It's", 'really', 'cozy', 'and', 'comfertable.', 'The', 'only', 'thing', 'I', 'would', 'recomend', 'is', 'at', 'the', 'begining', 'of', 'the', 'class', 'when', 'we', 'review', 'the', 'try', 'it', 'out,', 'not', 'spending', 'so', 'much', 'time', 'one', 'it.', 'I', 'feel', 'like', 'sometimes', 'I', 'am', 'rus

In [14]:
#Preprocess function is mapped to a column of the dataframe
processed_docs = documents['content'].map(preprocess)
processed_docs[:5]

0    [understand, sens, note, help, understand, cla...
1    [cours, week, constant, pretti, similar, studi...
2    [feel, like, best, improv, expand, idea, write...
3    [knowledg, check, quiz, prepar, exam, help, le...
4    [studi, activ, help, review, knowledg, check, ...
Name: content, dtype: object

In [15]:
#Create a dictionary from ‘processed_docs’ containing the number of times 
#a word appears in the training set (bag of words)

dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 actual
1 answer
2 ask
3 busi
4 call
5 card
6 choic
7 choos
8 class
9 collin
10 come


In [16]:
#Filter out tokens that appear in
#less than 15 documents (absolute number) or
#more than 0.5 documents (fraction of total corpus size, not absolute number)
#after the above two steps, keep only the first 100000 most frequent tokens

dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [19]:
#For each document we create a dictionary reporting how many
#words and how many times those words appear
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[352]

[(8, 3),
 (19, 2),
 (29, 1),
 (33, 4),
 (42, 3),
 (56, 1),
 (59, 1),
 (63, 1),
 (76, 1),
 (81, 1),
 (84, 3),
 (85, 2),
 (87, 1),
 (93, 2),
 (96, 1),
 (97, 1),
 (137, 1),
 (144, 1),
 (146, 1),
 (161, 1),
 (167, 1),
 (177, 1),
 (189, 1),
 (211, 1),
 (218, 1),
 (275, 1),
 (291, 1),
 (304, 1),
 (316, 1),
 (493, 1),
 (524, 1),
 (536, 1),
 (548, 1),
 (677, 1),
 (866, 1),
 (904, 1),
 (1143, 1)]

In [22]:
#Preview Bag Of Words for our sample preprocessed document
bow_doc_352 = bow_corpus[352]
for i in range(len(bow_doc_352)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_352[i][0], 
                                               dictionary[bow_doc_352[i][0]], 
bow_doc_352[i][1]))

Word 8 ("class") appears 3 time.
Word 19 ("feel") appears 2 time.
Word 29 ("group") appears 1 time.
Word 33 ("help") appears 4 time.
Word 42 ("like") appears 3 time.
Word 56 ("note") appears 1 time.
Word 59 ("peopl") appears 1 time.
Word 63 ("practic") appears 1 time.
Word 76 ("spend") appears 1 time.
Word 81 ("talk") appears 1 time.
Word 84 ("think") appears 3 time.
Word 85 ("time") appears 2 time.
Word 87 ("understand") appears 1 time.
Word 93 ("want") appears 2 time.
Word 96 ("week") appears 1 time.
Word 97 ("work") appears 1 time.
Word 137 ("littl") appears 1 time.
Word 144 ("need") appears 1 time.
Word 146 ("onlin") appears 1 time.
Word 161 ("review") appears 1 time.
Word 167 ("small") appears 1 time.
Word 177 ("thing") appears 1 time.
Word 189 ("great") appears 1 time.
Word 211 ("extra") appears 1 time.
Word 218 ("influenc") appears 1 time.
Word 275 ("hard") appears 1 time.
Word 291 ("begin") appears 1 time.
Word 304 ("rush") appears 1 time.
Word 316 ("classroom") appears 1 time.

In [24]:
#tf-idf model object using models.TfidfModel on ‘bow_corpus’ saved as 'tfidf'
tfidf = models.TfidfModel(bow_corpus)

#transformation applied to the entire corpus saved as ‘corpus_tfidf’
corpus_tfidf = tfidf[bow_corpus]

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.06065422049025359),
 (1, 0.21883212448910405),
 (2, 0.09635189417783413),
 (3, 0.08598607733885608),
 (4, 0.09931369990026244),
 (5, 0.08190961482996025),
 (6, 0.15546786835632248),
 (7, 0.0695104604240546),
 (8, 0.05288944970184506),
 (9, 0.09120727674329104),
 (10, 0.1059850647054033),
 (11, 0.15000131420669585),
 (12, 0.06838890421805661),
 (13, 0.053677668314918936),
 (14, 0.08484351983710177),
 (15, 0.06973494467616116),
 (16, 0.06565848216726533),
 (17, 0.13160225574484968),
 (18, 0.08326260999532088),
 (19, 0.0371420609226974),
 (20, 0.03661156193330024),
 (21, 0.12692720990192663),
 (22, 0.07709330701702832),
 (23, 0.1726889341390777),
 (24, 0.06234736294038206),
 (25, 0.09086395665796909),
 (26, 0.02913671139644564),
 (27, 0.052212184720177225),
 (28, 0.2055272171213383),
 (29, 0.12594366907368168),
 (30, 0.07581570153396927),
 (31, 0.07487113091356741),
 (32, 0.09220188217796893),
 (33, 0.10072332783349318),
 (34, 0.05015584039473292),
 (35, 0.06060147968070396),
 (36,

# Running LDA using Bag of Words

In [25]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=6, id2word=dictionary, passes=2, workers=2)

In [26]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.034*"studi" + 0.029*"help" + 0.029*"note" + 0.028*"time" + 0.025*"test" + 0.022*"question" + 0.021*"understand" + 0.020*"think" + 0.017*"quiz" + 0.015*"answer"
Topic: 1 
Words: 0.058*"work" + 0.042*"grade" + 0.032*"time" + 0.021*"strategi" + 0.020*"question" + 0.020*"studi" + 0.019*"class" + 0.018*"check" + 0.018*"help" + 0.017*"best"
Topic: 2 
Words: 0.051*"goal" + 0.028*"work" + 0.022*"grade" + 0.021*"class" + 0.017*"time" + 0.015*"math" + 0.015*"better" + 0.014*"week" + 0.014*"homework" + 0.014*"finish"
Topic: 3 
Words: 0.047*"week" + 0.047*"work" + 0.035*"class" + 0.033*"grade" + 0.030*"goal" + 0.026*"need" + 0.022*"time" + 0.020*"go" + 0.019*"finish" + 0.016*"want"
Topic: 4 
Words: 0.036*"read" + 0.018*"think" + 0.016*"book" + 0.014*"go" + 0.013*"time" + 0.012*"number" + 0.012*"error" + 0.011*"like" + 0.011*"differ" + 0.010*"know"
Topic: 5 
Words: 0.063*"read" + 0.032*"write" + 0.031*"time" + 0.027*"question" + 0.027*"work" + 0.025*"answer" + 0.021*"strategi" + 


# Running LDA using TF-IDF

In [27]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=6, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.030*"read" + 0.014*"time" + 0.014*"question" + 0.011*"answer" + 0.011*"strategi" + 0.011*"work" + 0.009*"understand" + 0.009*"passag" + 0.009*"week" + 0.009*"help"
Topic: 1 Word: 0.018*"write" + 0.009*"work" + 0.009*"think" + 0.008*"time" + 0.008*"read" + 0.008*"need" + 0.008*"thing" + 0.008*"studi" + 0.007*"note" + 0.007*"help"
Topic: 2 Word: 0.016*"goal" + 0.016*"grade" + 0.013*"week" + 0.012*"test" + 0.012*"class" + 0.012*"work" + 0.009*"homework" + 0.008*"time" + 0.008*"complet" + 0.008*"need"
Topic: 3 Word: 0.020*"grade" + 0.013*"want" + 0.013*"class" + 0.011*"work" + 0.010*"goal" + 0.010*"week" + 0.010*"improv" + 0.009*"studi" + 0.009*"go" + 0.008*"need"
Topic: 4 Word: 0.017*"goal" + 0.014*"week" + 0.012*"work" + 0.012*"grade" + 0.011*"math" + 0.010*"finish" + 0.010*"class" + 0.009*"note" + 0.009*"help" + 0.009*"need"
Topic: 5 Word: 0.015*"question" + 0.014*"answer" + 0.011*"quiz" + 0.010*"help" + 0.010*"check" + 0.010*"time" + 0.010*"note" + 0.010*"studi" + 0.00

# Performance evaluation by classifying sample document 
# using LDA Bag of Words model

In [28]:
processed_docs[352]

['think',
 'okay',
 'hop',
 'class',
 'hard',
 'want',
 'orgin',
 'week',
 'note',
 'onlin',
 'help',
 'favorit',
 'stratagi',
 'work',
 'ameila',
 'feel',
 'like',
 'great',
 'influenc',
 'smart',
 'think',
 'need',
 'happi',
 'help',
 'like',
 'classroom',
 'lay',
 'cozi',
 'comfert',
 'thing',
 'recomend',
 'begin',
 'class',
 'review',
 'spend',
 'time',
 'feel',
 'like',
 'rush',
 'help',
 'littl',
 'time',
 'recomend',
 'talk',
 'toth',
 'peopl',
 'understand',
 'want',
 'extra',
 'practic',
 'small',
 'group',
 'beggin',
 'class',
 'opinion',
 'think',
 'help',
 'orin',
 'overwhelm']

In [29]:
for index, score in sorted(lda_model[bow_corpus[352]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.6424310207366943	 
Topic: 0.047*"week" + 0.047*"work" + 0.035*"class" + 0.033*"grade" + 0.030*"goal" + 0.026*"need" + 0.022*"time" + 0.020*"go" + 0.019*"finish" + 0.016*"want"

Score: 0.34409475326538086	 
Topic: 0.034*"studi" + 0.029*"help" + 0.029*"note" + 0.028*"time" + 0.025*"test" + 0.022*"question" + 0.021*"understand" + 0.020*"think" + 0.017*"quiz" + 0.015*"answer"


In [30]:
documents.content[352]

"I think I did okay. I was hoping for an A in the class but it is really hard. I really want to try to be more orginized next 6 weeks. Doing my notes online was really helpful to me. My favorite stratagy was working with Ameila. I feel like she is a great influence on me and is really smart. I also think that she needed me. I am really happy to have helped her. I really like the way your classroom is laid out. It's really cozy and comfertable. The only thing I would recomend is at the begining of the class when we review the try it out, not spending so much time one it. I feel like sometimes I am rushed and It could help me more if we had a little more time. I recomend talking tothe people who didn't understand or want to have extra practice in a small group at the beggining of class. This is just my opinion but I think It would help me be more orinized and less overwhelmed."

# Performance evaluation by classifying sample document 
# using LDA TF-IDF model.

In [31]:
for index, score in sorted(lda_model_tfidf[bow_corpus[352]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.5224055647850037	 
Topic: 0.020*"grade" + 0.013*"want" + 0.013*"class" + 0.011*"work" + 0.010*"goal" + 0.010*"week" + 0.010*"improv" + 0.009*"studi" + 0.009*"go" + 0.008*"need"

Score: 0.46277114748954773	 
Topic: 0.030*"read" + 0.014*"time" + 0.014*"question" + 0.011*"answer" + 0.011*"strategi" + 0.011*"work" + 0.009*"understand" + 0.009*"passag" + 0.009*"week" + 0.009*"help"


In [32]:
unseen_document = 'The strategies that worked best was putting the information onto flash cards and learning different points for different key words.'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.533930778503418	 Topic: 0.051*"goal" + 0.028*"work" + 0.022*"grade" + 0.021*"class" + 0.017*"time"
Score: 0.3851521611213684	 Topic: 0.036*"read" + 0.018*"think" + 0.016*"book" + 0.014*"go" + 0.013*"time"
Score: 0.04204772040247917	 Topic: 0.063*"read" + 0.032*"write" + 0.031*"time" + 0.027*"question" + 0.027*"work"
Score: 0.012975064106285572	 Topic: 0.058*"work" + 0.042*"grade" + 0.032*"time" + 0.021*"strategi" + 0.020*"question"
Score: 0.012970726937055588	 Topic: 0.034*"studi" + 0.029*"help" + 0.029*"note" + 0.028*"time" + 0.025*"test"
Score: 0.0129235303029418	 Topic: 0.047*"week" + 0.047*"work" + 0.035*"class" + 0.033*"grade" + 0.030*"goal"


In [33]:
import pyLDAvis.gensim as gensimvis
import pyLDAvis

In [34]:
vis_data = gensimvis.prepare(lda_model, bow_corpus, dictionary)
pyLDAvis.display(vis_data)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [35]:
vis_data = gensimvis.prepare(lda_model_tfidf, corpus_tfidf, dictionary)
pyLDAvis.display(vis_data)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
