In [22]:
# Importing modules

import pandas as pd
import numpy as np
import time
import re
from pprint import pprint
import joblib

#NLP
import sklearn
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import guidedlda

#plotting
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
import matplotlib.pyplot as plt
%matplotlib inline
#import seaborn as sns


# Preprocessing

In [4]:
df = pd.read_csv('highest_cleaned').drop(['Unnamed: 0'],axis=1)
df.head()

Unnamed: 0,content,top_1_score,top_2_score,Corrected_content,word_counts,lang,average_score
0,I understand sense we do these notes it hel...,4,3,I understand sense we do these notes it helps ...,634,en,3.5
1,"Over the course of the six weeks, I was consta...",4,3,"Over the course of the six weeks, I was consta...",566,en,3.5
2,I feel like I made my best improvements in exp...,3,4,I feel like I made my best improvements in exp...,407,en,3.5
3,*The knowledge checks and quizzes prepared me...,4,3,*The knowledge checks and quizzes prepared me ...,403,en,3.5
4,The study activity that I found the most helpf...,3,4,The study activity that I found the most helpf...,397,en,3.5


In [5]:
def get_stopwords():
    stop_words = stopwords.words('english')
    stop_words.extend(['mr', 'mrs', 'miss', 'ms', 'ahh', 'ah', 'want', 'feel', 'want', 'goal', 'ela', 'go', 'get', 'like','grade', 'use', 'make', 
                  'next', 'well', 'lea', 'also', 'thing', 'one', 'try', 'end', 'turn', 'work', 'math', 'try', 'sol', 'science','week', 'would',
                 'class', 'need', 'exit', 'ticket', 'sure', 'strategy', 'exit','grade', 'good', 'best', 'able', 'lot', 'think', 'help',
                'could', 'really', 'improve', 'time'])
    return stop_words


In [12]:
def get_wordnet_pos(word):
    '''
    tags parts of speech to tokens
    Expects a string and outputs the string and its part of speech
    '''
    
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)


def word_lemmatizer(text):
    '''
    lemamtizes the tokens based on their part of speech
    Expects a lits of tokens and outputs a list of lemmatized tokens
    '''
    
    lemmatizer = WordNetLemmatizer()
    text = lemmatizer.lemmatize(text, get_wordnet_pos(text))
    return text


def reflection_tokenizer(text):
    '''
    Tokenizes a list of string, expects a list of strings and outputs a list of strings.
    before tokenizing:
    1)removes the non-alphanumeric charcaters like emojies
    2)removes the numbers
    3)lower cases the words
    4)tokenizes the sentences
    5)lemmatizes teh tokens
    6)removes the tokens in stop words list
     '''

    text=re.sub(r'[\W_]+', ' ', text) #keeps just alphnumeric characters
    text=re.sub(r'\d+', '', text) #removes numbers
    text = text.lower()
    tokens = [word for word in word_tokenize(text)]
    tokens = [word for word in tokens if len(word) >= 3]#removes smaller than 3 character
    tokens = [word_lemmatizer(w) for w in tokens]
    tokens = [s for s in tokens if s not in get_stopwords()]
    return tokens

In [13]:
df['lemmatize_token'] = df.Corrected_content.apply(reflection_tokenizer)

In [14]:
df.head()

Unnamed: 0,content,top_1_score,top_2_score,Corrected_content,word_counts,lang,average_score,lemmatize_token
0,I understand sense we do these notes it hel...,4,3,I understand sense we do these notes it helps ...,634,en,3.5,"[understand, sense, note, understand, detailed..."
1,"Over the course of the six weeks, I was consta...",4,3,"Over the course of the six weeks, I was consta...",566,en,3.5,"[course, six, constantly, pretty, similar, stu..."
2,I feel like I made my best improvements in exp...,3,4,I feel like I made my best improvements in exp...,407,en,3.5,"[improvement, expand, idea, write, regularly, ..."
3,*The knowledge checks and quizzes prepared me...,4,3,*The knowledge checks and quizzes prepared me ...,403,en,3.5,"[knowledge, check, quiz, prepared, exam, learn..."
4,The study activity that I found the most helpf...,3,4,The study activity that I found the most helpf...,397,en,3.5,"[study, activity, found, helpful, review, know..."


In [27]:
df.to_csv('data_lemmatized')

In [17]:
#Convert a collection of text documents to a matrix of token counts, matrix of documents and tokens
token_vectorizer = CountVectorizer(tokenizer = reflection_tokenizer, min_df=10, stop_words=get_stopwords(), ngram_range=(1, 4))                        

In [18]:
#function maps the column of the dataframe to a matrix of documents in the rows and token counts as columns,
# this is bag of words representation of the documents
X_ngrams = token_vectorizer.fit_transform(df.Corrected_content) 

  'stop_words.' % sorted(inconsistent))


In [19]:
type(X_ngrams)

scipy.sparse.csr.csr_matrix

In [21]:
X_ngrams.shape

(23150, 7317)

# Guided LDA

In [28]:
#list of terms (words or n_grams of words)
tf_feature_names = token_vectorizer.get_feature_names()

In [30]:
#len(tf_feature_names)

In [37]:
#dictionary of words frequency
word2id = dict((v, idx) for idx, v in enumerate(tf_feature_names))

In [38]:
#len(word2id)

In [40]:
#keywrods for seeded topics that I want GuidedLDA model to converge to 
seed_topic_list_6 = [['take', 'note', 'compare', 'classmate', 'highlight', 'underline', 'jot', 'write', 'topic', 'main', 'complete', 'point', 'copy', 'slide'],
                   ['read', 'study','review', 'skim', 'textbook', 'compare', 'note','connect', 'sketch', 'summarize', 'relationship', 'map', 'concept', 'diagram', 'chart'],
                   ['question', 'essay','assignment', 'exam', 'test', 'quiz', 'answer', 'practice', 'review', 'repeat', 'strength', 'weak', 'solve', 'problem', 'identify'],
                   ['plan', 'calendar', 'task', 'list', 'manage', 'procrastinate', 'due','stress', 'manage', 'anxiety', 'express', 'break', 'sleep', 'nap', 'eat', 'exercise'],
                   ['group','partner', 'classmate', 'brainstorm', 'ask', 'answer', 'verify', 'peer', 'teach', 'clarify'],
                   ['ask','aid', 'resource', 'teacher', 'tutor', 'peer', 'verify', 'explain', 'clear', 'talk']]


In [93]:
#Instantiate the guidedlda as model with parameters like number of topics 
model = guidedlda.GuidedLDA(n_topics=6, n_iter=100, random_state=7, refresh=10)
#seed_topics is the dictionary {word_id to topic_id}
seed_topics = {}
for t_id, st in enumerate(seed_topic_list_6):
    for word in st:
        seed_topics[word2id[word]] = t_id

#build the model on the dataset with 6 topics
#tseed_confidence: how much extra boost should be given to a term between 0 to 1
model.fit(X_ngrams, seed_topics=seed_topics, seed_confidence=0.15)

INFO:guidedlda:n_documents: 23150
INFO:guidedlda:vocab_size: 7317
INFO:guidedlda:n_words: 494675
INFO:guidedlda:n_topics: 6
INFO:guidedlda:n_iter: 100
  if sparse and not np.issubdtype(doc_word.dtype, int):
INFO:guidedlda:<0> log likelihood: -5252196
INFO:guidedlda:<10> log likelihood: -3942594
INFO:guidedlda:<20> log likelihood: -3800728
INFO:guidedlda:<30> log likelihood: -3749330
INFO:guidedlda:<40> log likelihood: -3721371
INFO:guidedlda:<50> log likelihood: -3702690
INFO:guidedlda:<60> log likelihood: -3689292
INFO:guidedlda:<70> log likelihood: -3678675
INFO:guidedlda:<80> log likelihood: -3671903
INFO:guidedlda:<90> log likelihood: -3666396
INFO:guidedlda:<99> log likelihood: -3663414


<guidedlda.guidedlda.GuidedLDA at 0x1352556a0>

In [44]:

joblib.dump(model, "Guided_LDA_6topics.pkl")

['Guided_LDA_6topics.pkl']

In [94]:
n_top_words = 15
topic_word = model.topic_word_
for i, topic_dist in enumerate(topic_word):
     topic_words = np.array(tf_feature_names)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
     print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: write book read reading idea story essay paragraph word people main add point evidence start
Topic 1: read study note test question take quiz reading answer look understand book word know score
Topic 2: question quiz study check understand note answer problem knowledge ask mistake practice take knowledge check look
Topic 3: finish homework assignment complete day school reflection study start project something keep last home win
Topic 4: answer question understand learn find know problem way lab look write easy see number put
Topic 5: ask finish teacher talk school attention stay know focus pay extra pay attention homework keep test


In [None]:
dics = {topic 3:'finish homework and complete assignment',
topic 2:'check past quizzes and questions and understand answers',
topic 5:'talking and asking teacher and pay attention',
topic 1:'read, study notes and books',
topic 4:'answering questions and understand and learn the problems',
topic 0:'write story, essay and books'}

In [95]:
doc_topic = model.transform(X_ngrams)
print(doc_topic)


  if sparse and not np.issubdtype(doc_word.dtype, int):


[[4.10817190e-03 3.62491317e-02 6.12804332e-01 6.97971179e-03
  2.98556991e-01 4.13016617e-02]
 [3.19106859e-04 7.85588105e-02 7.02524670e-01 1.56446752e-02
  2.02307714e-01 6.45023218e-04]
 [9.66845930e-01 7.31613319e-05 4.14169033e-05 3.28841281e-02
  4.11003918e-05 1.14263566e-04]
 ...
 [6.45386207e-01 7.97752399e-03 1.73387809e-02 2.28949904e-03
  2.63954147e-03 3.24368448e-01]
 [9.37838023e-01 5.85436764e-02 2.31894839e-04 1.56189903e-03
  1.59037317e-03 2.34133890e-04]
 [3.97658885e-03 7.73764124e-01 2.20293856e-01 3.62350147e-04
  7.01562871e-04 9.01518172e-04]]


In [97]:
doc_topic[1]

array([3.19106859e-04, 7.85588105e-02, 7.02524670e-01, 1.56446752e-02,
       2.02307714e-01, 6.45023218e-04])

In [98]:
columns_label = ['topic {}'.format(i) for i in range(6)]  # number of topics
topic_vector = pd.DataFrame(doc_topic, columns = columns_label)#dataframe of doc-topics
topic_vector.round(2).head(10)

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5
0,0.0,0.04,0.61,0.01,0.3,0.04
1,0.0,0.08,0.7,0.02,0.2,0.0
2,0.97,0.0,0.0,0.03,0.0,0.0
3,0.05,0.09,0.79,0.0,0.04,0.02
4,0.0,0.01,0.91,0.0,0.08,0.0
5,0.0,0.06,0.92,0.0,0.02,0.0
6,0.25,0.0,0.0,0.0,0.59,0.16
7,0.06,0.0,0.54,0.16,0.24,0.0
8,0.28,0.0,0.0,0.72,0.0,0.0
9,0.0,0.0,0.13,0.83,0.02,0.03


In [119]:
def topic_threshold(doc_topic, topic_vector):
    
    """
    Return the topic number if the probablity of a topic being in a document is more than value
    """
    
    topic_num_list = []
    for i in range(len(topic_vector)):
        topic_num = [idx for idx, value in enumerate(doc_topic[i]) if value >= 0.25]
        if topic_num != []:
            topic_num = topic_num
        else:
            topic_num = 'None'
        topic_num_list.append(topic_num)
    return topic_num_list


In [120]:
num_topic=topic_threshold(doc_topic, topic_vector)
#print(num_topic)

In [122]:
df_doc_topic = pd.DataFrame({'topics': num_topic, 'reflection': df.Corrected_content})

In [138]:
#df_doc_topic.loc[:50]
#df_doc_topic.reflection[43]
#df_doc_topic.topics[43]

In [135]:
panel = pyLDAvis.sklearn.prepare(model, X_ngrams, token_vectorizer, mds='tsne')
panel

  if sparse and not np.issubdtype(doc_word.dtype, int):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [143]:
pyLDAvis.save_html(panel, 'GuidedLDA_6topics')

In [None]:
pyLDAvis.show(panel)


Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8889/    [Ctrl-C to exit]


127.0.0.1 - - [14/Oct/2019 20:48:50] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [14/Oct/2019 20:48:50] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [14/Oct/2019 20:48:50] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [14/Oct/2019 20:48:50] "GET /LDAvis.js HTTP/1.1" 200 -


In [1]:
import os

In [2]:
os.getcwd()

'/Users/shahrzad/Desktop/Insight_Project_SHV/notebook'

In [3]:
!ls

Data_cleaning_and_splitting.ipynb
Gensim LDA_BOW_TFIDF_with visualization.ipynb
Guided LDA_6topics-4-grams.ipynb
LDA_GridSearch.ipynb
POS_tagger.ipynb
basic_EDA.ipynb
