In [187]:
# Importing modules

import pandas as pd
from pandas.api.types import is_string_dtype
import re
import sklearn
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim import corpora
from nltk.corpus import wordnet
import numpy as np
from gensim.models import TfidfModel
from gensim import similarities
import matplotlib.pyplot as plt
from scipy.cluster import hierarchy
from nltk.stem import WordNetLemmatizer
import seaborn as sns
import time

# Preprocessing

In [188]:
import nltk
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd

In [189]:
df = pd.read_csv('highest_cleaned').drop(['Unnamed: 0'],axis=1)
df.head()

Unnamed: 0,content,top_1_score,top_2_score,Corrected_content,word_counts,average_score
0,I understand sense we do these notes it hel...,4,3,I understand sense we do these notes it helps ...,634,3.5
1,"Over the course of the six weeks, I was consta...",4,3,"Over the course of the six weeks, I was consta...",566,3.5
2,I feel like I made my best improvements in exp...,3,4,I feel like I made my best improvements in exp...,407,3.5
3,*The knowledge checks and quizzes prepared me...,4,3,*The knowledge checks and quizzes prepared me ...,403,3.5
4,The study activity that I found the most helpf...,3,4,The study activity that I found the most helpf...,397,3.5


In [190]:
stop_words = stopwords.words('english')

stop_words.extend(['mr', 'mrs', 'miss', 'ms', 'ahh', 'ah', 'want', 'feel', 'want', 'goal', 'ela', 'go', 'get', 'like'])

In [191]:
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [193]:
def get_wordnet_pos(word):
    '''tags parts of speech to tokens
    Expects an string and outputs the strng and its part of speech'''
    
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def word_lemmatizer(text):
    '''lemamtizes the tokens based on their part of speech'''
    
    lemmatizer = WordNetLemmatizer()
    text = lemmatizer.lemmatize(text, get_wordnet_pos(text))
    return text
def reflection_tokenizer(text):
    '''expects a string '''
    text=re.sub(r'[\W_]+', ' ', text) #keeps just alphnumeric characters
    text=re.sub(r'\d+', '', text) #removes numbers
    text = text.lower()
    tokens = [word for word in word_tokenize(text)]
    tokens = [word for word in tokens if len(word) >= 3]#removes smaller than 3 character
    tokens = [word_lemmatizer(w) for w in tokens]
    tokens = [s for s in tokens if s not in stop_words]
    return tokens

In [194]:
df['lemmatize_token'] = df.Corrected_content.apply(reflection_tokenizer)

In [195]:
df.head()

Unnamed: 0,content,top_1_score,top_2_score,Corrected_content,word_counts,average_score,lemmatize_token
0,I understand sense we do these notes it hel...,4,3,I understand sense we do these notes it helps ...,634,3.5,"[understand, sense, note, help, understand, cl..."
1,"Over the course of the six weeks, I was consta...",4,3,"Over the course of the six weeks, I was consta...",566,3.5,"[course, six, week, constantly, use, pretty, s..."
2,I feel like I made my best improvements in exp...,3,4,I feel like I made my best improvements in exp...,407,3.5,"[make, best, improvement, expand, idea, write,..."
3,*The knowledge checks and quizzes prepared me...,4,3,*The knowledge checks and quizzes prepared me ...,403,3.5,"[knowledge, check, quiz, prepared, exam, help,..."
4,The study activity that I found the most helpf...,3,4,The study activity that I found the most helpf...,397,3.5,"[study, activity, found, helpful, review, know..."


In [196]:
df.to_csv('data_lemmatized')

In [197]:
token_vectorizer = CountVectorizer(tokenizer = reflection_tokenizer, min_df=10)


In [198]:
X = token_vectorizer.fit_transform(df.Corrected_content) # bag of words

In [199]:
type(X)

scipy.sparse.csr.csr_matrix

In [200]:
X.shape

(23150, 2267)

# Guided LDA

In [201]:
import guidedlda

In [202]:
tf_feature_names = token_vectorizer.get_feature_names()

In [203]:

word2id = dict((v, idx) for idx, v in enumerate(tf_feature_names))

In [204]:
seed_topic_list = [['attend', 'class', 'tutorial', 'lecture', 'participate', 'engage', 'discussion', 'focus', 'ask', 'question', 'concentrate'],
                   ['take', 'note', 'compare', 'classmate', 'highlight', 'underline', 'jot', 'write', 'topic', 'main', 'complete', 'point', 'copy', 'slide'],
                   ['read', 'study','review', 'skim', 'textbook', 'compare', 'note','connect', 'sketch', 'summarize', 'relationship', 'map', 'concept', 'diagram', 'chart'],
                   ['question', 'essay','assignment', 'exam', 'test', 'quiz', 'answer', 'practice', 'review', 'repeat', 'strength', 'weak', 'solve', 'problem', 'identify'],
                   ['stress', 'manage', 'anxiety', 'express', 'break', 'sleep', 'nap', 'eat', 'exercise'],
                   ['plan', 'calendar', 'task', 'list', 'manage', 'time', 'procrastinate', 'due','daily', 'weekly', 'track','schedule', 'date'],
                   ['group','partner', 'classmate', 'brainstorm', 'ask', 'answer', 'verify', 'peer', 'teach', 'clarify'],
                   ['ask', 'help','aid', 'resource', 'flashcard', 'tutor', 'peer', 'verify', 'explain', 'clear']]


In [205]:
seed_topic_headings = ['attending class/engaging', 'note-taking', 'read/study before class', 'prepare for exam','stress management',
                      'time management', 'group study', 'learning aid', 'customize learning strategy', 'study location']

In [206]:

model = guidedlda.GuidedLDA(n_topics=8, n_iter=100, random_state=7, refresh=10)
seed_topics = {}
for t_id, st in enumerate(seed_topic_list):
    for word in st:
        seed_topics[word2id[word]] = t_id
model.fit(X, seed_topics=seed_topics, seed_confidence=0.15)

INFO:guidedlda:n_documents: 23150
INFO:guidedlda:vocab_size: 2267
INFO:guidedlda:n_words: 525601
INFO:guidedlda:n_topics: 8
INFO:guidedlda:n_iter: 100
  if sparse and not np.issubdtype(doc_word.dtype, int):
INFO:guidedlda:<0> log likelihood: -5081267
INFO:guidedlda:<10> log likelihood: -3539983
INFO:guidedlda:<20> log likelihood: -3415888
INFO:guidedlda:<30> log likelihood: -3374440
INFO:guidedlda:<40> log likelihood: -3352283
INFO:guidedlda:<50> log likelihood: -3334880
INFO:guidedlda:<60> log likelihood: -3321165
INFO:guidedlda:<70> log likelihood: -3311915
INFO:guidedlda:<80> log likelihood: -3303709
INFO:guidedlda:<90> log likelihood: -3298061
INFO:guidedlda:<99> log likelihood: -3292361


<guidedlda.guidedlda.GuidedLDA at 0x13278c4a8>

In [207]:
import joblib
joblib.dump(model, "Guided_LDA_10topics.pkl")

['Guided_LDA_10topics.pkl']

In [208]:
n_top_words = 10
topic_word = model.topic_word_
for i, topic_dist in enumerate(topic_word):
     topic_words = np.array(tf_feature_names)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
     print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: grade class work week try time well need good next
Topic 1: write work finish need reflection use make complete sol next
Topic 2: time read study note question use strategy well make take
Topic 3: question study time help make quiz well understand check answer
Topic 4: week work grade time class well try good need make
Topic 5: work week time well class thing need make really able
Topic 6: make answer question work also strategy think use time read
Topic 7: work grade ask try help class strategy math finish also


In [209]:
doc_topic = model.transform(X)
print(doc_topic)
# for i in range(23150):
#      print("top topic: {} Document: {}".format(doc_topic[i].argmax(),
#                                                   ', '.join(np.array(tf_feature_names)[list(reversed(X[i,:].argsort()))[0:8]])))

  if sparse and not np.issubdtype(doc_word.dtype, int):


[[1.11056648e-02 1.49442326e-04 3.71933036e-02 ... 4.51222119e-02
  2.22887218e-01 4.03580040e-02]
 [6.66402633e-04 1.74624407e-03 4.34701689e-02 ... 2.09869887e-01
  2.63159806e-02 1.50865167e-04]
 [4.08143075e-04 6.35168919e-01 1.04933267e-01 ... 1.32451245e-02
  2.42372636e-01 1.68792852e-03]
 ...
 [2.80024964e-03 7.29189957e-01 1.46335136e-01 ... 1.49491961e-03
  9.33225503e-02 3.17874453e-03]
 [9.43420344e-04 7.91713675e-03 5.60662715e-02 ... 6.20880291e-03
  8.99658901e-01 3.51480353e-04]
 [1.74111287e-03 1.16471252e-03 9.81117409e-01 ... 1.92806897e-03
  4.81025515e-03 7.38801206e-04]]


In [210]:
columns12 = ['topic {}'.format(i) for i in range(8)]  # number of topics
topic_vector = pd.DataFrame(doc_topic, columns = columns12)
topic_vector.round(2).head()

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7
0,0.01,0.0,0.04,0.58,0.06,0.05,0.22,0.04
1,0.0,0.0,0.04,0.68,0.04,0.21,0.03,0.0
2,0.0,0.64,0.1,0.0,0.0,0.01,0.24,0.0
3,0.01,0.0,0.05,0.81,0.01,0.0,0.12,0.01
4,0.0,0.0,0.04,0.84,0.0,0.09,0.02,0.0


In [211]:
len(X.toarray())

23150

In [212]:
def topic_threshold(doc_topic, topic_vector):
    """Return the topic number if the topic is more than 15% dominant"""
    topic_num_list = []
    for i in range(len(topic_vector)):
        topic_num = [idx for idx, value in enumerate(doc_topic[i]) if value > 0.2]
        if topic_num != []:
            topic_num = topic_num
        else:
            topic_num = 'None'
        topic_num_list.append(topic_num)
    return topic_num_list

In [213]:
num_topic=topic_threshold(doc_topic, topic_vector)

In [214]:
# num_topic = [doc_topic[i].argmax() for i in range(len(X.toarray()))]  # extract maximum topic number
print(num_topic)

[[3, 6], [3, 5], [1, 6], [3], [3], [3], [6], [3, 5], [5], [4, 5], [3], [5, 7], [0], [4, 6], [3], [3, 5], [4, 5], [3], [5], [3], [3, 5], [0], [5, 6], [3], [2, 3], [3, 5], [4, 5, 6], [0], [0], [0, 7], [3], [5, 6], [3, 5], [3], [3], [3], [3, 6], [5, 6], [3], [0], [1, 5, 6], [1], [6], [3], [4, 6], [2], [3, 5], [3, 5], [3, 6], [0, 4], [3, 4], [3, 5], [5], [3], [7], [3], [4, 5], [1, 5], [7], [7], [0], [7], [3, 4], [0], [7], [7], [5], [3], [5, 6], [7], [4, 5, 7], [7], [3], [1, 6], [0, 5], [6, 7], [0, 4], [7], [3], [1], [4, 5], [0, 7], [5, 7], [0, 7], [7], [7], [4, 5], [5], [7], [6, 7], [1, 6], [4], [4], [3], [0], [0, 7], [7], [4, 5], [5, 7], [0], [7], [0, 4], [7], [3], [0, 7], [7], [0], [0, 7], [0], [4, 6], [7], [7], [0], [7], [7], [0, 7], [7], [7], [4], [7], [0, 7], [3, 7], [7], [0], [6], [7], [7], [7], [4, 7], [0, 3], [4, 7], [7], [7], [7], [0, 7], [3], [7], [0, 6], [7], [7], [7], [6], [7], [7], [4, 7], [7], [1], [7], [7], [3], [1, 6, 7], [7], [7], [7], [0, 4], [0, 5], [7], [7], [7], [7], [

In [215]:
df_doc_topic = pd.DataFrame({'topics': num_topic, 'reflection': df.Corrected_content})

In [216]:
df_doc_topic.loc[5000:5050]

Unnamed: 0,reflection,topics
5000,My goal is to increase my score in Science. I ...,[0]
5001,"For problem-solving, I need to work on solving...",[3]
5002,fill: a color you apply to the inside of an ob...,[5]
5003,The Strategies that worked the best was when I...,"[0, 7]"
5004,SOL Goal 1: By 9:30 I will have completed my E...,[1]
5005,I will try to bring my lea grade up because it...,[7]
5006,This week it was very easy to get started work...,[5]
5007,This week went by fast for me because today is...,"[4, 5]"
5008,The goal I am going to set for myself is to ge...,[4]
5009,Last week my academics weren't the best. Since...,"[0, 4]"


In [217]:
df_doc_topic.reflection[5048]

'This six weeks some learning strategies I used were re-watching notes to help me understand the concept and studying before a quiz to be more prepared and get a good grade. In class I used my time wisely to complete the tasks and i check my work before turning it in. Next week I am going to be more organized and responsible with my work and finish any task i dot complete in class.'

In [218]:
df_doc_topic.reflection[23132]

'i actually made a checklist that i go through each day to make sure i have everything before i leave for school'

In [219]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(model, X, token_vectorizer, mds='tsne')
panel

  if sparse and not np.issubdtype(doc_word.dtype, int):


In [140]:
pyLDAvis.save_html(panel, 'GuidedLDA_10topics')