In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore
import pyLDAvis.gensim_models
import nltk

In [2]:
# pip install pandas nltk gensim pyLDAvis

In [4]:
df = pd.read_csv('TOEFL/TOEFL-dataset/toefl_train.csv')

In [5]:
df

Unnamed: 0,Filename,text,Language,Proficiency
0,88.txt,Some people might think that traveling in a gr...,KOR,high
1,278.txt,IThe importance and popularity of travelling i...,DEU,medium
2,348.txt,"It is an important decision, how to plan your ...",TUR,high
3,666.txt,Some people believe that young people can enjo...,ZHO,medium
4,733.txt,Travelling is usually considered as good recr...,TEL,medium
...,...,...,...,...
10995,1999007.txt,"Nowadays, more and more people go abroad,no ma...",ZHO,medium
10996,1999198.txt,\tIn accomplishing something that is risky com...,KOR,high
10997,1999363.txt,"At the beginning of the 21st century, the incr...",SPA,high
10998,1999931.txt,The number of cars in use across the world has...,HIN,high


In [6]:
nltk.download('stopwords')
nltk.download('wordnet')

stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

# Define a function for text cleaning
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

# Clean the essays
doc_clean = [clean(doc).split() for doc in df['text']]

# Create the term dictionary of our corpus, where every unique term is assigned an index
dictionary = corpora.Dictionary(doc_clean)

# Convert list of essays (doc_clean) into a Document Term Matrix using the dictionary prepared above
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

# Create the object for LDA model
lda = LdaMulticore

# Train the LDA model
lda_model = lda(corpus=doc_term_matrix, id2word=dictionary, num_topics=8, passes=400)

# Output the topics
topics = lda_model.print_topics(num_words=10)

for topic in topics:
    print(topic)

# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, doc_term_matrix, dictionary)
vis

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sony\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sony\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


(0, '0.046*"fact" + 0.043*"idea" + 0.036*"concept" + 0.035*"student" + 0.023*"understand" + 0.021*"learn" + 0.016*"important" + 0.012*"learning" + 0.011*"understanding" + 0.009*"know"')
(1, '0.024*"people" + 0.024*"new" + 0.022*"thing" + 0.018*"risk" + 0.014*"successful" + 0.013*"guide" + 0.013*"tour" + 0.012*"take" + 0.012*"travel" + 0.011*"try"')
(2, '0.043*"people" + 0.036*"community" + 0.032*"young" + 0.026*"time" + 0.017*"help" + 0.014*"helping" + 0.009*"give" + 0.008*"enough" + 0.008*"work" + 0.007*"think"')
(3, '0.066*"people" + 0.047*"life" + 0.034*"young" + 0.028*"enjoy" + 0.023*"older" + 0.014*"old" + 0.013*"time" + 0.009*"thing" + 0.009*"think" + 0.007*"age"')
(4, '0.053*"product" + 0.034*"advertisement" + 0.018*"make" + 0.014*"better" + 0.013*"people" + 0.013*"really" + 0.011*"company" + 0.010*"much" + 0.010*"buy" + 0.009*"good"')
(5, '0.001*"thet" + 0.001*"da" + 0.001*"land" + 0.001*"vinci" + 0.001*"ph" + 0.000*"theri" + 0.000*"godot" + 0.000*"salad" + 0.000*"premature" + 

  default_term_info = default_term_info.sort_values(


In [17]:
# Get the top words for each topic
topic_dict = {i: lda_model.show_topic(i, topn=10) for i in range(lda_model.num_topics)}

# Print the top words for each topic
for topic, words in topic_dict.items():
    print(f"Top words for topic {topic}: {words}")

Top words for topic 0: [('fact', 0.045686275), ('idea', 0.042858385), ('concept', 0.036317937), ('student', 0.0350185), ('understand', 0.022757052), ('learn', 0.021211311), ('important', 0.01609421), ('learning', 0.012413641), ('understanding', 0.011337963), ('know', 0.009003374)]
Top words for topic 1: [('people', 0.024395471), ('new', 0.023813521), ('thing', 0.02184827), ('risk', 0.018125823), ('successful', 0.0144557925), ('guide', 0.013362522), ('tour', 0.013167825), ('take', 0.011921553), ('travel', 0.011624806), ('try', 0.011359687)]
Top words for topic 2: [('people', 0.04273121), ('community', 0.036482386), ('young', 0.031629413), ('time', 0.026489662), ('help', 0.017091906), ('helping', 0.0137439165), ('give', 0.008900198), ('enough', 0.00809271), ('work', 0.0076554683), ('think', 0.0074864104)]
Top words for topic 3: [('people', 0.06596111), ('life', 0.046640538), ('young', 0.033938415), ('enjoy', 0.02781788), ('older', 0.023204675), ('old', 0.014406304), ('time', 0.012610988)

In [8]:
from joblib import dump, load

# Save the model
dump(lda_model, 'lda_model.joblib') 

# Load the model
#lda_model = load('lda_model.joblib')

['lda_model.joblib']

In [3]:
# Load the model
from joblib import dump, load
lda_model = load('lda_model.joblib')

In [7]:
def format_topics_sentences(ldamodel, corpus, texts):
    # Initialize output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the dominant topic, percentage contribution and keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Percentage_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=doc_term_matrix, texts=df['text'])

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,1.0,0.9576,"people, new, thing, risk, successful, guide, t...",Some people might think that traveling in a gr...
1,1,1.0,0.9936,"people, new, thing, risk, successful, guide, t...",IThe importance and popularity of travelling i...
2,2,7.0,0.5728,"subject, knowledge, one, many, academic, broad...","It is an important decision, how to plan your ..."
3,3,3.0,0.9599,"people, life, young, enjoy, older, old, time, ...",Some people believe that young people can enjo...
4,4,1.0,0.9945,"people, new, thing, risk, successful, guide, t...",Travelling is usually considered as good recr...
5,5,3.0,0.9902,"people, life, young, enjoy, older, old, time, ...",i agree that . \nLife is a person live period...
6,6,1.0,0.9939,"people, new, thing, risk, successful, guide, t...","In my opinion, travel in group with a tour gui..."
7,7,2.0,0.5644,"people, community, young, time, help, helping,...",I thing the statement ''Young people nowadays ...
8,8,3.0,0.946,"people, life, young, enjoy, older, old, time, ...",Whether or not young people enjoy life more th...
9,9,0.0,0.995,"fact, idea, concept, student, understand, lear...",In the era of science and technology...


In [15]:
# Prepare the unseen documents
unseen_docs = df['text'].head(10)
unseen_docs = [clean(doc).split() for doc in unseen_docs]  # Use your clean function

# Convert them into bag-of-words vectors
unseen_docs_bow = [dictionary.doc2bow(doc) for doc in unseen_docs]

# Now you can feed them to lda_model
unseen_topics = [lda_model[doc_bow] for doc_bow in unseen_docs_bow]

In [16]:
# Get the dominant topic for each document
unseen_dominant_topics = [max(doc_topics, key=lambda x: x[1])[0] for doc_topics in unseen_topics]

print(unseen_dominant_topics)

[1, 1, 7, 3, 1, 3, 1, 2, 3, 0]


In [18]:
unseen_topic_words = [topic_dict[topic_num] for topic_num in unseen_dominant_topics]

print(unseen_topic_words)

[[('people', 0.024395471), ('new', 0.023813521), ('thing', 0.02184827), ('risk', 0.018125823), ('successful', 0.0144557925), ('guide', 0.013362522), ('tour', 0.013167825), ('take', 0.011921553), ('travel', 0.011624806), ('try', 0.011359687)], [('people', 0.024395471), ('new', 0.023813521), ('thing', 0.02184827), ('risk', 0.018125823), ('successful', 0.0144557925), ('guide', 0.013362522), ('tour', 0.013167825), ('take', 0.011921553), ('travel', 0.011624806), ('try', 0.011359687)], [('subject', 0.045993645), ('knowledge', 0.03403741), ('one', 0.026305027), ('many', 0.019396784), ('academic', 0.014336031), ('broad', 0.013891191), ('specific', 0.01378671), ('better', 0.009964474), ('person', 0.009355513), ('people', 0.009077291)], [('people', 0.06596111), ('life', 0.046640538), ('young', 0.033938415), ('enjoy', 0.02781788), ('older', 0.023204675), ('old', 0.014406304), ('time', 0.012610988), ('thing', 0.0094068935), ('think', 0.00904187), ('age', 0.0070903082)], [('people', 0.024395471), (