<a href="https://colab.research.google.com/github/Rotem2411/alephBERTgimmelDalet/blob/main/Latent_Dirichlet_Allocation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import warnings

# Filter out DeprecationWarning
warnings.filterwarnings("ignore", category=DeprecationWarning, module="ipykernel")

In [1]:
import csv
import pandas as pd
import numpy as np
import nltk
# Fetch the NLTK resources used by the preprocessing cells: the stop-word
# lists, the punkt tokenizer models and the WordNet data.
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab' for
# word_tokenize — confirm against the installed version.
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
from pprint import pprint

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Read the dataset into a DataFrame; the path is relative to the notebook's
# working directory.
file_path = 'sentiments.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

Take a peek at the data

In [5]:
# Report the row count, then print the first ten rows as plain text.
num_samples = df.shape[0]
print("Total number of samples: ", num_samples)
print(df.head(n=10))

Total number of samples:  75151
   id                                               text tag category class  \
0   1                    האריות של הצל חזק פה בתגובות...   ש  ECONOMY    b1   
1   2  זמרת תעשייה רק מפרסומות מפורסמת\n  אבל אין קהל...   ש  ECONOMY    b1   
2   4  את לא יורקת לבאר שממנה שתית. יפה יפה, אבל חסרת...   ש  ECONOMY    b1   
3   5  שמעון\n  משתתף בפרסומת למילקי במקום לעורר מודע...   ש  ECONOMY    b1   
4   6  הצחקתם אותי\n  מה כל כך אמייזינג בחברה עם אפס ...   ש  ECONOMY    b1   
5   7  איסו חברת הייטק מצליחה צריכה פרסומות שיבואו לע...   ש  ECONOMY    b1   
6   8           לא מכיר אותה ולא שמעתי אפילו שיר אחד שלה   ש  ECONOMY    b1   
7   9  וואללה לא יודע מה מתלהבים מנגה ארז. זה בכלל לא...   ש  ECONOMY    b1   
8  10  אהובה שגיא\n  נגה גדולה.זמרת מעולה.פרסומת מדלי...   ח  ECONOMY    b1   
9  11  הפרסומת של אמדוקס נראית מאוד קודרת ואפורה, כמו...   ש  ECONOMY    b1   

   total_tags  selected_tag  polarity  
0           2             2       1.0  
1           2     

# **Data Pre-processing**

We will perform the following steps:


*   **Tokenization**: Split the text into sentences and the sentences into words
*   Remove **punctuation**.
*   All **stopwords** are removed.
*   Words are **lemmatized** — words in third person are changed to first person and verbs in past and future tenses are changed into present.
*   Only **Hebrew** words are kept — words containing English letters or digits are removed.

In [3]:
def contains_english_letter(word):
    """Return True if `word` contains at least one English (ASCII) letter.

    Only ASCII alphabetic characters count. Digits, punctuation and
    whitespace are ASCII as well but are not letters, so on their own they
    do not make the result True. An empty string returns False.
    """
    return any(c.isascii() and c.isalpha() for c in word)

In [4]:
def cleanwords(text):
    """Tokenize `text` and return its cleaned tokens as a list.

    A token is kept when it is not a Hebrew stop word, is purely alphabetic
    and contains no English letters; every kept token is then passed through
    the WordNet lemmatizer.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (for example NaN read from an
        empty CSV field) yields an empty list.

    Returns
    -------
    list of str
        Cleaned tokens in their original order.
    """
    if not isinstance(text, str):
        return []
    wn = nltk.WordNetLemmatizer()
    # A set gives constant-time membership tests instead of scanning the whole
    # stop-word list once per token.
    stopword = set(nltk.corpus.stopwords.words('hebrew'))
    # NOTE(review): WordNet is an English resource, so lemmatizing Hebrew
    # tokens presumably returns them unchanged — confirm before relying on it.
    return [
        wn.lemmatize(word)
        for word in nltk.word_tokenize(text)
        if word not in stopword
        and word.isalpha()
        and not contains_english_letter(word)
    ]

In [5]:
# Run the cleaning pipeline on every document; results are keyed by the
# row's index label.
text_before_preprocessed = df['text']
text_after_preprocessed = {
    row_label: cleanwords(raw_text)
    for row_label, raw_text in text_before_preprocessed.items()
}

In [9]:
# Side-by-side view of the raw text and the token list produced from it.
text_comparison = pd.DataFrame(
    dict(
        text_before_preprocessed=text_before_preprocessed,
        text_after_preprocessed=text_after_preprocessed,
    )
)
display(text_comparison.head(n=5))

Unnamed: 0,text_before_preprocessed,text_after_preprocessed
0,האריות של הצל חזק פה בתגובות...,"[האריות, הצל, חזק, בתגובות]"
1,זמרת תעשייה רק מפרסומות מפורסמת\n אבל אין קהל...,"[זמרת, תעשייה, מפרסומות, מפורסמת, קהל, אמיתי, ..."
2,"את לא יורקת לבאר שממנה שתית. יפה יפה, אבל חסרת...","[יורקת, לבאר, שממנה, שתית, יפה, יפה, חסרת, עמו..."
3,שמעון\n משתתף בפרסומת למילקי במקום לעורר מודע...,"[שמעון, משתתף, בפרסומת, למילקי, במקום, לעורר, ..."
4,הצחקתם אותי\n מה כל כך אמייזינג בחברה עם אפס ...,"[הצחקתם, אמייזינג, בחברה, אפס, נאמנות, לעובדים]"


In [6]:
# Re-join every token list into one space-separated string, the input format
# used by the vectorizer below. Iterating over the dict's values (kept in
# insertion order) avoids positional `range(len(...))` lookups, which raise
# KeyError whenever the keys are not exactly 0..n-1.
text_after_list = [
    ' '.join(str(token) for token in tokens)
    for tokens in text_after_preprocessed.values()
]

# **TF-IDF**

In [7]:
# Row labels for the document-term frame: 'doc 0', 'doc 1', ...
idx = [f'doc {doc_number}' for doc_number in range(len(text_after_list))]

# Fit the TF-IDF vectorizer on the cleaned documents; this yields the sparse
# document-term matrix and the learned vocabulary.
tfidfvectorizer = TfidfVectorizer()
tfidf_wm = tfidfvectorizer.fit_transform(text_after_list)
tfidf_tokens = tfidfvectorizer.get_feature_names_out()

# Dense, labelled copy of the matrix. toarray() materializes every cell, so
# memory use grows with (number of documents) x (vocabulary size).
df_tfidfvect = pd.DataFrame(tfidf_wm.toarray(), index=idx, columns=tfidf_tokens)

# **LDA**

In [13]:
# Build LDA Model
lda_settings = {
    'n_components': 4,    # Number of topics
    'max_iter': 5,        # Max learning iterations
    'random_state': 100,  # Random state
}
lda_model = LatentDirichletAllocation(**lda_settings)
# Fit on the TF-IDF frame and keep the per-document topic weights.
lda_output = lda_model.fit_transform(df_tfidfvect)
print(lda_model)  # Model attributes

LatentDirichletAllocation(max_iter=5, n_components=4, random_state=100)


Diagnose model performance with perplexity and log-likelihood

In [None]:
# Log-likelihood: the higher the better
log_likelihood = lda_model.score(df_tfidfvect)
print("Log Likelihood: ", log_likelihood)
# Perplexity: the lower the better. Perplexity = exp(-1. * log-likelihood per word)
perplexity = lda_model.perplexity(df_tfidfvect)
print("Perplexity: ", perplexity)
# Full set of model parameters
pprint(lda_model.get_params())

Dominant topic

In [23]:
# Column labels, one per topic
topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]
# Row labels, one per document
docnames = ["Doc" + str(i) for i in range(len(text_after_preprocessed))]
# Document-topic weights, rounded to two decimals for display
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Dominant topic per document, taken from the unrounded weights: after
# rounding, near-equal weights can tie and argmax would then return the first
# tied column rather than the truly largest one.
dominant_topic = np.argmax(lda_output, axis=1)
df_document_topic['dominant_topic'] = dominant_topic
# Styling
def color_green(val):
  """Return a CSS colour rule: green when `val` exceeds 0.8, black otherwise."""
  if val > .8:
    shade = 'green'
  else:
    shade = 'black'
  return 'color: %s' % shade
def make_bold(val):
  """Return a CSS font-weight rule: 700 when `val` exceeds 0.25, else 400."""
  if val > .25:
    thickness = 700
  else:
    thickness = 400
  return "font-weight: %d" % thickness
# Apply Style
# Restrict the styling to the topic-weight columns: the thresholds are meant
# for weights, whereas the integer `dominant_topic` column would otherwise be
# rendered green and bold for every topic id >= 1.
df_document_topics = (
    df_document_topic.head(15)
    .style
    .applymap(color_green, subset=topicnames)
    .applymap(make_bold, subset=topicnames)
)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,dominant_topic
Doc0,0.09,0.09,0.09,0.74,3
Doc1,0.78,0.07,0.07,0.07,0
Doc2,0.07,0.07,0.79,0.07,2
Doc3,0.06,0.05,0.05,0.84,3
Doc4,0.08,0.08,0.08,0.76,3
Doc5,0.05,0.05,0.04,0.86,3
Doc6,0.08,0.77,0.08,0.08,1
Doc7,0.04,0.04,0.88,0.04,2
Doc8,0.07,0.77,0.07,0.08,1
Doc9,0.07,0.08,0.07,0.78,3


In [33]:
# Topic-Keyword Matrix: one row per topic, one column per vocabulary term
df_topic_keywords = pd.DataFrame(
    lda_model.components_,
    index=topicnames,
    columns=tfidf_tokens,
)
# View
df_topic_keywords.head()

Unnamed: 0,אאא,אאאא,אאאאוב,אאוהבםי,אאוט,אאוטלט,אאוטסורס,אאוטסורסינג,אאוטסיידר,אאוצ,...,תתרמי,תתרסק,תתרסקו,תתשדל,תתשפט,תתתנו,תתתת,ײסודה,בּטוח,בּמקצועם
Topic0,0.250096,0.250133,0.693101,0.425112,0.25022,0.250133,0.257378,0.250097,0.392831,0.250179,...,0.250118,0.87056,0.25017,0.25012,0.250104,0.252936,0.726342,0.429351,0.250103,0.250103
Topic1,0.250118,0.250164,0.250208,0.250111,0.573873,2.767795,0.250148,0.250103,0.250083,0.898556,...,0.250129,0.505283,0.842984,0.250147,0.250131,0.398933,0.250193,0.25011,0.250144,0.250144
Topic2,0.250116,0.761237,0.250208,0.250112,0.250315,0.250191,0.497816,0.543323,0.250095,0.250202,...,0.250144,1.167789,0.250185,0.681673,0.623669,0.250105,0.250216,0.250106,0.250116,0.250116
Topic3,1.244462,0.250183,0.25035,0.250128,0.603101,0.250168,0.25018,0.250124,0.250116,0.250256,...,0.692143,0.250441,0.250225,0.250157,0.250145,0.250151,0.25022,0.250117,0.655468,0.655468


Top 50 keywords for each topic

In [79]:
# Show top n keywords for each topic
def show_topics(vectorizer, lda_model, n_words):
    """Return the `n_words` highest-weighted vocabulary terms of every topic.

    `vectorizer` supplies the vocabulary through get_feature_names_out() and
    `lda_model` the topic-term weights through components_. The result is a
    list holding one array of terms per topic, ordered by descending weight.
    """
    vocabulary = np.array(vectorizer.get_feature_names_out())
    return [
        vocabulary.take(np.argsort(-weights)[:n_words])
        for weights in lda_model.components_
    ]
topic_keywords = show_topics(tfidfvectorizer, lda_model, n_words=50)
# Topic - Keywords Dataframe: one column per topic, one row per keyword rank
df_topic_keywords = pd.DataFrame(topic_keywords).T
n_ranks, n_topics = df_topic_keywords.shape
df_topic_keywords.columns = [f'Topic {k}' for k in range(n_topics)]
df_topic_keywords.index = [f'Word {k}' for k in range(n_ranks)]
df_topic_keywords

Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3
Word 0,צריך,כמה,צריך,כתבה
Word 1,שלא,כסף,ולא,טוב
Word 2,ביבי,עוד,כבר,לך
Word 3,כבר,ביבי,עוד,שלא
Word 4,ולא,ולא,ביבי,עוד
Word 5,לך,לך,שלא,צריך
Word 6,אחד,שלא,המדינה,כבר
Word 7,הכל,ישראל,ישראל,אחד
Word 8,הרבה,ממש,צודק,אנשים
Word 9,עוד,זו,הזה,ולא


In [51]:
# Topic-Keyword Matrix: one row per topic, one column per vocabulary term
df_Topic_keywords = pd.DataFrame(
    lda_model.components_,
    index=topicnames,
    columns=tfidf_tokens,
)
# View
df_Topic_keywords.head()

Unnamed: 0,אאא,אאאא,אאאאוב,אאוהבםי,אאוט,אאוטלט,אאוטסורס,אאוטסורסינג,אאוטסיידר,אאוצ,...,תתרמי,תתרסק,תתרסקו,תתשדל,תתשפט,תתתנו,תתתת,ײסודה,בּטוח,בּמקצועם
Topic0,0.250096,0.250133,0.693101,0.425112,0.25022,0.250133,0.257378,0.250097,0.392831,0.250179,...,0.250118,0.87056,0.25017,0.25012,0.250104,0.252936,0.726342,0.429351,0.250103,0.250103
Topic1,0.250118,0.250164,0.250208,0.250111,0.573873,2.767795,0.250148,0.250103,0.250083,0.898556,...,0.250129,0.505283,0.842984,0.250147,0.250131,0.398933,0.250193,0.25011,0.250144,0.250144
Topic2,0.250116,0.761237,0.250208,0.250112,0.250315,0.250191,0.497816,0.543323,0.250095,0.250202,...,0.250144,1.167789,0.250185,0.681673,0.623669,0.250105,0.250216,0.250106,0.250116,0.250116
Topic3,1.244462,0.250183,0.25035,0.250128,0.603101,0.250168,0.25018,0.250124,0.250116,0.250256,...,0.692143,0.250441,0.250225,0.250157,0.250145,0.250151,0.25022,0.250117,0.655468,0.655468


In [50]:
!pip install pyLDAvis

# Prepare visualization
vis = pyLDAvis.gensim_models.prepare(lda_model, tfidf_wm, tfidfvectorizer.vocabulary_)

# Save visualization
pyLDAvis.save_html(vis, 'new_res/LDA_Visualization.html')

  and should_run_async(code)


AttributeError: 'dict' object has no attribute 'token2id'

 # **Best LDA model**

Using GridSearch to determine the best LDA model

In [None]:
# Candidate topic counts to compare
search_params = {'n_components': [3, 4, 5]}
# Base estimator; n_components is filled in by the grid search
lda = LatentDirichletAllocation(random_state=0, max_iter=5)
# Grid search over the candidates, using GridSearchCV's default settings
model = GridSearchCV(estimator=lda, param_grid=search_params)
# Run the search on the TF-IDF frame
model.fit(df_tfidfvect)

In [None]:
# Estimator selected by the grid search
best_lda_model = model.best_estimator_
# Parameters of the selected model
best_params = model.best_params_
print("Best Model's Params: ", best_params)
# Best score found by the search
best_score = model.best_score_
print("Best Log Likelihood Score: ", best_score)
# Perplexity of the selected model on the TF-IDF frame
best_perplexity = best_lda_model.perplexity(df_tfidfvect)
print("Model Perplexity: ", best_perplexity)

Dominant topic

In [None]:
# Column labels, one per topic of the tuned model
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
# Row labels, one per document
docnames = ["Doc" + str(i) for i in range(len(text_after_preprocessed))]
# Document-topic weights must come from the tuned model itself. Reusing
# `lda_output` from the earlier 4-topic model would show stale results and
# fails with a shape mismatch whenever the best topic count is not 4.
best_lda_output = best_lda_model.transform(df_tfidfvect)
# Rounded to two decimals for display
df_document_topic = pd.DataFrame(np.round(best_lda_output, 2), columns=topicnames, index=docnames)
# Dominant topic per document, taken from the unrounded weights so rounding
# ties cannot change the winner
dominant_topic = np.argmax(best_lda_output, axis=1)
df_document_topic['dominant_topic'] = dominant_topic
# Styling
def color_green(val):
  """Return 'color: green' when `val` is above 0.8, otherwise 'color: black'.

  Redefines the identically named helper from the earlier dominant-topic cell.
  """
  if val > .8:
    return 'color: {col}'.format(col='green')
  return 'color: {col}'.format(col='black')
def make_bold(val):
  """Return 'font-weight: 700' when `val` is above 0.25, otherwise 'font-weight: 400'.

  Redefines the identically named helper from the earlier dominant-topic cell.
  """
  if val > .25:
    return "font-weight: {weight}".format(weight=700)
  return "font-weight: {weight}".format(weight=400)
# Apply Style
# Restrict the styling to the topic-weight columns: the thresholds are meant
# for weights, whereas the integer `dominant_topic` column would otherwise be
# rendered green and bold for every topic id >= 1.
df_document_topics = (
    df_document_topic.head(15)
    .style
    .applymap(color_green, subset=topicnames)
    .applymap(make_bold, subset=topicnames)
)
df_document_topics

SyntaxError: invalid character '“' (U+201C) (<ipython-input-15-038e417f1f42>, line 4)

In [None]:
# Topic-Keyword Matrix of the tuned model: one row per topic, one column per
# vocabulary term
df_topic_keywords = pd.DataFrame(
    best_lda_model.components_,
    index=topicnames,
    columns=tfidf_tokens,
)
# View
df_topic_keywords.head()

In [None]:
# Show top n keywords for each topic
def show_topics(vectorizer, lda_model, n_words):
    """Collect, per topic, the `n_words` terms with the largest weights.

    The vocabulary is read from vectorizer.get_feature_names_out() and the
    topic-term weights from lda_model.components_. Returns a list with one
    array of terms per topic, strongest term first.
    """
    terms = np.array(vectorizer.get_feature_names_out())
    per_topic_terms = []
    for weight_row in lda_model.components_:
        # Negating the weights makes the ascending argsort rank them descending.
        ranked_positions = np.argsort(-weight_row)
        per_topic_terms.append(terms[ranked_positions[:n_words]])
    return per_topic_terms
topic_keywords = show_topics(tfidfvectorizer, best_lda_model, n_words=50)
# Topic - Keywords Dataframe: one column per topic, one row per keyword rank
df_topic_keywords = pd.DataFrame(topic_keywords).T
n_ranks, n_topics = df_topic_keywords.shape
df_topic_keywords.columns = [f'Topic {k}' for k in range(n_topics)]
df_topic_keywords.index = [f'Word {k}' for k in range(n_ranks)]
df_topic_keywords