<a href="https://colab.research.google.com/github/Rotem2411/alephBERTgimmelDalet/blob/main/Latent_Dirichlet_Allocation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#if necessary install below
!pip install pyLDAvis



In [2]:
import csv
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
import pyLDAvis
import pyLDAvis.gensim_models
from gensim.models.ldamodel import LdaModel
from gensim import corpora
from collections import defaultdict
from gensim import models
from pprint import pprint

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
file_path = 'sentiments.csv'
df = pd.read_csv(file_path)

  and should_run_async(code)


In [5]:
import warnings

# Filter out DeprecationWarning
warnings.filterwarnings("ignore", category=DeprecationWarning, module="ipykernel")

Take a peek of the data

In [None]:
num_samples = len(df)
print("Total number of samples: ", num_samples)
print(df.head(10))

Total number of samples:  75151
   id                                               text tag category class  \
0   1                    האריות של הצל חזק פה בתגובות...   ש  ECONOMY    b1   
1   2  זמרת תעשייה רק מפרסומות מפורסמת\n  אבל אין קהל...   ש  ECONOMY    b1   
2   4  את לא יורקת לבאר שממנה שתית. יפה יפה, אבל חסרת...   ש  ECONOMY    b1   
3   5  שמעון\n  משתתף בפרסומת למילקי במקום לעורר מודע...   ש  ECONOMY    b1   
4   6  הצחקתם אותי\n  מה כל כך אמייזינג בחברה עם אפס ...   ש  ECONOMY    b1   
5   7  איסו חברת הייטק מצליחה צריכה פרסומות שיבואו לע...   ש  ECONOMY    b1   
6   8           לא מכיר אותה ולא שמעתי אפילו שיר אחד שלה   ש  ECONOMY    b1   
7   9  וואללה לא יודע מה מתלהבים מנגה ארז. זה בכלל לא...   ש  ECONOMY    b1   
8  10  אהובה שגיא\n  נגה גדולה.זמרת מעולה.פרסומת מדלי...   ח  ECONOMY    b1   
9  11  הפרסומת של אמדוקס נראית מאוד קודרת ואפורה, כמו...   ש  ECONOMY    b1   

   total_tags  selected_tag  polarity  
0           2             2       1.0  
1           2     

# **Data Pre-processing**

We will perform the following steps:


*   **Tokenization**: Split the text into sentences and the sentences into words
*   Remove **punctuation**.
*   All **stopwords** are removed.
*   Words are **lemmatized** — words in third person are changed to first person and verbs in past and future tenses are changed into present.
*   Words are **hebrew** — words with english letters or numbers are removed.
*   Words are more than one letter

In [6]:
def contains_english_letter(word):
    return any(c.isascii() for c in word)
def cleanwords(text):
    wn = nltk.WordNetLemmatizer()
    stopword = nltk.corpus.stopwords.words('hebrew')
    tokens = nltk.word_tokenize(text)
    no_stopwords = [word for word in tokens if word not in stopword]
    no_alpha = [word for word in no_stopwords if word.isalpha()]
    no_ascii = [word for word in no_alpha if not contains_english_letter(word)]
    no_one_letter = [word for word in no_ascii if len(word) > 1]
    lemm_text = [wn.lemmatize(word) for word in no_one_letter]
    return lemm_text

In [7]:
text = df['text'].tolist()
clean_text = [cleanwords(t) for t in text]

Comparison of the data before and after the preprocessing

In [58]:
text_comparison = pd.DataFrame({
    'text_before_preprocessed': text,
    'text_after_preprocessed': clean_text,})
display(text_comparison.head(10))

Unnamed: 0,text_before_preprocessed,text_after_preprocessed
0,האריות של הצל חזק פה בתגובות...,"[האריות, הצל, חזק, בתגובות]"
1,זמרת תעשייה רק מפרסומות מפורסמת\n אבל אין קהל...,"[זמרת, תעשייה, קהל, אמיתי, שיבוא, להופעות]"
2,"את לא יורקת לבאר שממנה שתית. יפה יפה, אבל חסרת...","[לבאר, שממנה, יפה, יפה, חסרת, עמוד, שדרה]"
3,שמעון\n משתתף בפרסומת למילקי במקום לעורר מודע...,"[שמעון, משתתף, בפרסומת, במקום, לעורר, מודעות, ..."
4,הצחקתם אותי\n מה כל כך אמייזינג בחברה עם אפס ...,"[הצחקתם, בחברה, אפס, נאמנות, לעובדים]"


Filter out words that occur less than 10 documents, or more than 50% of the documents.

In [8]:
dictionary = corpora.Dictionary(clean_text)
dictionary.filter_extremes(no_below=10, no_above=0.5)
corpus = [dictionary.doc2bow(text) for text in clean_text]
print("Num. of words in the dictionary:", len(dictionary), "\nNum. of docs:", len(corpus))

Num. of words in the dictionary: 12429 
Num. of docs: 75151


# **TF-IDF**

In [9]:
tfidfmodel = models.TfidfModel(corpus, normalize=True)
tfidf_vector = tfidfmodel[corpus]

# **LDA**

In [10]:
ldamodel = LdaModel(corpus, id2word=dictionary, num_topics=4)
lda_vector = ldamodel[tfidf_vector]

Dominante topic

In [11]:
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, dictionary=ldamodel.id2word)
pyLDAvis.save_html(vis, 'LDA_Visualization.html')

Dominante topic for each document

In [32]:
topicnames = ["Topic" + str(i) for i in range(ldamodel.num_topics)]
docnames = ["Doc" + str(i) for i in range(len(clean_text))]
Data = []
for row in lda_vector:
    Data.extend(row)
Data = [row[1] for row in Data]
Data = np.array(Data).reshape(len(docnames), len(topicnames))
df_document_topic = pd.DataFrame(data=Data, columns=topicnames, index=docnames)
df_document_topic['dominante_topic'] = np.argmax(df_document_topic.values, axis=1) # Get dominante topic for each document
def make_bold(val):
  weight = 700 if val > .25 else 400
  return "font-weight: {weight}".format(weight=weight)
display(df_document_topic.head(15).style.applymap(make_bold))

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,dominante_topic
Doc0,0.08479,0.085502,0.233668,0.59604,3
Doc1,0.075691,0.773432,0.075515,0.075362,1
Doc2,0.464578,0.21108,0.25124,0.073102,0
Doc3,0.147423,0.616237,0.113744,0.122596,1
Doc4,0.763378,0.079146,0.079471,0.078004,0
Doc5,0.28271,0.520207,0.148079,0.049004,1
Doc6,0.076646,0.770651,0.076381,0.076322,1
Doc7,0.04797,0.128512,0.652435,0.171083,2
Doc8,0.083892,0.263682,0.565527,0.086899,2
Doc9,0.091969,0.465758,0.352863,0.08941,1


 # **Best LDA model**

Using GridSearch to determine the best LDA model

In [None]:
# Define Search Param
search_params = {'n_components': [3, 4, 5]}
# Init the Model
lda = LatentDirichletAllocation(max_iter=5,random_state=0)
# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)
# Do the Grid Search
model.fit(df_tfidfvect)

In [None]:
# Best Model
best_lda_model = model.best_estimator_
# Model Parameters
print("Best Model's Params: ", model.best_params_)
# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)
# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(df_tfidfvect))

Dominant topic

In [None]:
# column names
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
# index names
docnames = ["Doc" + str(i) for i in range(len(text_after_preprocessed))]
# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic
# Styling
def color_green(val):
  color = 'green' if val > .8 else 'black'
  return 'color: {col}'.format(col=color)
def make_bold(val):
  weight = 700 if val > .25 else 400
  return "font-weight: {weight}".format(weight=weight)
# Apply Style
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

SyntaxError: invalid character '“' (U+201C) (<ipython-input-15-038e417f1f42>, line 4)

In [None]:
# Topic-Keyword Matrix
df_topic_keywords = pd.DataFrame(best_lda_model.components_)
# Assign Column and Index
df_topic_keywords.columns = tfidf_tokens
df_topic_keywords.index = topicnames
# View
df_topic_keywords.head()

In [None]:
# Show top n keywords for each topic
def show_topics(vectorizer, lda_model, n_words):
    keywords = np.array(vectorizer.get_feature_names_out())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords
topic_keywords = show_topics(tfidfvectorizer, best_lda_model, n_words=50)
# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords).transpose()
df_topic_keywords.columns = ['Topic '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Word '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords