<a href="https://colab.research.google.com/github/WaelMohsen/.NET-Backend-Developer-Roadmap/blob/master/POC_Classification_Per_Topic2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

---
# Setup and Imports

In [None]:
!pip install bertopic==0.16.0 datasets==2.16.1 Arabic-Stopwords==0.4.3

In [None]:
from datasets import load_dataset
import pandas as myPandas
import re
import random

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

import matplotlib.pyplot as myMatPlot
import seaborn as mySeaBorn

from datetime import datetime

---
# Data Cleansing Function

In [None]:
def clean_text(text: str):
    """Normalise a raw document into lower-cased, space-separated tokens.

    The text goes through these steps, in order:

    1. every ``http...`` run is replaced by the placeholder ``URL``;
    2. every run of digits is replaced by the placeholder ``Numeric``;
    3. each punctuation character is padded with a space on both sides;
    4. consecutive whitespace is collapsed into a single space;
    5. the text is tokenised with NLTK's ``word_tokenize`` and tokens of
       length 1 are dropped (this removes single-character punctuation);
    6. the surviving tokens are joined with spaces, lower-cased and stripped.

    Args:
        text: The raw document.

    Returns:
        The cleaned string. Any non-string input (for example a missing
        value) yields ``""`` instead of raising.
    """
    if not isinstance(text, str):
        return ""

    # (pattern, replacement) pairs, applied sequentially; order matters.
    substitutions = (
        (r"http\S+", "URL"),
        (r"\d+", "Numeric"),
        (r"([^\w\s])", r" \1 "),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    kept_tokens = filter(lambda token: len(token) > 1, word_tokenize(text))
    return " ".join(kept_tokens).lower().strip()



---


# Load the Datasets

In [None]:
# English historical quotes dataset, loaded through `datasets.load_dataset`.
englishDataSet = load_dataset("m-ric/english_historical_quotes")

In [None]:
# Feedback-Collection dataset (instruction / response / feedback / score records).
feedbackDataSet = load_dataset("kaist-ai/Feedback-Collection")

In [None]:
# Saudi news articles dataset; the Arabic corpus used for the news topic model.
newsDataSet = load_dataset("saudinewsnet")

In [None]:
# Show the first record of the 'train' split to inspect the available fields.
englishDataSet['train'][0]

In [None]:
# Show the first record of the 'train' split to inspect the available fields.
feedbackDataSet['train'][0]

In [None]:
# Show the first record of the 'train' split to inspect the available fields.
newsDataSet['train'][0]



---


# Build a list of record dictionaries for each dataset

In [None]:
# Project every raw training record onto the flat schema used by the rest of
# the notebook. Each map reads as {output_key: key_in_the_source_record}.
_newsFieldMap = {
    "text": "content",
    "source": "source",
    "date": "date_extracted",
}
_feedbackFieldMap = {
    "instruction": "orig_instruction",
    "feedback": "orig_feedback",
    "response": "orig_response",
    "score": "orig_score",
}
_englishFieldMap = {
    "text": "quote",
    "source": "author",
}

# Despite the `dic_` prefix, each result is a list holding one dict per record.
dic_newsDataSet = [
    {outputKey: record[sourceKey] for outputKey, sourceKey in _newsFieldMap.items()}
    for record in newsDataSet['train']
]

dic_feedbackDataSet = [
    {outputKey: record[sourceKey] for outputKey, sourceKey in _feedbackFieldMap.items()}
    for record in feedbackDataSet['train']
]

dic_englishDataSet = [
    {outputKey: record[sourceKey] for outputKey, sourceKey in _englishFieldMap.items()}
    for record in englishDataSet['train']
]



---


# Convert the Dictionaries to DataFrames


In [None]:
# Build one DataFrame per dataset and shuffle its rows in the same step.
# sample(frac=1) returns every row in random order; the fixed random_state
# keeps the shuffle reproducible across runs.
newsDataFrame = myPandas.DataFrame(dic_newsDataSet).sample(frac=1, random_state=611)

feedbackDataFrame = myPandas.DataFrame(dic_feedbackDataSet).sample(frac=1, random_state=611)

englishDataFrame = myPandas.DataFrame(dic_englishDataSet).sample(frac=1, random_state=611)

In [None]:
# Preview the first rows of the shuffled news frame.
newsDataFrame.head()

In [None]:
# Preview the first rows of the shuffled feedback frame.
feedbackDataFrame.head()

In [None]:
# Preview the first rows of the shuffled quotes frame.
englishDataFrame.head()



---


# Clean the text columns


In [None]:
# Normalise the 'text' column of the news and quotes frames with clean_text.
# The feedback frame's instruction / feedback / response columns are not
# cleaned in this notebook.
for textFrame in (newsDataFrame, englishDataFrame):
    textFrame['text'] = textFrame['text'].apply(clean_text)

---
# Add a length column for each text field



In [None]:
# For every listed text column, add a sibling '<column>_length' column holding
# len() of each value (character count for strings).
_lengthTargets = (
    (newsDataFrame, ('text',)),
    (feedbackDataFrame, ('instruction', 'feedback', 'response')),
    (englishDataFrame, ('text',)),
)
for lengthFrame, textColumns in _lengthTargets:
    for textColumn in textColumns:
        lengthFrame[textColumn + '_length'] = lengthFrame[textColumn].apply(len)

In [None]:
# Check the cleaned text and the new length column.
newsDataFrame.head()

In [None]:
# Check the cleaned text and the new length column (feedback preview disabled).
#feedbackDataFrame.head()
englishDataFrame.head()

---
# Visualize the text lengths with histograms




In [None]:
# Distribution of cleaned quote lengths, to help pick a sensible length cut-off.
# The explicit figure/axes pair lets the plot carry its own title and labels.
englishLengthFigure, englishLengthAxes = myMatPlot.subplots(figsize=(12, 6))
mySeaBorn.histplot(englishDataFrame['text_length'], bins=50, ax=englishLengthAxes)
englishLengthAxes.set_title("English quotes: cleaned text length")
englishLengthAxes.set_xlabel("Text length (characters)")
englishLengthAxes.set_ylabel("Number of quotes")
myMatPlot.show()

# The feedback length columns (instruction_length, feedback_length,
# response_length) can be plotted the same way if needed.

In [None]:
# Distribution of cleaned news-article lengths, to help pick a sensible length cut-off.
# The explicit figure/axes pair lets the plot carry its own title and labels.
newsLengthFigure, newsLengthAxes = myMatPlot.subplots(figsize=(12, 6))
mySeaBorn.histplot(newsDataFrame['text_length'], bins=50, ax=newsLengthAxes)
newsLengthAxes.set_title("News articles: cleaned text length")
newsLengthAxes.set_xlabel("Text length (characters)")
newsLengthAxes.set_ylabel("Number of articles")
myMatPlot.show()

---
# Further cleansing based on text length and duplicates




In [None]:
# Drop over-long rows and duplicate texts from each frame. The shape is printed
# before and after so the number of removed rows is visible.
#
# drop_duplicates is used in its non-inplace form: calling it with inplace=True
# on a frame produced by boolean indexing can raise SettingWithCopyWarning.

# Maximum lengths (as measured by the *_length columns) kept per dataset.
NEWS_MAX_TEXT_LENGTH = 6000
ENGLISH_MAX_TEXT_LENGTH = 270
FEEDBACK_MAX_INSTRUCTION_LENGTH = 800
FEEDBACK_MAX_FEEDBACK_LENGTH = 1200
FEEDBACK_MAX_RESPONSE_LENGTH = 2000

#--------------------------------------------------------------------------------------------------------------
print("newsDataFrame = ", newsDataFrame.shape)
newsDataFrame = (
    newsDataFrame[ newsDataFrame['text_length'] <= NEWS_MAX_TEXT_LENGTH ]
    .drop_duplicates(['text'])
)
print("newsDataFrame = ", newsDataFrame.shape)

#--------------------------------------------------------------------------------------------------------------
print("englishDataFrame = ", englishDataFrame.shape)
englishDataFrame = (
    englishDataFrame[ englishDataFrame['text_length'] <= ENGLISH_MAX_TEXT_LENGTH ]
    .drop_duplicates(['text'])
)
print("englishDataFrame = ", englishDataFrame.shape)

#--------------------------------------------------------------------------------------------------------------
print("feedbackDataFrame = ", feedbackDataFrame.shape)

# A row is kept only when all three text fields are within their limits.
feedbackWithinLimits = (
    (feedbackDataFrame['instruction_length'] <= FEEDBACK_MAX_INSTRUCTION_LENGTH)
    & (feedbackDataFrame['feedback_length'] <= FEEDBACK_MAX_FEEDBACK_LENGTH)
    & (feedbackDataFrame['response_length'] <= FEEDBACK_MAX_RESPONSE_LENGTH)
)

# De-duplicate one column at a time, in the same order as before: each pass
# works on the survivors of the previous pass.
feedbackDataFrame = (
    feedbackDataFrame[ feedbackWithinLimits ]
    .drop_duplicates(['instruction'])
    .drop_duplicates(['feedback'])
    .drop_duplicates(['response'])
)

print("feedbackDataFrame = ", feedbackDataFrame.shape)

---
# Convert the date column to its correct format, then keep only the date (no time)


In [None]:
# Parse the 'date' strings ("YYYY-MM-DD HH:MM:SS") into datetimes in one
# vectorized call, then truncate each value to midnight so that 'date_Stamp'
# carries the calendar date only.
newsDataFrame['date_Stamp'] = (
    myPandas.to_datetime(newsDataFrame['date'], format="%Y-%m-%d %H:%M:%S")
    .dt.normalize()
)
newsDataFrame.head()

---
⛹
# **Applying the BERTopic algorithm. The following steps are completely modular:**

1.  Embedding documents using Transformers (Sentence Transformers)
2.  Reducing embeddings dimensionality
3.  Clustering reduced embeddings into topics
4.  Tokenization of topics
5.  Set Weights for the tokens
6.  Represent topics with one or multiple representations

In [None]:
#=============================================================================================
# 1- Embedding documents using Transformers (Sentence Transformers)
from sentence_transformers import SentenceTransformer

# No explicit `device` is passed: SentenceTransformer then uses a GPU when one
# is available and falls back to the CPU otherwise, so this cell also runs on
# a runtime without CUDA (a hard-coded "cuda:0" fails there).

# Arabic news: multilingual encoder (alternative: "sentence-transformers/LaBSE").
newsSentenceTransformer_ModelId = "sentence-transformers/distiluse-base-multilingual-cased-v2"
newsEmbeddingModel = SentenceTransformer(newsSentenceTransformer_ModelId)
newsEmbeddings = newsEmbeddingModel.encode(newsDataFrame['text'].values, show_progress_bar= True)

# English quotes: English sentence encoder.
englishSentenceTransformer_ModelId = "sentence-transformers/all-MiniLM-L12-v2"
englishEmbeddingModel = SentenceTransformer(englishSentenceTransformer_ModelId)
englishEmbeddings = englishEmbeddingModel.encode(englishDataFrame['text'].values, show_progress_bar= True)

# The feedback dataset's text columns are not embedded in this notebook.

# One row per document; the second dimension is the embedding size of each model.
print(newsEmbeddings.shape)
print(englishEmbeddings.shape)

In [None]:
#=============================================================================================
#2- Reducing embeddings dimensionality
from umap import UMAP

# Both corpora use the same UMAP configuration, but each gets its own instance
# because a reducer is fitted to one corpus. The fixed random_state keeps the
# projection reproducible.
_umapSettings = dict(
    n_neighbors = 15,
    n_components = 15,
    min_dist = 0.0,
    metric = 'cosine',
    random_state = 611,
)

newsDimensionalityReductionModel = UMAP(**_umapSettings)

englishDimensionalityReductionModel = UMAP(**_umapSettings)

In [None]:
#=============================================================================================
# 3- Clustering reduced embeddings into topics
from hdbscan import HDBSCAN

# min_cluster_size controls granularity: a higher value yields fewer topics,
# a lower value yields more. prediction_data=True keeps the data HDBSCAN needs
# to assign new points to existing clusters later on.
_hdbscanSettings = dict(
    min_cluster_size = 50,
    metric = 'euclidean',
    cluster_selection_method = 'eom',
    prediction_data = True,
)

newsClustringModel = HDBSCAN(**_hdbscanSettings)

englishClustringModel = HDBSCAN(**_hdbscanSettings)

In [None]:
#=============================================================================================
# 4- Tokenization of topics (Vectorizer)
from sklearn.feature_extraction.text import CountVectorizer
import arabicstopwords.arabicstopwords as MyArabicStopWords

nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Language-specific stop-word lists that are excluded from the topic vocabulary.
arabicWordsIgnoreList = MyArabicStopWords.stopwords_list()
englishWordsIgnoreList = list(stopwords.words('english'))

# Settings shared by both vectorizers: word-level unigrams to trigrams, with
# the min_df / max_df bounds below pruning very rare and very common terms.
_vectorizerSettings = dict(
    min_df = 3,
    analyzer = 'word',
    max_df = 0.5,
    ngram_range = (1, 3),
)

newsVectorizerModel = CountVectorizer(stop_words = arabicWordsIgnoreList, **_vectorizerSettings)

englishVectorizerModel = CountVectorizer(stop_words = englishWordsIgnoreList, **_vectorizerSettings)

In [None]:
#============================================================================================
#Topic Representer
from bertopic.representation import KeyBERTInspired

# Each topic model gets its own KeyBERTInspired instance, registered under the
# key "KeyBERT" in the representation dictionary passed to BERTopic.
newsKeyPerToicModel = KeyBERTInspired()
newsTopicRepresenter_model = dict(KeyBERT = newsKeyPerToicModel)

englishKeyPerToicModel = KeyBERTInspired()
englishTopicRepresenter_model = dict(KeyBERT = englishKeyPerToicModel)



---
# Now, let's run the model to wire all together


In [None]:
#============================================================================================
#Let's Go
from bertopic import BERTopic

# Assemble the news topic model from the components configured in the cells above.
newsTopicsModel = BERTopic(
    embedding_model = newsEmbeddingModel,
    umap_model = newsDimensionalityReductionModel,
    hdbscan_model = newsClustringModel,
    vectorizer_model = newsVectorizerModel,
    representation_model = newsTopicRepresenter_model,
    top_n_words = 10,
    verbose = True,
)

# The embeddings were computed earlier, so they are passed in alongside the
# documents they belong to.
newsDocuments = newsDataFrame['text'].values
newsTopics, newsTopicsPropabilities = newsTopicsModel.fit_transform(newsDocuments, newsEmbeddings)

# Store the assigned topic and its probability next to each article.
newsDataFrame['topic'] = newsTopics
newsDataFrame['Probability'] = newsTopicsPropabilities

In [None]:
from bertopic import BERTopic

# Assemble the English quotes topic model from the components configured in
# the cells above.
englishTopicsModel = BERTopic(
    embedding_model = englishEmbeddingModel,
    umap_model = englishDimensionalityReductionModel,
    hdbscan_model = englishClustringModel,
    vectorizer_model = englishVectorizerModel,
    representation_model = englishTopicRepresenter_model,
    top_n_words = 10,
    verbose = True,
)

# The embeddings were computed earlier, so they are passed in alongside the
# documents they belong to.
englishDocuments = englishDataFrame['text'].values
englishTopics, englishTopicsPropabilities = englishTopicsModel.fit_transform(englishDocuments, englishEmbeddings)

# Store the assigned topic and its probability next to each quote.
englishDataFrame['topic'] = englishTopics
englishDataFrame['Probability'] = englishTopicsPropabilities

In [None]:
# Summary table of the topics found in the news corpus.
newsTopicsModel.get_topic_info()

In [None]:
# Summary table of the topics found in the quotes corpus.
englishTopicsModel.get_topic_info()

In [None]:
# Top terms and their weights for topic 7 (a hand-picked example id).
englishTopicsModel.get_topic(7)

In [None]:
# Sample of quotes assigned to topic 7, to sanity-check the topic's terms.
englishDataFrame[ englishDataFrame['topic'] == 7 ].head(10)

In [None]:
# Top terms and their weights for topic 3 (a hand-picked example id).
newsTopicsModel.get_topic(3)

In [None]:
# Sample of articles assigned to topic 3, to sanity-check the topic's terms.
newsDataFrame[ newsDataFrame['topic'] == 3 ].head(10)

In [None]:
# Plot the fitted news topics with BERTopic's `visualize_topics`.
newsTopicsModel.visualize_topics()

In [None]:
# Plot the fitted quotes topics with BERTopic's `visualize_topics`.
englishTopicsModel.visualize_topics()

In [None]:
# Heatmap view of the quotes topics; useful for spotting near-duplicate topics.
englishTopicsModel.visualize_heatmap()

In [None]:
# Heatmap view of the news topics; useful for spotting near-duplicate topics.
newsTopicsModel.visualize_heatmap()

In [None]:
# Break the topics down by news source, then plot the 15 top topics with
# normalized frequencies.
newsSources = newsDataFrame['source'].values
newsTopicsPerClass = newsTopicsModel.topics_per_class(newsDataFrame['text'].values, classes=newsSources)

newsTopicsModel.visualize_topics_per_class(
    newsTopicsPerClass,
    top_n_topics = 15,
    normalize_frequency = True,
)

In [None]:
# Track the topics over time, grouping the article timestamps into 10 bins.
newsTimestamps = newsDataFrame['date'].values
newsTopicsOverTime = newsTopicsModel.topics_over_time(
    newsDataFrame['text'].values, newsTimestamps, nr_bins = 10)

# Plot topics 1 through 13 only.
newsTopicsModel.visualize_topics_over_time(newsTopicsOverTime, topics = list(range(1, 14)))

In [None]:
# Compute the hierarchy of the news topics and plot it; the plot helps decide
# which topics are close enough to merge.
newsTopicsHierarchy = newsTopicsModel.hierarchical_topics(newsDataFrame['text'].values)

newsTopicsModel.visualize_hierarchy(hierarchical_topics = newsTopicsHierarchy)

In [None]:
# Break the topics down by quote author (stored in 'source'), then plot the
# 15 top topics with normalized frequencies.
englishSources = englishDataFrame['source'].values
englishTopicsPerClass = englishTopicsModel.topics_per_class(englishDataFrame['text'].values, classes = englishSources)

englishTopicsModel.visualize_topics_per_class(
    englishTopicsPerClass,
    top_n_topics = 15,
    normalize_frequency = True,
)

In [None]:
# Compute the hierarchy of the quotes topics and plot it; the plot helps
# decide which topics are close enough to merge.
englishTopicsHierarchy = englishTopicsModel.hierarchical_topics(englishDataFrame['text'].values)

englishTopicsModel.visualize_hierarchy(hierarchical_topics = englishTopicsHierarchy)

---
# Review the results and visualizations, then refine the model by merging topics

In [None]:
# Groups of topic ids to fuse into a single topic each, hand-picked after
# reviewing the visualizations above.
# NOTE(review): these ids belong to one particular fitted model. Re-check them
# after re-fitting, and do not run this cell twice on the same model, because
# topic ids can change once a merge has been applied.
englishTopicsToMerge = [
    [25, 24],
    [43, 3, 12, 44]
]

englishTopicsModel.merge_topics(
    englishDataFrame['text'].values,
    englishTopicsToMerge
)

# merge_topics updates the model's per-document assignments (`topics_`), so
# refresh the DataFrame column; otherwise it keeps the pre-merge topic ids.
englishDataFrame['topic'] = englishTopicsModel.topics_

In [None]:
# Recompute and re-plot the hierarchy to review the effect of the merge.
englishTopicsHierarchy = englishTopicsModel.hierarchical_topics(englishDataFrame['text'].values)

englishTopicsModel.visualize_hierarchy(hierarchical_topics = englishTopicsHierarchy)


---
# Review the results and visualizations, then refine the data by filtering topics



In [None]:
# Keep only quotes that were assigned to a real topic; -1 is the outlier topic.
# NOTE(review): after this the frame has fewer rows than the model was fitted
# on, so earlier cells that pass its text to the model should not be re-run.
englishAssignedMask = englishDataFrame['topic'] != -1
englishDataFrame = englishDataFrame[englishAssignedMask]
englishDataFrame.head()

In [None]:
# Keep only articles that were assigned to a real topic; -1 is the outlier topic.
# NOTE(review): after this the frame has fewer rows than the model was fitted
# on, so earlier cells that pass its text to the model should not be re-run.
newsAssignedMask = newsDataFrame['topic'] != -1
newsDataFrame = newsDataFrame[newsAssignedMask]
newsDataFrame.head()

---
# Inference: if the model is good enough for production, use it to classify a stream of live data



In [None]:
live_newsStory = """اتهمت منظمة "هيومن رايتس ووتش" اسرائيل، اليوم (الاثنين)، باستخدام "القوة غير المبررة" لاعتقال اطفال فلسطينيين تصل اعمار بعضهم الى 11 عاما، بالاضافة الى استخدام التهديد لاجبارهم بالتوقيع على اعترافات.
وقالت المنظمة ان السلطات الاسرائيلية فشلت في اخطار اهالي الاطفال عن اعتقالهم او اماكن احتجازهم، مستندة الى شهادات عدة اطفال احتجزوا العام الماضي في القدس الشرقية والضفة الغربية المحتلتين، في وقت ساد فيه توتر كبير.
وحثت سارة ليا ويتسن مديرة قسم الشرق الأوسط وشمال أفريقيا الولايات المتحدة على الضغط على حليفتها اسرائيل لانهاء ما وصفته بـ"الممارسات المسيئة".
ويأتي التقرير في وقت يزور فيه وزير الدفاع الاميركي اشتون كارتر اسرائيل.
وبحسب التقرير، فان "قوات الامن الاسرائيلية استخدمت القوة غير المبررة لاعتقال أطفال فلسطينيين". وتحدث عن تفاصيل تتعلق "بعمليات اعتقال مسيئة" لستة اطفال.
واشار التقرير الى ان قوات الامن الاسرائيلية "قامت بخنق الاطفال والقاء القنابل الصاعقة عليهم وضربهم اثناء الاحتجاز وتهديدهم واستجوابهم في غياب آبائهم او محاميهم، كما اخفقت في اخطار آبائهم بمكانهم"."""

# Predict the topic of the unseen article; the input is a one-element list, so
# each returned sequence holds exactly one entry.
live_newsStoryTopic, live_newsStoryProbability =  newsTopicsModel.transform([live_newsStory])

In [None]:
# Show the predicted topic id and its probability.
live_newsStoryTopic, live_newsStoryProbability

([11], array([0.86207827]))

In [None]:
# Summary row of the predicted topic. The id comes from the prediction itself
# rather than a hard-coded number, so the cell stays correct after a re-fit.
newsTopicsModel.get_topic_info(live_newsStoryTopic[0])

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Representative_Docs
0,11,313,11_الاحتلال_الفلسطينية_الفلسطيني_الإسرائيلية,"[الاحتلال, الفلسطينية, الفلسطيني, الإسرائيلية,...","[الطفل الفلسطيني, رضيع فلسطيني, الفلسطينيين, ا...",[قال مسؤولو أمن إسرائيليون إن من يشتبه بأنهم م...


In [None]:
# Top terms and their weights for the predicted topic.
newsTopicsModel.get_topic(live_newsStoryTopic[0])

[('الاحتلال', 0.03317253631508505),
 ('الفلسطينية', 0.031008158833448807),
 ('الفلسطيني', 0.030844612447997372),
 ('الإسرائيلية', 0.02829672489649867),
 ('الإسرائيلي', 0.026260266970824578),
 ('إسرائيل', 0.025194636375340175),
 ('الفلسطينيين', 0.02272875953910997),
 ('المستوطنين', 0.022203965228467227),
 ('القدس', 0.02177069814949513),
 ('الضفة', 0.02086482368739133)]

In [None]:
live_englishStory = "Love is the emblem of eternity; it confounds all notion of time; effaces all memory of a beginning, all fear of an end."

# Predict the topic of the unseen quote. The document is wrapped in a list,
# matching the news inference cell, so each returned sequence holds exactly
# one entry.
live_englishStoryTopic, live_englishStoryProbability = englishTopicsModel.transform([live_englishStory])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-02-05 05:25:15,984 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-02-05 05:25:20,167 - BERTopic - Dimensionality - Completed ✓
2024-02-05 05:25:20,169 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-02-05 05:25:20,173 - BERTopic - Cluster - Completed ✓


In [None]:
# Show the predicted topic id and its probability.
live_englishStoryTopic, live_englishStoryProbability

([13], array([0.56068123]))

In [None]:
# Summary row of the predicted topic. The id comes from the prediction itself
# rather than a hard-coded number, so the cell stays correct after a re-fit.
englishTopicsModel.get_topic_info(live_englishStoryTopic[0])

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Representative_Docs
0,13,392,13_love love_loved_romantic_love never,"[love love, loved, romantic, love never, lovin...","[love love, love loving, love loved, love some...",[love is not love that alters when it alterati...


In [None]:
# Top terms and their weights for the predicted topic.
englishTopicsModel.get_topic(live_englishStoryTopic[0])

[('love love', 0.060299994557520095),
 ('loved', 0.04096729819270148),
 ('romantic', 0.026763697914217604),
 ('love never', 0.026572415975426335),
 ('loving', 0.025080890235693577),
 ('romance', 0.023203469497706967),
 ('true love', 0.018823841867006006),
 ('love always', 0.018518690711762435),
 ('love one', 0.017298690947559946),
 ('love life', 0.015873163467224945)]

---

## Save and Reload

---



In [None]:
# Persist the fitted news topic model in safetensors format (including the
# c-TF-IDF data), recording the id of the sentence-transformers model to use
# as its embedding model, then load it back to confirm the round trip works.
myModelId = "sentence-transformers/distiluse-base-multilingual-cased-v2"
newsModelPath = "/content/POC/newsTopicsModel"

newsTopicsModel.save(
    newsModelPath,
    serialization = "safetensors",
    save_ctfidf = True,
    save_embedding_model = myModelId,
)

newsTopicsLoadedModel = BERTopic.load(newsModelPath)

In [None]:
# Persist the fitted English topic model in safetensors format (including the
# c-TF-IDF data), then load it back to confirm the round trip works.
#
# The recorded embedding model must be the one the topic model was fitted
# with. The English model uses `englishSentenceTransformer_ModelId`, not the
# multilingual model used for the news corpus; recording the wrong id would
# make the reloaded model embed new text with a different encoder.
myModelId = englishSentenceTransformer_ModelId
englishTopicsModel.save("/content/POC/englishTopicsModel", serialization="safetensors",
                 save_ctfidf=True, save_embedding_model = myModelId)

englishTopicsLoadedModel = BERTopic.load("/content/POC/englishTopicsModel")