In [1]:
import pandas as pd
import joblib
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation


In [39]:
# Reading the data
# Load the spreadsheet into a DataFrame, then preview the first ten rows and
# print the column summary as a quick sanity check of what was read.
df = pd.read_excel('test1.xlsx')
print(df.head(10))
df.info()

                                                Text  Likes  Retweets
0       The weather forecast predicts rain tomorrow.     84        57
1   Weather patterns can be unpredictable in spring.     19        92
2  The architecture of ancient Rome still influen...     63         8
3  Modern architecture often emphasizes sleek lin...     41        69
4  Coding skills are essential in today's digital...     95        36
5  Many students are learning coding languages to...     68        80
6  The weather in coastal regions can be quite wi...     27        43
7  The architecture of Gothic cathedrals is awe-i...     51        22
8  Learning about coding can open up a world of o...     11        76
9  Weather conditions are ideal for a picnic in t...      3        95
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Text      100 non-null    object
 1   Likes   

In [40]:
# Preprocessing the text
def preprocess_text(text):
    """Normalise one raw document into a space-separated string of stems.

    Steps, in order: lower-case, strip HTML markup, drop URLs, delete every
    character that is not an ASCII letter or whitespace, remove NLTK English
    stop words, then stem the remaining words with the Snowball stemmer.

    Args:
        text: The raw document. Non-string values (for example the None/NaN
            that pandas uses for empty cells) are treated as empty documents.

    Returns:
        The stemmed tokens joined by single spaces; "" when nothing is left.
    """
    if not isinstance(text, str):
        # Missing cells would otherwise fail on .lower(); treat them as empty.
        return ""

    text = text.lower()
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'http\S+', '', text)
    # Characters are deleted rather than replaced by a space, so hyphenated
    # words are fused ("awe-inspiring" -> "aweinspiring") before stemming.
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # A set gives constant-time membership tests; the list returned by NLTK
    # would be scanned linearly for every word.
    stop_words = set(stopwords.words('english'))
    stemmer = SnowballStemmer("english")

    # Filter and stem in a single pass over the tokens.
    return ' '.join(
        stemmer.stem(word) for word in text.split() if word not in stop_words
    )

In [41]:
# Applying the preprocessing function to the text column
# The result goes into a new 'cleaned_text' column (the 'Text' column is not
# modified); the last line previews the first ten cleaned documents.
df['cleaned_text'] = df['Text'].apply(preprocess_text)
df['cleaned_text'].head(10)

0               weather forecast predict rain tomorrow
1                     weather pattern unpredict spring
2    architectur ancient rome still influenc modern...
3    modern architectur often emphas sleek line min...
4               code skill essenti today digit economi
5    mani student learn code languag prepar futur c...
6                    weather coastal region quit windi
7                 architectur gothic cathedr aweinspir
8                       learn code open world opportun
9                     weather condit ideal picnic park
Name: cleaned_text, dtype: object

In [42]:
# Build the document-term matrix of token counts from the cleaned text.
# NOTE(review): despite its name, `tfidf_matrix` holds CountVectorizer counts,
# not TF-IDF weights; the name is kept because later cells reference it.
vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=1000,
    stop_words="english",
)
tfidf_matrix = vectorizer.fit_transform(df['cleaned_text'])
tf_feature_names = vectorizer.get_feature_names_out()


In [43]:
# Fit LDA on the document-term matrix
no_topics = 6  # number of topics to extract
no_top_words = 5  # number of top words to display for each topic
lda = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50,
    random_state=0,
)
# fit() returns the estimator itself, so `lda` ends up bound to the fitted model.
lda = lda.fit(tfidf_matrix)


In [44]:
# Printing the topics
def display_topics(model, feature_names, no_top_words):
    """Summarise each topic of a model by its highest-weighted words.

    Args:
        model: Object exposing ``components_``; iterating it yields one array
            of word weights per topic.
        feature_names: Vocabulary, indexed by the positions of those weights.
        no_top_words: How many words to keep per topic.

    Returns:
        dict mapping "Topic <index>" to the top words joined by single
        spaces, heaviest word first.
    """
    def top_words(weights):
        # argsort is ascending, so reverse it before taking the head.
        ranked = weights.argsort()[::-1][:no_top_words]
        return " ".join(feature_names[idx] for idx in ranked)

    return {
        f"Topic {number}": top_words(weights)
        for number, weights in enumerate(model.components_)
    }

In [45]:
# Show the top `no_top_words` words of every topic in the fitted model.
display_topics(lda, tf_feature_names, no_top_words)

{'Topic 0': 'architectur gothic known character cathedr',
 'Topic 1': 'code learn opportun boot camp',
 'Topic 2': 'architectur pattern weather ancient influenc',
 'Topic 3': 'weather condit agricultur like code',
 'Topic 4': 'code skill problemsolv valuabl requir',
 'Topic 5': 'weather like forecast boot camp'}

In [59]:
# Persist an LDA estimator configured for 5 topics.
# NOTE(review): the estimator is dumped without being fitted in this cell --
# confirm that whatever loads it fits it before use.
model_folder = '../src/mlservice/service/utils'
no_topics = 5
lda = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50,
    random_state=0,
)
joblib.dump(lda, f'{model_folder}/lda_trend_model.joblib')

['../src/mlservice/service/utils/lda_trend_model.joblib']

In [47]:
#Hashtag creator
def get_hashtag(model, feature_names, no_top_words):
    """Build one hashtag per topic from the topic's highest-weighted words.

    Args:
        model: Object exposing ``components_``; iterating it yields one array
            of word weights per topic.
        feature_names: Vocabulary, indexed by the positions of those weights.
        no_top_words: How many words are fused into each hashtag.

    Returns:
        The hashtags separated by single spaces, each of the form
        "#WordWordWord" with the heaviest word first and every word
        title-cased. A single-topic model yields exactly one tag; a model
        with no topics yields "".
    """
    hashtags = []
    for topic in model.components_:
        # argsort is ascending; the reversed slice takes the heaviest words first.
        top_words_indices = topic.argsort()[:-no_top_words - 1:-1]
        top_words = [feature_names[i].title() for i in top_words_indices]
        hashtags.append(f"#{''.join(top_words)}")

    # Tags are space-separated so that a multi-topic model produces
    # "#Foo #Bar" instead of one fused token.
    return " ".join(hashtags)

#Hash tag generator
def hash_tag_text_genetator(text):
    """Generate a three-word hashtag for a single piece of text.

    The text is cleaned with ``preprocess_text``, vectorised into token
    counts, and a one-topic LDA model is fitted on that single document; the
    three highest-weighted words of the topic form the hashtag.

    Args:
        text: Raw input text.

    Returns:
        A hashtag string such as "#WordWordWord", or "" when nothing is left
        of the text after preprocessing.
    """
    text = preprocess_text(text)
    if not text.strip():
        # CountVectorizer raises ValueError (empty vocabulary) on an empty
        # document, so there is no hashtag to build.
        return ""

    vectorizer = CountVectorizer(max_features=1000, stop_words="english")
    matrix = vectorizer.fit_transform([text])
    tf_feature_names = vectorizer.get_feature_names_out()

    lda = LatentDirichletAllocation(
        n_components=1,
        max_iter=5,
        learning_method='online',
        learning_offset=50,
        random_state=0,
    )
    lda.fit(matrix)
    return get_hashtag(lda, tf_feature_names, 3)

In [48]:
#Test the hash tag generator
# Five sample paragraphs on different subjects; each one is passed to the
# generator on its own and the resulting hashtag is printed.
text = ["This document explores various techniques for analyzing textual data, including sentiment analysis, topic modeling, and named entity recognition. It discusses the applications of these techniques in different fields like social media analysis, customer feedback processing, and information retrieval",
"This article investigates the use of machine learning algorithms in predicting stock market trends. It explores different algorithms like linear regression, support vector machines, and deep learning models for stock price forecasting. The article also discusses the challenges and limitations of using machine learning for financial prediction.",
"This research paper examines the potential effects of social media usage on mental health. It analyzes the correlation between social media engagement and symptoms of anxiety, depression, and loneliness. The paper suggests strategies for promoting responsible social media use and maintaining mental well-being.",
"This report discusses the booming e-commerce industry and its impact on traditional brick-and-mortar retail stores. It analyzes the factors contributing to the growth of online shopping, such as convenience, competitive pricing, and wider product selection. The report also explores how traditional retailers can adapt to this changing landscape.",
"This article delves into the potential applications of artificial intelligence (AI) in the healthcare sector. It discusses how AI can be used for tasks like medical diagnosis, drug discovery, and personalized medicine. The article also highlights the ethical considerations related to AI implementation in healthcare."]

# One hashtag per sample, printed in input order.
for t in text:
  print(hash_tag_text_genetator(t))

#AnalysiTechniquLike
#MachinLearnAlgorithm
#MediaSocialMental
#RetailTraditIndustri
#AiArticlHealthcar


In [50]:
# Saving the model into joblib file
# The single-topic estimator is saved unfitted; the joblib-backed hashtag
# generator loads it and fits it on each input text.
# NOTE(review): the output recorded for this cell points at .../ml_models/,
# while `model_folder` is set to .../utils in an earlier cell -- re-run to
# confirm where the file is actually written.
no_topics = 1
lda = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50,
    random_state=0,
)
joblib.dump(lda, f'{model_folder}/lda_model.joblib')

['../src/mlservice/service/ml_models/lda_model.joblib']

In [60]:
#Hash tag generator with joblib files
def hash_tag_text_genetator(text):
    """Generate a three-word hashtag using the persisted vectorizer and LDA.

    Same pipeline as the in-memory generator, except that the vectorizer and
    the LDA estimator are loaded from joblib files under ``model_folder`` on
    every call and then fitted on the single input document. Defining this
    function replaces any earlier ``hash_tag_text_genetator`` in the session.

    NOTE(review): ``count_vectorizer.joblib`` is not written by any cell
    visible in this notebook -- confirm it exists in ``model_folder``.

    Args:
        text: Raw input text.

    Returns:
        A hashtag string such as "#WordWordWord", or "" when nothing is left
        of the text after preprocessing.
    """
    text = preprocess_text(text)
    if not text.strip():
        # Fitting a count vectorizer on an empty document fails with an
        # empty-vocabulary ValueError; skip the disk loads and return no tag.
        return ""

    vectorizer = joblib.load(f'{model_folder}/count_vectorizer.joblib')
    matrix = vectorizer.fit_transform([text])
    tf_feature_names = vectorizer.get_feature_names_out()

    lda = joblib.load(f'{model_folder}/lda_model.joblib')
    lda.fit(matrix)
    return get_hashtag(lda, tf_feature_names, 3)


In [61]:
#Test the hash tag generator
# Same five sample paragraphs as the earlier test cell. The call below resolves
# to whichever hash_tag_text_genetator was defined most recently (the
# joblib-backed one when the cells are run top to bottom).
text = ["This document explores various techniques for analyzing textual data, including sentiment analysis, topic modeling, and named entity recognition. It discusses the applications of these techniques in different fields like social media analysis, customer feedback processing, and information retrieval",
"This article investigates the use of machine learning algorithms in predicting stock market trends. It explores different algorithms like linear regression, support vector machines, and deep learning models for stock price forecasting. The article also discusses the challenges and limitations of using machine learning for financial prediction.",
"This research paper examines the potential effects of social media usage on mental health. It analyzes the correlation between social media engagement and symptoms of anxiety, depression, and loneliness. The paper suggests strategies for promoting responsible social media use and maintaining mental well-being.",
"This report discusses the booming e-commerce industry and its impact on traditional brick-and-mortar retail stores. It analyzes the factors contributing to the growth of online shopping, such as convenience, competitive pricing, and wider product selection. The report also explores how traditional retailers can adapt to this changing landscape.",
"This article delves into the potential applications of artificial intelligence (AI) in the healthcare sector. It discusses how AI can be used for tasks like medical diagnosis, drug discovery, and personalized medicine. The article also highlights the ethical considerations related to AI implementation in healthcare."]

# One hashtag per sample, printed in input order.
for t in text:
  print(hash_tag_text_genetator(t))

#AnalysiLikeTechniqu
#LearnMachinArticl
#MediaSocialMental
#ReportRetailTradit
#AiArticlImplement
