In [None]:
# Exercise sheet 8 with the  biden.csv tweet before election. Load the data set into your console.

# Task 1
# In moodle you will find the file biden.csv. It contains every tweet in the month prior to the U.S. presidential election in 2020 containing the hash tag #joebiden. 
# Load the file into your console. It contains each tweet in the “tweet” column and the date of the tweet’s creation in the “created at” column.

# We are interested in how the topics of the tweets develop over time. For this, we will train a dynamic topic model called RollingLDA on the speeches and compare the resulting topical changes in the following tasks.

In [2]:
import pandas as pd

# Load the dataset
file_path = '/Users/oayanwale/Downloads/NLP_Exercise_24_25/Data/biden.csv'
biden_df = pd.read_csv(file_path)

# Display the first few rows of the dataframe and check relevant columns
print(biden_df[['tweet', 'created_at']].head())

                                               tweet           created_at
0  #ElectionNight #MSNBC2020 #IVoted #Biden2020 #...  2020-11-03 23:41:02
1  Go Headlines: #TopNews Of The Hour\n#USElectio...  2020-11-04 09:10:42
2  I doubt the person(s) who stole our official B...  2020-10-18 13:25:15
3  The Bidens are safe so long as Fox News is the...  2020-10-21 09:30:52
4  Since I live in a republican state TIME FOR ME...  2020-11-07 17:58:18


In [4]:
# option 2 loading data

import pandas as pd

# Define file path
data_folder = "/Users/oayanwale/Downloads/NLP_Exercise_24_25/Data/"
biden_path = f"{data_folder}/biden.csv"

# Load dataset
biden_df = pd.read_csv(biden_path, parse_dates=["created_at"])  # Ensure date is in correct format

# Display basic info
print(biden_df.head())
print(biden_df.info())


           created_at                                              tweet
0 2020-11-03 23:41:02  #ElectionNight #MSNBC2020 #IVoted #Biden2020 #...
1 2020-11-04 09:10:42  Go Headlines: #TopNews Of The Hour\n#USElectio...
2 2020-10-18 13:25:15  I doubt the person(s) who stole our official B...
3 2020-10-21 09:30:52  The Bidens are safe so long as Fox News is the...
4 2020-11-07 17:58:18  Since I live in a republican state TIME FOR ME...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129996 entries, 0 to 129995
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   created_at  129996 non-null  datetime64[ns]
 1   tweet       129996 non-null  object        
dtypes: datetime64[ns](1), object(1)
memory usage: 2.0+ MB
None


# Task 2
# Remove unwanted fragments that are not relevant for our analysis. 
# Preprocess the texts so that they are fit for an analysis. 
# Argue the use the preprocessing steps you take for the given analysis.

In [5]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# Ensure necessary resources are available
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Remove punctuation and numbers, keeping only alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    
    # Tokenize (split into words)
    words = text.split()
    
    # Remove stopwords and lemmatize remaining words
    processed_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    
    return ' '.join(processed_words)

# Apply preprocessing to tweet texts column (assuming it's named 'tweet')
biden_df['processed_tweet'] = biden_df['tweet'].apply(preprocess_text)

# Display processed tweets for verification
print(biden_df[['tweet', 'processed_tweet']].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/oayanwale/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/oayanwale/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/oayanwale/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                               tweet  \
0  #ElectionNight #MSNBC2020 #IVoted #Biden2020 #...   
1  Go Headlines: #TopNews Of The Hour\n#USElectio...   
2  I doubt the person(s) who stole our official B...   
3  The Bidens are safe so long as Fox News is the...   
4  Since I live in a republican state TIME FOR ME...   

                                     processed_tweet  
0  electionnight msnbc ivoted biden bluewave vote...  
1  go headline topnews hour uselection trump accu...  
2  doubt person stole official biden harris sign ...  
3  bidens safe long fox news news outlet reportin...  
4  since live republican state time start harassi...  


# Since we want meaningful topic modeling, we must preprocess:
✅ Remove non-text fragments (RT @user, links, hashtags). 
{which are often not relevant for sentiment analysis or topic modeling since they don't contribute to the semantic meaning}.

✅ Convert to lowercase. 
{ensures uniformity in word representation (e.g., "Vote" and "vote" are treated as the same word)}.

✅ Remove punctuation, numbers, and special characters. 
{were effectively stripped out, leading to cleaner tokens that focus on meaningful words}.

✅ Tokenize, remove stopwords, and lemmatize words. 
{The transformation into tokenized words allows for easier manipulation during analysis, especially when using models like LDA that require document-term matrices.}

{Common stop words (like "the", "is", "and") were likely filtered out. This is crucial because such words do not add significant meaning to topics or sentiments being analyzed.}

✅ Lemmatization: Reducing words to their base forms helps in consolidating different variations of a word (e.g., "running" becomes "run"), which is important for accurately capturing themes in topic modeling.

In [18]:
# option 2 

import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Ensure required downloads
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = str(text).lower()  # Lowercase
    text = re.sub(r"http\S+|www\S+|@\S+|#\S+", "", text)  # Remove URLs, mentions, hashtags
    text = re.sub(r"[^a-z\s]", "", text)  # Remove punctuation & numbers
    words = word_tokenize(text)  # Tokenize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # Remove stopwords & lemmatize
    #return " ".join(words)
    return words  # RETURN LIST (not a string) for rollinglda

# Apply preprocessing
biden_df["processed_tweet"] = biden_df["tweet"].apply(preprocess_text)

# Print sample
print(biden_df[["tweet", "processed_tweet"]].head())



[nltk_data] Downloading package punkt to /Users/oayanwale/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/oayanwale/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/oayanwale/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                               tweet  \
0  #ElectionNight #MSNBC2020 #IVoted #Biden2020 #...   
1  Go Headlines: #TopNews Of The Hour\n#USElectio...   
2  I doubt the person(s) who stole our official B...   
3  The Bidens are safe so long as Fox News is the...   
4  Since I live in a republican state TIME FOR ME...   

                                     processed_tweet  
0                                                 []  
1  [go, headline, hour, accuses, campaign, fraud,...  
2  [doubt, person, stole, official, bidenharris, ...  
3  [bidens, safe, long, fox, news, news, outlet, ...  
4  [since, live, republican, state, time, start, ...  


# Arguing for These Preprocessing Steps
When arguing for our preprocessing steps in an analysis context, consider emphasizing:

Relevance to Objectives: Each step taken aligns with our goal of analyzing emotions or topics within tweets by focusing on meaningful content while removing noise.

Model Efficiency: By reducing dimensionality through removing irrelevant fragments and stop words, we improve computational efficiency and model performance.

Data Quality Improvement: Effective preprocessing enhances data quality by ensuring that only relevant information is retained for analysis, leading to more accurate insights.

Conclusion
Overall, our preprocessing steps appear well-suited for preparing tweet data for further analysis using techniques like LDA or sentiment classification.

# Task 3
# Train a normal LDA on the entire corpus with K = 30.


In [7]:
pip install gensim

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [38]:
# Train normal LDA using Gensim

from gensim import corpora, models

# Convert processed tweets into lists of tokenized words.
tokenized_texts = [text.split() for text in biden_df['processed_tweet']]

# Create a dictionary representation of the documents.
dictionary = corpora.Dictionary(tokenized_texts)

# Create a bag-of-words corpus from dictionary representation.
corpus = [dictionary.doc2bow(text) for text in tokenized_texts]

# Train an LDA model with K=30 topics.
lda_model_normal = models.LdaModel(corpus, num_topics=30, id2word=dictionary, passes=200)

# Display topics found by the model:
topics_normal = lda_model_normal.print_topics(num_words=5)
for topic in topics_normal:
    print(topic)

(12, '0.261*"#" + 0.157*"election" + 0.055*"#joebiden" + 0.041*"#biden" + 0.023*"#china"')
(7, '0.184*"president" + 0.071*"#joebiden" + 0.041*"first" + 0.038*"u" + 0.038*"vice"')
(29, '0.093*"good" + 0.066*"love" + 0.054*"call" + 0.052*"hope" + 0.051*"#joebiden"')
(19, '0.077*"really" + 0.050*"@" + 0.043*"bye" + 0.040*"fact" + 0.032*"hate"')
(27, '0.270*"amp" + 0.039*"black" + 0.035*"called" + 0.023*"number" + 0.022*"actually"')
(18, '0.078*"news" + 0.051*"medium" + 0.039*"#biden" + 0.033*"war" + 0.031*"#byedon"')
(3, '0.052*"big" + 0.044*"family" + 0.043*"become" + 0.037*"got" + 0.036*"around"')
(4, '0.073*"elected" + 0.071*"#biden" + 0.070*"#vote" + 0.053*"#bidenharris" + 0.047*"winner"')
(26, '0.121*"usa" + 0.046*"india" + 0.037*"#bidenharristoendthisnightmare" + 0.032*"never" + 0.028*"st"')
(16, '0.123*"#uselection" + 0.099*"#joebiden" + 0.077*"#donaldtrump" + 0.059*"#usa" + 0.058*"#america"')
(0, '0.081*"u" + 0.065*"election" + 0.061*"trump" + 0.053*"#biden" + 0.053*"#joebiden"')


# Task 4
# Train a RollingLDA on the corpus. Set the time chunk length to three days and choose K = 30. If this takes a lot of time, chose prototype=1 and lower the epoch count.

In [9]:
pip install ttta


Defaulting to user installation because normal site-packages is not writeable
Collecting ttta
  Downloading ttta-0.9.5.tar.gz (90 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting Cython (from ttta)
  Using cached Cython-3.0.11-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting spacy (from ttta)
  Downloading spacy-3.8.3-cp39-cp39-macosx_11_0_arm64.whl.metadata (27 kB)
Collecting xmltodict (from ttta)
  Downloading xmltodict-0.14.2-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting transformers>=4.46.3 (from ttta)
  Downloading transformers-4.48.2-py3-none-any.whl.metadata (44 kB)
Collecting torch (from ttta)
  Downloading torch-2.6.0-cp39-none-macosx_11_0_arm64.whl.metadata (28 kB)
Collecting HanTa (from ttta)
  Downloading HanTa-1.1.1-py3-none-any.whl.metadata (3.4 kB)
Collecting wasabi (from ttta)
  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)

In [16]:
import pandas as pd
import numpy as np
from ttta.methods.rolling_lda import RollingLDA
from gensim import corpora
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Ensure necessary downloads
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")


[nltk_data] Downloading package punkt to /Users/oayanwale/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/oayanwale/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/oayanwale/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
help(RollingLDA)

Help on class RollingLDA in module ttta.methods.rolling_lda:

class RollingLDA(builtins.object)
 |  RollingLDA(K: int, how: Union[str, List[datetime.datetime]] = 'ME', warmup: int = 48, memory: int = 3, alpha: float = None, gamma: float = None, initial_epochs: int = 100, subsequent_epochs: int = 50, min_count: int = 2, max_assign=False, prototype: int = 10, topic_threshold: List[Union[int, float]] = None, prototype_measure: Union[str, Callable] = 'jaccard', lda: ttta.methods.lda_prototype.LDAPrototype = None, min_docs_per_chunk: int = None, verbose: int = 1, seed: Union[int, numpy.uint32] = None) -> None
 |  
 |  Implements a rolling LDA model for diachronic topic modeling.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, K: int, how: Union[str, List[datetime.datetime]] = 'ME', warmup: int = 48, memory: int = 3, alpha: float = None, gamma: float = None, initial_epochs: int = 100, subsequent_epochs: int = 50, min_count: int = 2, max_assign=False, prototype: int = 10, topic_thresho

In [28]:
from ttta.methods import rolling_lda

# Set parameters for Rolling LDA
time_chunk_length_days = 3  # Length of time chunk in days
how = f'{time_chunk_length_days}D'  # Use '3D' for three-day chunks

# Check how many unique dates are in your dataset
unique_dates = biden_df['created_at'].nunique()
print("Unique dates in dataset:", unique_dates)

# Adjust warmup if necessary based on unique dates
rolling_lda_model = rolling_lda.RollingLDA(K=30,
                                            how=how,  
                                            warmup=min(5, unique_dates - 1),  # Ensure warmup is less than available chunks
                                            memory=3)

# Fit the model using the entire DataFrame and specify column names
rolling_lda_model.fit(biden_df, 
                       workers=1, 
                       text_column='processed_tweet', 
                       date_column='created_at')

Unique dates in dataset: 117243


Processing 8937 documents in chunk 2020-10-30: : 0chunk [00:00, ?chunk/s]


IndexError: list index out of range

In [32]:
# Check if biden_df has any rows
print("Number of tweets:", len(biden_df))

Number of tweets: 129996


In [33]:
# Check processed_tweet column for any entries
print("Processed Tweets Sample:\n", biden_df['processed_tweet'].head())

Processed Tweets Sample:
 86581                                                   []
10717    [twitter, manipulate, u, election, favor, ccp,...
11909    [fbi, allegedly, obtained, hunter, biden, comp...
69045    [isnt, sellout, long, black, people, going, vo...
34453               [im, going, share, thing, like, biden]
Name: processed_tweet, dtype: object


In [34]:
# Find indices of empty processed tweets
empty_processed_indices = biden_df[biden_df['processed_tweet'].map(len) == 0].index

# Display the original tweets corresponding to these indices
empty_tweets = biden_df.loc[empty_processed_indices]
print("Original Tweets with Empty Processed Results:\n", empty_tweets[['tweet', 'processed_tweet']])

Original Tweets with Empty Processed Results:
                                                     tweet processed_tweet
86581   #censorship #HunterBiden #Biden #BidenEmails #...              []
88313            #IOWA FOR #BIDEN https://t.co/sZbgS5DSVg              []
72578   @JoeBiden #Truth #COVID19 #Trump #Biden #Harri...              []
93156   #trump #biden #nypost #HunterBiden #ElectionTw...              []
57510   #Biden/Harris2020 \n#VoteLikeYourLifeDependsOn...              []
...                                                   ...             ...
77096     #MAGA #BIDEN #TRUMPLOST https://t.co/eGHaSPnqHL              []
10383   #USElectionResults2020 #USElection #JoeBiden #...              []
61998   https://t.co/9pTKZHPxed\n\n#Election2020result...              []
129808  🙏🙌🤩🤩 #Biden #Harris \n#USAElections2020 🇺🇸 htt...              []
63212   @realDonaldTrump #Celebration #Biden https://t...              []

[7722 rows x 2 columns]


# Problem: 7,722 tweets with empty processed results
The original tweets contain hashtags, mentions, and links, which are likely being removed during preprocessing. This is why the processed text results in empty lists.

# Solution
adjust preprocessing function to ensure that meaningful content is retained even when it contains hashtags or mentions.

1. Retain Hashtags and Mentions: Instead of removing all non-alphabetic characters, you could modify your regex to keep hashtags and mentions as they may carry sentiment or meaning.

2. Custom Stop Words: If certain words are being filtered out but are important (like "u"), consider adding them back into your processing logic.


# Adjustment to Task 2

In [35]:
# # Adjustment to Task 2: Revised Preprocessing Function

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Remove URLs but retain hashtags and mentions
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    
    # Keep hashtags and mentions; only remove punctuation (except for '#' and '@')
    text = re.sub(r'[^a-zA-Z\s#@]', ' ', text)
    
    # Tokenize (split into words)
    words = text.split()
    
    # Remove stopwords while keeping meaningful terms like 'u'
    processed_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    return ' '.join(processed_words)

# Reapply preprocessing after adjustments
biden_df['processed_tweet'] = biden_df['tweet'].apply(preprocess_text)

# Verify again after reprocessing
print(biden_df[['tweet', 'processed_tweet']].head())

                                                   tweet  \
86581  #censorship #HunterBiden #Biden #BidenEmails #...   
10717  In 2020, #NYPost is being #censorship #CENSORE...   
11909  FBI Allegedly Obtained Hunter Biden Computer, ...   
69045  #IceCube isn’t a sellout how long are black pe...   
34453  I’m going to share things I like about Biden m...   

                                         processed_tweet  
86581  #censorship #hunterbiden #biden #bidenemails #...  
10717  #nypost #censorship #censored twitter manipula...  
11909  fbi allegedly obtained hunter biden computer d...  
69045  #icecube sellout long black people going vote ...  
34453  going share thing like biden #bidencares #bide...  


In [36]:
# Check processed_tweet column for any entries
print("Processed Tweets Sample:\n", biden_df['processed_tweet'].head())

Processed Tweets Sample:
 86581    #censorship #hunterbiden #biden #bidenemails #...
10717    #nypost #censorship #censored twitter manipula...
11909    fbi allegedly obtained hunter biden computer d...
69045    #icecube sellout long black people going vote ...
34453    going share thing like biden #bidencares #bide...
Name: processed_tweet, dtype: object


# rerun Task 3: Train a Normal LDA Model above 
# now we will rerun Task 4 

In [20]:
# Count total number of chunks
biden_df["days_since_start"] = (biden_df["created_at"] - biden_df["created_at"].min()).dt.days
total_chunks = biden_df["days_since_start"].max() // 3  # Divide by 3 to get 3-day chunks

print(f"Total available time chunks: {total_chunks}")


Total available time chunks: 8


In [30]:
print(biden_df["processed_tweet"].apply(type).value_counts())  # Should all be <class 'list'>
print(biden_df["processed_tweet"].head())  # Check if they contain lists of words


processed_tweet
<class 'list'>    129996
Name: count, dtype: int64
86581                                                   []
10717    [twitter, manipulate, u, election, favor, ccp,...
11909    [fbi, allegedly, obtained, hunter, biden, comp...
69045    [isnt, sellout, long, black, people, going, vo...
34453               [im, going, share, thing, like, biden]
Name: processed_tweet, dtype: object


The error message TypeError: The elements of the 'texts' column of texts must each contain a tokenized document as a list of strings! indicates that the Rolling LDA model expects the processed_tweet column in your DataFrame to contain lists of tokens (i.e., words), but it seems it is currently formatted as a single string.

Steps to Fix
Ensure Tokenization: When you preprocess your tweets, make sure that the processed text is stored as a list of words instead of a single string.
Update Preprocessing Function: Modify the preprocessing function so that it retains the tokenized format.


In [56]:
# Load the dataset (biden.csv)
file_path = '/Users/oayanwale/Downloads/NLP_Exercise_24_25/Data/biden.csv'
biden_df = pd.read_csv(file_path)

# Ensure 'created at' is in datetime format
biden_df['created_at'] = pd.to_datetime(biden_df['created_at'])

# Preprocess tweets as previously defined.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Remove URLs and punctuation/numbers, keeping only alphabetic characters and spaces
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)  # Keep only letters and spaces
    
    # Tokenize (split into words)
    words = text.split()
    
    # Remove stopwords and lemmatize remaining words
    processed_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    
    return processed_words  # Return as a list of tokens

# Apply preprocessing to tweet texts column (assuming it's named 'tweet')
biden_df['processed_tweet'] = biden_df['tweet'].apply(preprocess_text)

# Check if processed_tweet contains lists of tokens correctly formatted:
print(biden_df[['tweet', 'processed_tweet']].head())


                                               tweet  \
0  #ElectionNight #MSNBC2020 #IVoted #Biden2020 #...   
1  Go Headlines: #TopNews Of The Hour\n#USElectio...   
2  I doubt the person(s) who stole our official B...   
3  The Bidens are safe so long as Fox News is the...   
4  Since I live in a republican state TIME FOR ME...   

                                     processed_tweet  
0  [electionnight, msnbc, ivoted, biden, bluewave...  
1  [go, headline, topnews, hour, uselection, trum...  
2  [doubt, person, stole, official, biden, harris...  
3  [bidens, safe, long, fox, news, news, outlet, ...  
4  [since, live, republican, state, time, start, ...  


In [57]:
from ttta.methods import rolling_lda  # Ensure this imports correctly

# Set parameters for Rolling LDA
time_chunk_length_days = 3  # Length of time chunk in days

# Initialize Rolling LDA model with appropriate parameters.
rolling_lda_model = rolling_lda.RollingLDA(K=30,
                                            how=f'{time_chunk_length_days}D',  
                                            warmup=5,   
                                            memory=3)

# Fit the model using the entire DataFrame and specify column names 
rolling_lda_model.fit(biden_df, 
                       workers=1, 
                       text_column='processed_tweet', 
                       date_column='created_at')

# Display top terms from fitted model:
print("Topics from Rolling LDA model:")
for i in range(30):  # Assuming K=30 topics
    print(f"Topic {i + 1}: {rolling_lda_model.get_top_words(topic=i)}")

Processing 24653 documents in chunk 2020-11-02: : 1chunk [00:56, 56.39s/chunk]


TypeError: 'coo_matrix' object is not subscriptable

In [59]:
# Set parameters for Rolling LDA
time_chunk_length_days = 3  # Length of time chunk in days

# Initialize Rolling LDA model with appropriate parameters
rolling_lda_model = rolling_lda.RollingLDA(K=30,
                                           how=f'{time_chunk_length_days}D',  
                                           warmup=5,   
                                           memory=3)

# Helper function to convert the DTM to a dense format before fitting
def convert_dtm_to_dense(dtm):
    return dtm.todense()  # Convert to dense matrix for easier access

# Helper function to fit the model with conversion to dense DTM
def fit_rolling_lda_with_dense_dtm(model, data, workers, text_column, date_column):
    # Ensure DTM is in dense format before fitting
    model._dtm = convert_dtm_to_dense(model._dtm)
    
    # Fit the model on the data
    model.fit(data, workers=workers, text_column=text_column, date_column=date_column)
    
    # No need to convert DTM after fitting since it's already in dense format
    return model

# Fit the model
fit_rolling_lda_with_dense_dtm(rolling_lda_model, biden_df, workers=1, text_column='processed_tweet', date_column='created_at')

# Display top terms from fitted model:
print("Topics from Rolling LDA model:")
for i in range(30):  # Assuming K=30 topics
    print(f"Topic {i + 1}: {rolling_lda_model.get_top_words(topic=i)}")


AttributeError: 'RollingLDA' object has no attribute '_dtm'

In [None]:
import pandas as pd
import re
import nltk
import gensim
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
# Ensure required NLTK resources are available
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


# Load the dataset
tweets_file = "/Users/user/Downloads/NLP/NLP_Exercise/Data/biden.csv"
tweets_data = pd.read_csv(tweets_file)

# Preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|@\w+|#\w+', '', text)  # Remove URLs, hashtags, mentions
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Remove stopwords
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens
# Apply preprocessing
tweets_data['clean_text'] = tweets_data['tweet'].apply(preprocess_text)
# Convert text into Bag of Words (BoW)
dictionary = Dictionary(tweets_data['clean_text'])
corpus = [dictionary.doc2bow(text) for text in tweets_data['clean_text']]

# Train LDA model
num_topics = 30
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)


# Print top words for each topic
def display_topics(model, num_words):
    for idx, topic in model.show_topics(formatted=False, num_words=num_words):
        print(f"Topic {idx+1}: {', '.join([word for word, _ in topic])}")
print("LDA Topics:")
display_topics(lda_model, 10)

# Plot topic distribution
topic_distribution = [max(lda_model[doc], key=lambda x: x[1])[0] for doc in corpus]
plt.hist(topic_distribution, bins=num_topics, alpha=0.75)
plt.xlabel("Topics")
plt.ylabel("Tweet Count")
plt.title("LDA Topic Distribution")
plt.show()
 





