In [2]:
import json
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

# Function to load data from a JSONL file
def load_data(file_path):
    tweets = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            tweet = json.loads(line)
            tweets.append(tweet['text'])  # Assuming 'text' is the field containing the tweet text
    return tweets

# Function to preprocess text
def preprocess_texts(texts):
    stop_words = stopwords.words('english')
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

# Update these file paths according to your data files' location
file_paths = [
    r'C:\Users\Taraneh\OneDrive - North Dakota University System\Desktop\New folder (3)\TweetData_JSONL_Starting2020_04_01_To2020_04_30_Retrieved27082020_By1133.jsonl',
    r'C:\Users\Taraneh\OneDrive - North Dakota University System\Desktop\New folder (3)\TweetData_JSONL_Starting2020_05_01_To2020_05_31_Retrieved27082020_By1136.jsonl',
    r'C:\Users\Taraneh\OneDrive - North Dakota University System\Desktop\New folder (3)\TweetData_JSONL_Starting2020_06_01_To2020_06_30_Retrieved27082020_By1137.jsonl',
    r'C:\Users\Taraneh\OneDrive - North Dakota University System\Desktop\New folder (3)\TweetData_JSONL_Starting2020_07_01_To2020_07_31_Retrieved27082020_By1138.jsonl',
    r'C:\Users\Taraneh\OneDrive - North Dakota University System\Desktop\New folder (3)\TweetData_JSONL_Starting2020_08_01_To2020_08_27_Retrieved27082020_By1139.jsonl'
]

tweets = []
for file_path in file_paths:
    tweets.extend(load_data(file_path))

preprocessed_texts = preprocess_texts(tweets)

# Create Dictionary and Corpus
id2word = corpora.Dictionary(preprocessed_texts)
corpus = [id2word.doc2bow(text) for text in preprocessed_texts]

# Parameters for LDA
num_topics = 3
alpha = 0.01
eta = 0.01
passes = 50

# Train LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=100,
                                       chunksize=100,
                                       passes=passes,
                                       alpha=alpha,
                                       eta=eta)

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=preprocessed_texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score:', coherence_lda)

# Explore the topics
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Taraneh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Coherence Score: 0.3258075286277704
Topic: 0 
Words: 0.090*"co" + 0.089*"https" + 0.025*"covid" + 0.017*"uber" + 0.008*"coronavirus" + 0.007*"lyft" + 0.007*"flight" + 0.005*"bus" + 0.005*"bike" + 0.005*"airline"
Topic: 1 
Words: 0.089*"co" + 0.089*"https" + 0.038*"uber" + 0.023*"covid" + 0.020*"drivers" + 0.016*"bike" + 0.016*"coronavirus" + 0.011*"lyft" + 0.011*"face" + 0.010*"masks"
Topic: 2 
Words: 0.090*"co" + 0.090*"https" + 0.029*"covid" + 0.028*"airline" + 0.013*"aviation" + 0.013*"industry" + 0.010*"airport" + 0.010*"news" + 0.009*"latest" + 0.008*"keep"


In [3]:
import re
import json
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

# Improved preprocessing function
def preprocess_texts(texts):
    stop_words = stopwords.words('english')
    # Additional stopwords can be added to this list
    stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
    texts_out = []
    for text in texts:
        text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)  # Remove URLs
        text = re.sub(r'@\w+', '', text)  # Remove mentions
        text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
        text = re.sub(r"\'", "", text)  # Remove single quotes
        text = gensim.utils.simple_preprocess(str(text), deacc=True)  # De-accent and simple preprocess
        texts_out.append([word for word in text if word not in stop_words and len(word) > 3])  # Remove stopwords and short words
    return texts_out

# Load data (assuming tweets have been loaded into the `tweets` list as before)
preprocessed_texts = preprocess_texts(tweets)

# Create Dictionary and Corpus, including filtering extremes
id2word = corpora.Dictionary(preprocessed_texts)
id2word.filter_extremes(no_below=5, no_above=0.5)  # Filter out words in less than 5 documents or in more than 50% of the documents
corpus = [id2word.doc2bow(text) for text in preprocessed_texts]

# Train LDA model with new parameters
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=5,  # Trying a different number of topics
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,  # Reduced number of passes for quicker iteration
                                       alpha='asymmetric',  # Different alpha setting
                                       eta='auto')  # Auto-tune eta

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=preprocessed_texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score:', coherence_lda)

# Explore the topics
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Taraneh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Coherence Score: 0.37193319898806626
Topic: 0 
Words: 0.049*"covid" + 0.029*"airline" + 0.017*"flight" + 0.015*"pandemic" + 0.014*"coronavirus" + 0.013*"aviation" + 0.013*"industry" + 0.011*"news" + 0.011*"subway" + 0.011*"keep"
Topic: 1 
Words: 0.079*"uber" + 0.046*"covid" + 0.036*"drivers" + 0.029*"lyft" + 0.022*"coronavirus" + 0.017*"mask" + 0.015*"masks" + 0.013*"wear" + 0.013*"driver" + 0.012*"service"
Topic: 2 
Words: 0.056*"covid" + 0.039*"airline" + 0.034*"thank" + 0.022*"critical" + 0.020*"emergency" + 0.020*"fund" + 0.019*"demand" + 0.018*"whitehouse" + 0.018*"cares" + 0.018*"uscongress"
Topic: 3 
Words: 0.055*"bike" + 0.028*"social" + 0.024*"ride" + 0.023*"free" + 0.021*"bicycles" + 0.021*"distancing" + 0.019*"exercise" + 0.018*"change" + 0.017*"people" + 0.017*"great"
Topic: 4 
Words: 0.043*"uber" + 0.029*"similar" + 0.028*"tesla" + 0.028*"market" + 0.027*"university" + 0.026*"disrupted" + 0.026*"phoenix" + 0.022*"consider" + 0.020*"services" + 0.020*"suggest"
