In [105]:
# IMPORTS

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from textblob import TextBlob

import re
import string

# NLTK
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# GENSIM
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel

# SKLEARN
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [24]:
df = pd.read_csv('../data/dataset_twitter-scraper_2023-02-16_09-48-43-259.csv')

  df = pd.read_csv('../data/dataset_twitter-scraper_2023-02-16_09-48-43-259.csv')


In [13]:
df.head()

Unnamed: 0,conversation_id,created_at,favorite_count,full_text,hashtags/0,hashtags/1,hashtags/2,hashtags/3,hashtags/4,hashtags/5,...,user_mentions/47/id_str,user_mentions/47/name,user_mentions/47/screen_name,user_mentions/48/id_str,user_mentions/48/name,user_mentions/48/screen_name,user_mentions/49/id_str,user_mentions/49/name,user_mentions/49/screen_name,view_count
0,1625495400470958082,2023-02-14T14:01:18.000Z,410055,"Happy Valentine’s Day to the one and only, @Mi...",,,,,,,...,,,,,,,,,,16532336.0
1,1625143593286438912,2023-02-13T14:43:21.000Z,28215,"Congratulations to the Kansas City @Chiefs, Pa...",,,,,,,...,,,,,,,,,,2775676.0
2,1623489922438156288,2023-02-09T01:12:15.000Z,9131,@KingJames has been changing the game for 20 y...,,,,,,,...,,,,,,,,,,492644.0
3,1625903851168579585,2023-02-16T06:21:12.000Z,674,@RokoMijic !!,,,,,,,...,,,,,,,,,,443693.0
4,1622770990710235136,2023-02-07T01:35:28.000Z,13821,The scale of devastation after the earthquakes...,,,,,,,...,,,,,,,,,,3117489.0


#### Data preprocessing

Here we define a function that will help us pre-process our data, this includes cleaning the text data, tokenizing, and removing stop words, punctuation and special characters.

In [87]:
nltk.download('stopwords')
custom_stopwords = {'"', "'", "rt", "’", "“", "”", "…", "‘"}

def clean_text(text):
    # Remove URLs
    text = re.sub(r'https\S+', '', text)
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase 
    text = text.lower()
    # Remove stop words
    stop_words = set(stopwords.words('english')) | custom_stopwords
    text_tokens = word_tokenize(text)

    filtered_text = [word for word in text_tokens if word not in stop_words]

    return filtered_text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tristan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Sentiment analysis

The first feature we want to extract from the text is sentiment score. The sentiment score will help us determine the setiment of a user towards a particular topic or a group of people, this will be a particularily helpful feature for the next part.

In [88]:
def get_sentiment(tokens):
    text = ' '.join(tokens)
    blob = TextBlob(text)
    return blob.sentiment.polarity

In [89]:
# Cleaning the text data
df['clean_text'] = df['full_text'].apply(clean_text)

In [90]:
# getting the sentiment score for each Tweet
df['sentiment'] = df['clean_text'].apply(get_sentiment)

In [91]:
df.head(5)

Unnamed: 0,conversation_id,created_at,favorite_count,full_text,hashtags/0,hashtags/1,hashtags/2,hashtags/3,hashtags/4,hashtags/5,...,user_mentions/48/screen_name,user_mentions/49/id_str,user_mentions/49/name,user_mentions/49/screen_name,view_count,clean_text,sentiment,topic_distribution,topic,topic_probabilities
0,1625495400470958082,2023-02-14T14:01:18.000Z,410055,"Happy Valentine’s Day to the one and only, @Mi...",,,,,,,...,,,,,16532336.0,"[happy, valentine, day, one, make, every, day,...",0.8,"[(3, 0.3180154), (18, 0.07663138), (43, 0.0995...",49.0,"{3: 0.3179696, 18: 0.076635316, 43: 0.09959236..."
1,1625143593286438912,2023-02-13T14:43:21.000Z,28215,"Congratulations to the Kansas City @Chiefs, Pa...",,,,,,,...,,,,,2775676.0,"[congratulations, kansas, city, patrick, mahom...",0.526667,"[(23, 0.037654456), (59, 0.8535993), (80, 0.05...",59.0,"{23: 0.037670366, 59: 0.8535833, 80: 0.0554476}"
2,1623489922438156288,2023-02-09T01:12:15.000Z,9131,@KingJames has been changing the game for 20 y...,,,,,,,...,,,,,492644.0,"[changing, game, 20, years, become, leader, co...",-0.4,"[(1, 0.09189626), (8, 0.091813624), (42, 0.092...",98.0,"{1: 0.091896296, 8: 0.09181362, 42: 0.09238735..."
3,1625903851168579585,2023-02-16T06:21:12.000Z,674,@RokoMijic !!,,,,,,,...,,,,,443693.0,[],0.0,[],,
4,1622770990710235136,2023-02-07T01:35:28.000Z,13821,The scale of devastation after the earthquakes...,,,,,,,...,,,,,3117489.0,"[scale, devastation, earthquakes, türkiye, syr...",0.0,"[(24, 0.05606853), (65, 0.64960015), (70, 0.09...",65.0,"{24: 0.056062967, 65: 0.6494405, 70: 0.0932677..."


#### Topic modeling

Topic modeling will allow us to indentify topics or themes in our corpus. Combined with the setiment analysis, we should be able to determine which topics the tweet is negative or psotive about.
Since our text is already pre-processed, there is no need to process it any further.

In [92]:
# Storing all the processed text in a variable
processed_texts = df['clean_text']

# Create a dictionary of terms
dictionary = Dictionary(processed_texts)

# Create a corpus of documents
corpus = [dictionary.doc2bow(text) for text in processed_texts]

In [95]:
# Training the LDA model
lda_model = LdaModel(corpus= corpus, num_topics= 89, id2word= dictionary)

In [96]:
for i, topic in lda_model.show_topics(num_topics=10, num_words=10, formatted=False):
    print('Topic {}: {}'.format(i, ' '.join([w[0] for w in topic])))

Topic 54: stream censorship tweets concerns accountability normal decided rules trouble blame
Topic 28: believe biden enough doesnt chinese stay americans worth matter watching
Topic 71: even important one anything wont loved fully i… opinion ones
Topic 74: amp book woke system shows heard today clearly wish brand
Topic 88: idea arrested tech race hey big criminal general electric seem
Topic 23: going oh isnt right andrew tate food hold id price
Topic 27: see future taking hes could greatest rest link stupid industry
Topic 24: stop 10 republicans try security policy held brought cut helping
Topic 17: days sign set level w despite 26 prince poor success
Topic 81: public two called gender school information completely example governor private


Evaluating the LDA Model

In [97]:
# Calculate perplexity
print('Perplexity: ', lda_model.log_perplexity(corpus))

Perplexity:  -34.02880123662646


In [98]:
# Calculate coherence
coherence_model_lda = CoherenceModel(model= lda_model, texts= processed_texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence: ', coherence_lda)

Coherence:  0.4727003262469014


Now that our LDA model is trained, we will create a new column in our dataframe to store the topic distribution for each Tweet.

In [99]:
# get the topic distributions for each document
doc_topic_dists = lda_model.get_document_topics(corpus)

# create a new column in the dataframe to store the topic distribution for each document
df['topic_distribution'] = doc_topic_dists

# extract the most likely topic for each document
df['topic'] = [max(doc, key=lambda item: item[1])[0] if doc else None for doc in doc_topic_dists]

# extract the topic probabilities for each document
df['topic_probabilities'] = [dict(doc) if doc else None for doc in doc_topic_dists]

In [100]:
df.head(10)

Unnamed: 0,conversation_id,created_at,favorite_count,full_text,hashtags/0,hashtags/1,hashtags/2,hashtags/3,hashtags/4,hashtags/5,...,user_mentions/48/screen_name,user_mentions/49/id_str,user_mentions/49/name,user_mentions/49/screen_name,view_count,clean_text,sentiment,topic_distribution,topic,topic_probabilities
0,1625495400470958082,2023-02-14T14:01:18.000Z,410055,"Happy Valentine’s Day to the one and only, @Mi...",,,,,,,...,,,,,16532336.0,"[happy, valentine, day, one, make, every, day,...",0.8,"[(77, 0.7816087), (80, 0.12063725)]",77,"{77: 0.77987033, 80: 0.12237568}"
1,1625143593286438912,2023-02-13T14:43:21.000Z,28215,"Congratulations to the Kansas City @Chiefs, Pa...",,,,,,,...,,,,,2775676.0,"[congratulations, kansas, city, patrick, mahom...",0.526667,"[(3, 0.110720865), (26, 0.05747989), (27, 0.05...",82,"{3: 0.11072106, 26: 0.057477057, 27: 0.0538410..."
2,1623489922438156288,2023-02-09T01:12:15.000Z,9131,@KingJames has been changing the game for 20 y...,,,,,,,...,,,,,492644.0,"[changing, game, 20, years, become, leader, co...",-0.4,"[(10, 0.069983415), (26, 0.10043924), (40, 0.0...",80,"{10: 0.069958955, 26: 0.10045136, 40: 0.091927..."
3,1625903851168579585,2023-02-16T06:21:12.000Z,674,@RokoMijic !!,,,,,,,...,,,,,443693.0,[],0.0,"[(0, 0.011235955), (1, 0.011235955), (2, 0.011...",0,"{0: 0.011235955, 1: 0.011235955, 2: 0.01123595..."
4,1622770990710235136,2023-02-07T01:35:28.000Z,13821,The scale of devastation after the earthquakes...,,,,,,,...,,,,,3117489.0,"[scale, devastation, earthquakes, türkiye, syr...",0.0,"[(1, 0.054576986), (15, 0.051483113), (39, 0.5...",39,"{1: 0.05456632, 15: 0.051481012, 39: 0.5493744..."
5,1625725531336638464,2023-02-16T06:14:04.000Z,1319,@BillyM2k @cb_doge Well I would hope that most...,,,,,,,...,,,,,83061.0,"[well, would, hope, san, franciscans, agree, p...",0.0,"[(22, 0.15038377), (30, 0.14446957), (31, 0.12...",57,"{22: 0.15049213, 30: 0.14446954, 31: 0.1288767..."
6,1620905250742730756,2023-02-01T22:01:41.000Z,12351,At my last Black History Month celebration at ...,,,,,,,...,,,,,1133433.0,"[last, black, history, month, celebration, whi...",-0.013333,"[(5, 0.04804414), (12, 0.093949236), (13, 0.06...",46,"{5: 0.048043806, 12: 0.09394804, 13: 0.0655410..."
7,1620905250742730756,2023-02-01T22:01:41.000Z,45433,Black History Month is about the shared experi...,,,,,,,...,,,,,3302865.0,"[black, history, month, shared, experience, bl...",-0.111111,"[(12, 0.15354958), (26, 0.07779298), (28, 0.07...",60,"{12: 0.15354933, 26: 0.077792995, 28: 0.077791..."
8,1626097497109311495,2023-02-16T06:07:30.000Z,633,"@wongmjane Yeah, it would be *crazy* to make a...",,,,,,,...,,,,,82777.0,"[yeah, would, crazy, make, ai, like, irl]",-0.6,"[(31, 0.4054175), (44, 0.14446111), (47, 0.168...",31,"{31: 0.40514112, 44: 0.1444611, 47: 0.16917141..."
9,1626097497109311495,2023-02-16T05:57:38.000Z,11043,Sounds eerily like the AI in System Shock that...,,,,,,,...,,,,,1845927.0,"[sounds, eerily, like, ai, system, shock, goes...",0.0,"[(13, 0.3481298), (19, 0.1588142), (53, 0.1016...",13,"{13: 0.34824863, 19: 0.1587027, 53: 0.10160482..."


In [70]:
def plot_coherence_perplexity(texts, corpus, dictionary, start=2, limit=100, step=1):
    coherence_scores = []
    perplexity_scores = []
    for num_topics in range(start, limit, step):
        lda_model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_score = coherence_model_lda.get_coherence()
        coherence_scores.append(coherence_score)
        perplexity_scores.append(lda_model.log_perplexity(corpus))
        print('Number of Topics:', num_topics, '  Coherence Score:', coherence_score, '  Perplexity Score:', lda_model.log_perplexity(corpus))
    
    # plot the coherence and perplexity scores
    x = range(start, limit, step)
    fig, ax1 = plt.subplots()
    ax1.plot(x, coherence_scores, color='blue')
    ax1.set_xlabel('Number of Topics')
    ax1.set_ylabel('Coherence Score', color='blue')
    ax2 = ax1.twinx()
    ax2.plot(x, perplexity_scores, color='red')
    ax2.set_ylabel('Perplexity Score', color='red')
    plt.show()

In [72]:
plot_coherence_perplexity(processed_texts, corpus, dictionary)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\Tristan\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\IPython\core\interactiveshell.py", line 3378, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\Tristan\AppData\Local\Temp\ipykernel_10044\448380629.py", line 1, in <module>
    plot_coherence_perplexity(processed_texts, corpus, dictionary)
  File "C:\Users\Tristan\AppData\Local\Temp\ipykernel_10044\3499039388.py", line 5, in plot_coherence_perplexity
    lda_model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
  File "C:\Users\Tristan\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\gensim\models\ldamodel.py", line 521, in __init__
    self.update(corpus, chunks_as_numpy=use_numpy)
  File "C:\Users\Tristan\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8

89 n topics looks good

In [106]:
# Convert a string representation of a topic distribution to a NumPy array
def parse_topic_distribution(topic_dist_str):
    topic_probs = np.zeros(100)  # Assumes there are 100 topics
    for topic, prob in eval(topic_dist_str):
        topic_probs[topic] = prob
    return topic_probs

#### Classification

Now that we have processed the data to produce relevant features, we want to train a machine learning algorithm using those features


In [110]:
df['topic_distribution'] = df['topic_distribution'].astype('object')
# Create a new column in the dataframe with the topic distributions as NumPy arrays
df['topic_probs'] = df['topic_distribution'].apply(parse_topic_distribution)

TypeError: eval() arg 1 must be a string, bytes or code object

In [104]:
# Define the features and target variable
X = df[['topic_distribution', 'sentiment']]
y = df['topic']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define and fit the SVM model
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)

# Make predictions on the testing data and evaluate the performance of the model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

ValueError: setting an array element with a sequence.