In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis 
import pyLDAvis.sklearn 

  from imp import reload


In [2]:
# Load the cleaned tweets from the CSV at the relative path below.
tweets_path = 'data/tweets_cleaned.csv'
tweets_df = pd.read_csv(tweets_path)
# Preview the first rows as this cell's displayed output.
tweets_df.head()

Unnamed: 0,datetime,tweet_id,text,username,like_count,display_name,lang
0,2022-07-15 23:56:53+00:00,1548094282900221953,government considering creating second state o...,TimesLIVE,85,Times LIVE,en
1,2022-07-15 23:50:41+00:00,1548092722166132738,way fix current energy crisis splitting eskom ...,KingTNgema,0,Born A King,en
2,2022-07-15 23:43:00+00:00,1548090785450434566,awarded pushed zuma brian molefe guptas mine g...,Constitution_94,0,Constitution First 🇿🇦,en
3,2022-07-15 23:39:22+00:00,1548089872287555584,use gold reserve sort eskom sa debt market fac...,mdange39,0,Tupac,en
4,2022-07-15 23:33:36+00:00,1548088419787517957,eskom really realigning sleep schedule,kokovee,0,In ❤ with the Koko,en


In [3]:
# Tokens to drop from the vocabulary, passed to the vectorizer as its stop-word list.
extra_stop_words = [
    'south', 'africa', 'african', 'sa',
    '00', '30', '10', '12',
    'get', 'ke', 'ya', 'le', 'ka',
]

# Bag-of-words counts over the tweet text. max_df / min_df bound the document
# frequency of retained terms (float = proportion of documents, int = absolute
# count, per the CountVectorizer documentation).
bow_vectorizer = CountVectorizer(
    stop_words=extra_stop_words,
    min_df=10,
    max_df=0.5,
)
bow_matrix = bow_vectorizer.fit_transform(tweets_df['text'])

In [4]:
# LDA hyper-parameters: 8 topic components and a fixed random_state.
lda_params = {'n_components': 8, 'random_state': 10}
lda_bow = LatentDirichletAllocation(**lda_params)
# Fit on the bag-of-words matrix; the call stays the last expression so its
# return value remains the cell's displayed output.
lda_bow.fit(bow_matrix)

In [5]:
# Print the highest-weighted vocabulary terms for every fitted LDA topic.
n_top_words = 10

# Look the vocabulary up once: get_feature_names_out() does not depend on the
# topic, so calling it inside the loop repeats the same work for each topic.
feature_names = bow_vectorizer.get_feature_names_out()

for idx, topic in enumerate(lda_bow.components_):
    # argsort() is ascending, so the last n_top_words indices carry the largest
    # weights and the strongest term is printed last.
    top_indices = topic.argsort()[-n_top_words:]
    print(f'Top {n_top_words} words in Topic #{idx}:')
    print([feature_names[i] for i in top_indices])
    print('')

Top 10 words in Topic #0:
['outage', 'hour', 'day', 'still', 'without', 'back', 'since', 'please', 'electricity', 'power']

Top 10 words in Topic #1:
['union', 'money', 'diesel', 'employee', 'price', 'billion', 'increase', 'pay', 'strike', 'worker']

Top 10 words in Topic #2:
['saa', 'country', 'black', 'must', 'people', 'white', 'ceo', 'ruyter', 'de', 'anc']

Top 10 words in Topic #3:
['work', 'know', 'day', 'guy', 'go', 'even', 'people', 'time', 'like', 'loadshedding']

Top 10 words in Topic #4:
['private', 'supply', 'new', 'electricity', 'company', 'country', 'coal', 'need', 'energy', 'power']

Top 10 words in Topic #5:
['capture', 'loadshedding', 'zuma', 'people', 'know', 'like', 'koko', 'anc', 'state', 'problem']

Top 10 words in Topic #6:
['electricity', 'blackout', 'week', 'hour', 'schedule', 'power', 'shedding', 'load', 'loadshedding', 'stage']

Top 10 words in Topic #7:
['cape', '2022', 'news', 'loadshedding', 'station', 'area', 'reduction', 'power', 'shedding', 'load']



In [6]:
# Turn on pyLDAvis notebook display before building the panel.
pyLDAvis.enable_notebook()

# Prepare the visualisation from the fitted model, the document-term matrix,
# and the vectorizer that produced that matrix.
lda_vis = pyLDAvis.sklearn.prepare(lda_bow, bow_matrix, bow_vectorizer)
# Bare name as the last expression so the notebook renders the result.
lda_vis

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [7]:
# Hand-written labels for the 8 LDA topics, listed in topic-id order;
# enumerate() supplies the integer keys 0-7. Topics 0 and 6 share one label.
topics = dict(enumerate([
    'Loadshedding for longer than scheduled',  # 0
    'Eskom workers striking',                  # 1
    'Current Eskom CEO Andre de Ruyter',       # 2
    'Loadshedding times',                      # 3
    'Different ways to generate electricity',  # 4
    'ANC State capture',                       # 5
    'Loadshedding for longer than scheduled',  # 6
    'Loadshedding by city',                    # 7
]))

In [8]:
# Topic scores from the fitted LDA model for every row of bow_matrix
# (one column per topic component).
topic_classifications = lda_bow.transform(bow_matrix)
# Index of the highest-scoring topic in each row. argmax(axis=1) yields the
# same values as np.apply_along_axis(np.argmax, 1, ...) but is vectorised,
# avoiding one Python-level function call per row.
topic_values = topic_classifications.argmax(axis=1)

In [9]:
# Translate each winning topic id into its label; an id missing from `topics`
# raises KeyError.
topic_labels = [topics[topic_id] for topic_id in topic_values]
# Single-column frame of labels, in the same row order as topic_values.
topics_df = pd.DataFrame({'topic': topic_labels})
topics_df.head()

Unnamed: 0,topic
0,Different ways to generate electricity
1,Different ways to generate electricity
2,ANC State capture
3,Different ways to generate electricity
4,Loadshedding times


In [10]:
# Attach each tweet's topic label; the assignment aligns on the row index.
tweets_df['topic'] = topics_df['topic']
# NOTE: plain assignment binds a second name to the same DataFrame object (no
# copy is made), so tweets_df and tweet_topics_df both expose the new column.
tweet_topics_df = tweets_df
tweet_topics_df.head()

Unnamed: 0,datetime,tweet_id,text,username,like_count,display_name,lang,topic
0,2022-07-15 23:56:53+00:00,1548094282900221953,government considering creating second state o...,TimesLIVE,85,Times LIVE,en,Different ways to generate electricity
1,2022-07-15 23:50:41+00:00,1548092722166132738,way fix current energy crisis splitting eskom ...,KingTNgema,0,Born A King,en,Different ways to generate electricity
2,2022-07-15 23:43:00+00:00,1548090785450434566,awarded pushed zuma brian molefe guptas mine g...,Constitution_94,0,Constitution First 🇿🇦,en,ANC State capture
3,2022-07-15 23:39:22+00:00,1548089872287555584,use gold reserve sort eskom sa debt market fac...,mdange39,0,Tupac,en,Different ways to generate electricity
4,2022-07-15 23:33:36+00:00,1548088419787517957,eskom really realigning sleep schedule,kokovee,0,In ❤ with the Koko,en,Loadshedding times


In [11]:
# Persist the classified tweets without the row index. Written through
# tweet_topics_df, the frame that carries the 'topic' column, instead of
# relying on tweets_df having gained that column through an alias.
tweet_topics_df.to_csv('data/twitter_topic_classification.csv', index=False)