In [27]:
import zipfile
# Unpack ru_translated.zip into the "targetdir" directory. The `with` block
# closes the archive handle as soon as extraction is done.
with zipfile.ZipFile("ru_translated.zip", mode="r") as zip_ref:
    zip_ref.extractall(path="targetdir")

In [24]:
# Echo the archive handle to inspect its state once the `with` block has exited.
zip_ref

<zipfile.ZipFile [closed]>

In [28]:
import pandas as pd
# Load the CSV that was extracted into targetdir/ into a DataFrame.
# (An earlier version read 'translated_russian.csv' from the working directory.)
# NOTE(review): the frame carries an "Unnamed: 0" column, presumably a row index
# saved by an earlier to_csv call — confirm before relying on it.
df = pd.read_csv('targetdir/ru_translated.csv')

In [29]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(42195, 2)

In [30]:
# Count the missing values in each column.
df.isna().sum()

Unnamed: 0    0
text          0
dtype: int64

In [31]:
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from collections import  Counter


# Bag-of-words vectorizer that turns the text column into the document-term
# count matrix consumed by LDA.
vectorizer = CountVectorizer(
    analyzer='word',
    lowercase=True,                   # fold text to lowercase before tokenizing
    stop_words='english',             # drop scikit-learn's built-in English stop words
    token_pattern='[a-zA-Z0-9]{3,}',  # a token is a run of 3 or more alphanumeric characters
    min_df=3,                         # ignore terms that appear in fewer than 3 documents
    max_features=5000,                # cap the vocabulary at the 5000 most frequent terms
)


In [33]:
# Keep only the rows whose `text` value is present before vectorizing.
has_text = df['text'].notna()
df = df.loc[has_text]


In [35]:
# Learn the vocabulary from the text column and encode every document as a
# sparse row of term counts; the bare name on the last line echoes the matrix summary.
data_matrix = vectorizer.fit_transform(df['text'])
data_matrix

<42195x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 475534 stored elements in Compressed Sparse Row format>

In [36]:
# Display the frame; pandas elides the middle rows in the rendered output.
df

Unnamed: 0.1,Unnamed: 0,text
0,0,russian invasion in ukraine ukraine invasion k...
1,1,russian surrender it makes no sense to die for...
2,2,morgenshterh give me a kick durov why doesn't ...
3,3,microdistrict danilovka explosion video danilo...
4,4,kyiv ukraine ukraine do not plan to evacuate t...
...,...,...
42190,42190,the day of the full-scale invasion of Russian ...
42191,42191,villagers help APU collect trophies russian uk...
42192,42192,the day of the full-scale invasion of Russian ...
42193,42193,"under the butt an hour ago, our ace pilot shot..."


In [37]:
# Re-check the dimensions after filtering out rows with missing text.
df.shape

(42195, 2)

In [42]:
# Configure a 5-topic LDA model. Once fitted it exposes per-topic word weights
# (`components_`) and can transform documents into per-topic weights.
lda_model = LatentDirichletAllocation(
    n_components=5,            # number of topics to learn
    learning_method='online',  # online (mini-batch) variational updates
    random_state=20,           # fixed seed
    n_jobs=-1,                 # use all available CPUs
)


In [43]:
# Fit the LDA model on the document-term matrix and keep the transformed
# result: one row per document holding one weight per topic.
lda_output = lda_model.fit_transform(data_matrix)

In [44]:
# Preview the document-topic weights (NumPy truncates the middle rows).
lda_output

array([[0.62132702, 0.01829607, 0.20535831, 0.13652239, 0.0184962 ],
       [0.84813064, 0.00690293, 0.0693344 , 0.00691878, 0.06871326],
       [0.08194946, 0.08424428, 0.01333798, 0.13475594, 0.68571235],
       ...,
       [0.5552592 , 0.28853079, 0.06601536, 0.07756831, 0.01262633],
       [0.27926521, 0.00838747, 0.23651872, 0.46745484, 0.00837376],
       [0.43984344, 0.03381962, 0.45864358, 0.03389769, 0.03379567]])

In [45]:
# Show the 10 highest-weighted vocabulary terms for every topic found by LDA.
import csv
import warnings
# NOTE(review): the second filter silences every warning for the rest of the
# session, which makes the first one redundant and can hide real problems.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore")

# Fetch the vocabulary once instead of rebuilding it for every printed word.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# fall back to it only on older releases that lack `get_feature_names_out()`.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

for topic_idx, topic in enumerate(lda_model.components_):
    print('Top 10 words for topic:', topic_idx)
    # argsort is ascending, so the last 10 positions are the heaviest terms,
    # printed from lightest to heaviest.
    print([feature_names[word_idx] for word_idx in topic.argsort()[-10:]])
    print('\n')


Top 10 words for topic: 0
['ukrainerussianwar', 'don', 'stoprussia', 'stopputin', 'standwithukraine', 'stopputinnow', 'war', 'russia', 'putin', 'ukraine']


Top 10 words for topic: 1
['help', 'fuck', 'enemy', 'europe', 'odessa', 'destroyed', 'commander', 'tank', 'nikolaev', 'ukraine']


Top 10 words for topic: 2
['ukraine', 'forces', 'city', 'federation', 'kharkov', 'war', 'ukrainian', 'region', 'russiaukrainewar', 'russian']


Top 10 words for topic: 3
['kharkiv', 'russia', 'georgia', 'support', 'people', 'kyiv', 'war', 'russiaukrainewar', 'vsrf', 'ukraine']


Top 10 words for topic: 4
['putin', 'moscow', 'world', 'people', 'russian', 'war', 'russianukrainianwar', 'news', 'russia', 'ukraine']




Based on the top words of each topic, we can assign the following labels:

Topic 0 Anti War Hashtags

Topic 1 Ground Level Conflicts

Topic 2 Kharkov Conflict

Topic 3 Support Talk

Topic 4 Media Reporting 


In [54]:
# Discard any TopicName column left over from a previous run so the labelling
# cells can be re-executed. `errors='ignore'` makes this a no-op when the column
# does not exist yet (e.g. on a fresh Restart & Run All) instead of a KeyError.
df = df.drop(columns=['TopicName'], errors='ignore')

In [55]:
# Tag every document with its dominant topic, i.e. the column index of the
# largest entry in its row of topic weights.
topic_values = lda_model.transform(data_matrix)
dominant_topic = topic_values.argmax(axis=1)
df['TopicNum'] = dominant_topic
df.head(n=3)

Unnamed: 0.1,Unnamed: 0,text,TopicNum
0,0,russian invasion in ukraine ukraine invasion k...,0
1,1,russian surrender it makes no sense to die for...,0
2,2,morgenshterh give me a kick durov why doesn't ...,4


In [52]:
# Human-readable label for each topic number, chosen from the topic's top words.
TOPIC_NAMES = {
    0: 'Anti War Hashtags',
    1: 'Ground Level Conflicts',
    2: 'Kharkov Conflict',
    3: 'Support Talk',
    4: 'Media Reporting',
}

# Write the matching label into TopicName for the rows of each topic.
for topic_num, topic_name in TOPIC_NAMES.items():
    df.loc[df["TopicNum"] == topic_num, "TopicName"] = topic_name


In [53]:
# Preview the first 10 rows to check the topic number / topic name pairing.
df.head(10)

Unnamed: 0.1,Unnamed: 0,text,TopicNum,TopicName
0,0,russian invasion in ukraine ukraine invasion k...,0,Anti War Hashtags
1,1,russian surrender it makes no sense to die for...,0,Anti War Hashtags
2,2,morgenshterh give me a kick durov why doesn't ...,4,Media Reporting
3,3,microdistrict danilovka explosion video danilo...,0,Anti War Hashtags
4,4,kyiv ukraine ukraine do not plan to evacuate t...,3,Support Talk
5,5,"if, after the last nights of defense, someone ...",0,Anti War Hashtags
6,6,belarus ukraine russia the blow is aimed at th...,4,Media Reporting
7,7,"I know it’s hard to accept it all, but underst...",0,Anti War Hashtags
8,8,this is the attitude to war that the whole wor...,0,Anti War Hashtags
9,9,anonymous group directly appealed to Putin ukr...,0,Anti War Hashtags
