In [1]:
import pandas as pd
# Load the CSV from the working directory; later cells read its `text` column.
df = pd.read_csv(filepath_or_buffer="translated_russian.csv")

In [5]:
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from collections import  Counter


# Bag-of-words settings gathered in one place so they are easy to scan and tune.
vectorizer_params = dict(
    analyzer='word',                  # build features from word tokens
    min_df=3,                         # drop terms found in fewer than 3 documents
    stop_words='english',             # remove English stop words
    lowercase=True,                   # lowercase the text before tokenizing
    token_pattern='[a-zA-Z0-9]{3,}',  # tokens are runs of 3 or more ASCII letters/digits
    max_features=5000,                # cap the vocabulary at 5000 terms
)
vectorizer = CountVectorizer(**vectorizer_params)


In [6]:
# Learn the vocabulary from the `text` column and encode every row as term counts.
data_matrix = vectorizer.fit_transform(df["text"])
data_matrix

<17719x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 209850 stored elements in Compressed Sparse Row format>

In [7]:
# Display the loaded frame; pandas truncates the middle rows of a long frame in the rendered output.
df

Unnamed: 0.1,Unnamed: 0,text
0,1143139,russian invasion in ukraine ukraine invasion k...
1,1143146,russian surrender it makes no sense to die for...
2,1143405,morgenshterh give me a kick durov why doesn't ...
3,1143443,microdistrict danilovka explosion video danilo...
4,1143509,kyiv ukraine ukraine do not plan to evacuate t...
...,...,...
17714,980987,блядьське нато ну не быть гнидой nato close th...
17715,981074,"in kharkiv, the building of the regional polic..."
17716,981157,ukraine received another batch of javelins fro...
17717,981206,"Russians, according to the most modest data fo..."


In [8]:
# LDA groups the corpus into topics; each topic is a weighting over the
# vocabulary, and each document gets a distribution over the topics.
lda_params = {
    "n_components": 5,            # number of topics
    "learning_method": "online",
    "random_state": 20,           # fixed seed so repeated runs give the same topics
    "n_jobs": -1,                 # use all available CPUs
}
lda_model = LatentDirichletAllocation(**lda_params)


In [9]:
# Fit the LDA model on the count matrix and keep the per-document topic weights it returns.
lda_output = lda_model.fit_transform(data_matrix)

In [10]:
# Show the document-topic matrix: one row per document, one column per topic (5 here).
lda_output

array([[0.3285421 , 0.01831504, 0.33857794, 0.29618029, 0.01838462],
       [0.65211615, 0.0902047 , 0.03930622, 0.00718822, 0.21118472],
       [0.8687479 , 0.09081266, 0.01351112, 0.01335093, 0.0135774 ],
       ...,
       [0.20501197, 0.15959982, 0.1448616 , 0.12476632, 0.3657603 ],
       [0.14523969, 0.08198579, 0.44986866, 0.01054108, 0.31236478],
       [0.01538533, 0.54285344, 0.18501419, 0.01538515, 0.24136188]])

In [11]:
# Print the 10 highest-weighted words of every LDA topic.
# Within each list the words are in ascending order of weight, so the
# strongest word of the topic is printed last.
import csv
import warnings
# Silence deprecation notices only; other warnings stay visible.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old method on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# The vocabulary lookup is built once above rather than once per printed word.
for i, topic in enumerate(lda_model.components_):
    print('Top 10 words for topic:', i)
    # argsort() is ascending, so the last 10 positions hold the largest weights.
    print([feature_names[word_idx] for word_idx in topic.argsort()[-10:]])
    print('\n')


Top 10 words for topic: 0
['ukrainian', 'kyiv', 'defense', 'forces', 'know', 'odessa', 'don', 'vsrf', 'russiaukrainewar', 'ukraine']


Top 10 words for topic: 1
['kyiv', 'minister', 'sanctions', 'said', 'georgia', 'people', 'news', 'russia', 'putin', 'ukraine']


Top 10 words for topic: 2
['soldiers', 'army', 'stop', 'military', 'ukrainerussiawar', 'russiaukrainewar', 'war', 'russian', 'russia', 'ukraine']


Top 10 words for topic: 3
['shelling', 'residential', 'kyiv', 'kharkov', 'kharkiv', 'city', 'ukraine', 'region', 'russiaukrainewar', 'russian']


Top 10 words for topic: 4
['standwithukraine', 'stopwar', 'stopputin', 'population', 'people', 'russia', 'speaking', 'putin', 'war', 'ukraine']




From the top words of each topic, we can assign the following labels:

Topic 0 General Ukraine-Russia Conflict

Topic 1 Politics and Media

Topic 2 Soldier Movement

Topic 3 Attack on Kharkov

Topic 4 Public opinions and hashtags.

In [12]:
# Label every document with its dominant topic: take the document-topic
# weights and keep the index of the largest weight in each row.
topic_values = lda_model.transform(data_matrix)
df["TopicNum"] = topic_values.argmax(axis=1)
df.head(n=3)

Unnamed: 0.1,Unnamed: 0,text,TopicNum
0,1143139,russian invasion in ukraine ukraine invasion k...,2
1,1143146,russian surrender it makes no sense to die for...,0
2,1143405,morgenshterh give me a kick durov why doesn't ...,0


In [14]:
# Human-readable label for each LDA topic index, chosen from the topic's top words.
TOPIC_NAMES = {
    0: 'General Ukraine-Russia Conflict',
    1: 'Politics and Media',
    2: 'Soldier Movement',
    3: 'Attack on Kharkov',
    4: 'Public opinions and hashtags',
}

# One vectorized lookup replaces a masked assignment per topic;
# a topic number with no entry in TOPIC_NAMES becomes NaN.
df["TopicName"] = df["TopicNum"].map(TOPIC_NAMES)


In [15]:
# Preview the first ten rows, which now include the TopicNum and TopicName columns.
df.head(10)

Unnamed: 0.1,Unnamed: 0,text,TopicNum,TopicName
0,1143139,russian invasion in ukraine ukraine invasion k...,2,Soldier Movement
1,1143146,russian surrender it makes no sense to die for...,0,General Ukraine-Russia Conflict
2,1143405,morgenshterh give me a kick durov why doesn't ...,0,General Ukraine-Russia Conflict
3,1143443,microdistrict danilovka explosion video danilo...,1,Politics and Media
4,1143509,kyiv ukraine ukraine do not plan to evacuate t...,4,Public opinions and hashtags
5,1143788,"if, after the last nights of defense, someone ...",0,General Ukraine-Russia Conflict
6,1144842,belarus ukraine russia the blow is aimed at th...,4,Public opinions and hashtags
7,1145342,"I know it’s hard to accept it all, but underst...",4,Public opinions and hashtags
8,1145451,this is the attitude to war that the whole wor...,4,Public opinions and hashtags
9,1146108,anonymous group directly appealed to Putin ukr...,4,Public opinions and hashtags
