In [2]:
import pandas as pd

df_it = pd.read_csv("data/ITSecurity.csv")
df_se = pd.read_csv("data/SE.csv")
df_stack_1 = pd.read_csv("data/SO<2016.csv")
df_stack_2 = pd.read_csv("data/SO=2016.csv")
df_stack_3 = pd.read_csv("data/SO>2016.csv")

df_it['File'] = 'ITSecurity'
df_se['File'] = 'SE'
df_stack_1['File'] = 'Stack'
df_stack_2['File'] = 'Stack'
df_stack_3['File'] = 'Stack'

df_combined = pd.concat([df_it, df_se, df_stack_1,
                         df_stack_2, df_stack_3], axis=0)

df_combined.head(10)


Unnamed: 0,Id,PostTypeId,AcceptedAnswerId,ParentId,CreationDate,DeletionDate,Score,ViewCount,Body,OwnerUserId,...,LastActivityDate,Title,Tags,AnswerCount,CommentCount,FavoriteCount,ClosedDate,CommunityOwnedDate,ContentLicense,File
0,13180,1,,,2012-03-28 14:50:18,,0,911,<p>Mod_security has a rules updater distribute...,8552.0,...,2012-03-28 18:50:48,Mod_security rules-updater.pl fails to pull ne...,<apache><mod-security><updates>,1,0,,,,CC BY-SA 3.0,ITSecurity
1,223801,1,,,2020-01-07 09:22:01,,5,1347,<p>Proprietary software developed by a (smalli...,50647.0,...,2020-01-07 10:26:35,Risks of allowing employees using personal Git...,<account-security><github>,1,1,,,,CC BY-SA 4.0,ITSecurity
2,223825,1,223835.0,,2020-01-07 17:50:09,,1,197,<p>I am a junior web developer. All I know is ...,224599.0,...,2020-01-07 21:39:58,Risk of Docker backdoor allowing impersonation,<linux><account-security><forensics><docker><p...,1,1,,,,CC BY-SA 4.0,ITSecurity
3,161027,1,,,2017-06-02 08:45:28,,0,164,<p>As we know most of the malwares create thou...,108636.0,...,2017-09-30 19:21:29,limiting automated domain creation by malware,<malware><ransomware><security-by-design>,2,1,,,,CC BY-SA 3.0,ITSecurity
4,161112,1,161113.0,,2017-06-03 09:41:40,,4,429,<p>So I get these questions quite a lot recent...,148081.0,...,2017-06-05 21:00:45,Cleanware. What does it do and are there secur...,<malware><software><security-theater>,1,1,,,,CC BY-SA 3.0,ITSecurity
5,161142,1,161150.0,,2017-06-03 19:08:27,,2,3625,"<p>so let's say I had a basic socket, doesn't ...",78932.0,...,2017-06-03 21:27:04,Authenticate a user thru a UDP socket,<authentication><server><account-security><udp...,2,4,,,,CC BY-SA 3.0,ITSecurity
6,94070,1,94082.0,,2015-07-16 05:29:58,,12,4756,"<p>As you can see from the tag, I know that <a...",37853.0,...,2015-07-17 12:11:42,Would making an IIS web server appear to be ru...,<webserver><security-theater>,4,6,,,,CC BY-SA 3.0,ITSecurity
7,224090,1,,,2020-01-12 22:32:12,,1,610,<p>Do E-Mail proxy services exists to improve ...,65956.0,...,2023-07-28 18:01:55,E-Mail privacy proxy for hiding real e-mail?,<privacy><email><account-security>,4,2,,,,CC BY-SA 4.0,ITSecurity
8,224115,1,,,2020-01-13 12:47:56,,0,165,"<p><a href=""https://i.stack.imgur.com/8GmSI.pn...",224946.0,...,2020-01-13 13:01:52,Why there are some weird requests to my web ho...,<tls><http><account-security>,1,3,,2020-01-13 14:47:39,,CC BY-SA 4.0,ITSecurity
9,224176,1,224185.0,,2020-01-14 10:48:26,,0,1914,<p>I want to be active on Twitter against our ...,225018.0,...,2020-01-15 15:13:57,Can my oppressive government trace my Twitter ...,<privacy><account-security><web><twitter>,2,4,,,,CC BY-SA 4.0,ITSecurity


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('all')
stop_words = set(stopwords.words('english'))

additional_stopwords = (['using', 'working', 'like', 'could',
                         'set', 'change', 'value', 'get',
                         'load', 'getting', 'new', 'use',
                         'two', 'one', 'running', 'filter'])

stop_words.update(additional_stopwords)

def preprocess_text(text):
    words = word_tokenize(text)
    filtered_words = [word.lower() for word in words if word.isalnum()
                      and word.lower() not in stop_words]
    return ' '.join(filtered_words)

df_combined['processed_text'] = df_combined['Title'].apply(preprocess_text)

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df_combined['processed_text'])

lda = LatentDirichletAllocation(n_components=15, random_state=42)
lda.fit(tfidf_matrix)

def get_top_words_per_topic(model, vectorizer, n_words=10):
    words = vectorizer.get_feature_names_out()
    topic_words = []
    for topic_weights in model.components_:
        top_word_indices = topic_weights.argsort()[-n_words:][::-1]
        top_words = [words[i] for i in top_word_indices]
        topic_words.append(top_words)
    return topic_words

topic_words = get_top_words_per_topic(lda, tfidf_vectorizer)
topic_assignments = lda.transform(tfidf_matrix)

df_combined['CreationDate'] = pd.to_datetime(df_combined['CreationDate'])
df_combined = df_combined.sort_values(by='CreationDate')
df_combined['topic_assignment'] = topic_assignments.argmax(axis=1) + 1

for i, words in enumerate(topic_words):
    print(f"Topic {i+1}: {', '.join(words)}")

print(df_combined[['Title', 'topic_assignment', 'File', 'CreationDate']])

output_file_path = "topics.txt"

with open(output_file_path, "w", encoding="utf-8") as file:
    for i, words in enumerate(topic_words):
        file.write(f"Topic {i+1}: {', '.join(words)}\n")

    file.write(df_combined[['Title', 'topic_assignment',
                            'File', 'CreationDate']].to_string(index=False))


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /home/emily/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/emily/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /home/emily/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /home/emily/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /home/emily/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package bcp47 to /home/emily/nltk_data

Topic 1: policy, content, csp, security, code, javascript, protect, html, script, source
Topic 2: certificate, ssl, grails, security, ldap, implement, spring, issues, plugin, ios
Topic 3: issue, system, directory, failed, security, hash, configure, object, soap, file
Topic 4: spring, security, boot, oauth2, login, authorization, authentication, page, custom, error
Topic 5: service, wcf, azure, web, function, ip, security, windows, safe, time
Topic 6: secure, password, post, ajax, security, form, request, website, symfony, login
Topic 7: key, data, store, android, secure, php, encryption, securely, passwords, app
Topic 8: google, java, security, roles, class, spring, implementation, config, cloud, user
Topic 9: rest, token, spring, authentication, api, jwt, security, oauth, custom, provider
Topic 10: work, spring, mvc, security, exception, test, control, setting, basic, error
Topic 11: security, spring, disable, session, null, logout, level, attack, csrf, applet
Topic 12: authenticate, 