In [4]:
import numpy as np
import pandas as pd

import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split

In [2]:
spam_dataset = pd.read_csv('spam.csv', encoding = "ISO-8859-1", usecols=[0, 1], names=['Spam', 'Text'],
                           skiprows=1)
spam_dataset['Spam'] = spam_dataset['Spam'].map({'ham': 0, 'spam': 1})

In [29]:
stopwords = set(nltk.corpus.stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def lemmatize_text(spam_text):
    processed_text = []
    for text in spam_text:
        text = text.lower()
        text = ''.join([char for char in text
                if char not in string.punctuation])
        text = word_tokenize(text)
        text = [t for t in text if t not in stopwords]
        text = [lemmatizer.lemmatize(t) for t in text]
        processed_text.append(text)
    return processed_text

In [30]:
spam_dataset['Lemmatized_Text'] = lemmatize_text(spam_dataset['Text'])

In [31]:
spam_dataset['Lemmatized_Text']

0       [go, jurong, point, crazy, available, bugis, n...
1                          [ok, lar, joking, wif, u, oni]
2       [free, entry, 2, wkly, comp, win, fa, cup, fin...
3           [u, dun, say, early, hor, u, c, already, say]
4       [nah, dont, think, go, usf, life, around, though]
                              ...                        
5567    [2nd, time, tried, 2, contact, u, u, å£750, po...
5568                   [ì, b, going, esplanade, fr, home]
5569                      [pity, mood, soany, suggestion]
5570    [guy, bitching, acted, like, id, interested, b...
5571                                   [rofl, true, name]
Name: Lemmatized_Text, Length: 5572, dtype: object

In [32]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

count_v2 = CountVectorizer(max_df=0.1, max_features=500)
X_count_v2 = count_v2.fit_transform(spam_dataset['Lemmatized_Text'].apply(lambda x: ' '.join(x)))
lda = LatentDirichletAllocation(n_components=7, random_state=2022,
                                learning_method='batch')
X_topics = lda.fit_transform(X_count_v2)

In [34]:
feature_names = count_v2.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    print(f'Temat: {topic_idx+1}')
    print(' '.join([feature_names[i] for i in topic.argsort()[:-5-1:-1]]))
    print()

Temat: 1
love good day hi happy

Temat: 2
time come get yeah give

Temat: 3
ur ltgt txt send stop

Temat: 4
go im lor home ok

Temat: 5
dont know im want like

Temat: 6
call free ok phone text

Temat: 7
call na thats please claim

