#  Analiza tematu dokumentu

In [1]:
import pandas as pd
import numpy as np
import string
import nltk
import itertools
from wordcloud import WordCloud 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.preprocessing import FunctionTransformer

nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
pd.set_option('future.no_silent_downcasting', True)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/jtadych/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jtadych/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jtadych/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
spam_dataset = pd.read_csv('spam.csv', encoding="ISO-8859-1", usecols=[0, 1], names=['Spam', 'Text'], skiprows=1)
spam_dataset['Spam'] = spam_dataset['Spam'].replace(['ham', 'spam'], [0, 1])

In [4]:
def remove_punctation(text):
    return ''.join([word for word in text if word not in string.punctuation])
spam_dataset['Cleaned_Text'] = spam_dataset['Text'].apply(lambda x: remove_punctation(x))

def tokenize(text):

    # Usunięcie wielkich liter
    clean_text = text.lower()

    # Tokenizacja
    tokenized_text = nltk.word_tokenize(clean_text)
    return tokenized_text
spam_dataset['Tokenized_Text'] = spam_dataset['Cleaned_Text'].apply(lambda x: tokenize(x))

In [5]:
stopwords = nltk.corpus.stopwords.words("english")
def remove_stopwords(text):
    return [word for word in text if word not in stopwords]
spam_dataset['WithoutStop_Text'] = spam_dataset['Tokenized_Text'].apply(lambda x: remove_stopwords(x))

In [6]:
stemmer = nltk.PorterStemmer()
def stemming(text):
    stemmed_words = [stemmer.stem(word) for word in text]
    return stemmed_words
spam_dataset['Stemmed_Text'] = spam_dataset['WithoutStop_Text'].apply(lambda x: stemming(x))


In [7]:
lemmater = nltk.WordNetLemmatizer()
def lemmatizing(text):
    lemmatized_words = [lemmater.lemmatize(word) for word in text]
    return lemmatized_words
spam_dataset['Lemmatized_Text'] = spam_dataset['WithoutStop_Text'].apply(lambda x: lemmatizing(x))

In [10]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

count_v2 = CountVectorizer(max_df=0.1, max_features=500)
X_count_v2 = count_v2.fit_transform(spam_dataset['Lemmatized_Text'].apply(lambda x: ' '.join(x)))
lda = LatentDirichletAllocation(n_components=7, random_state=2022, learning_method='batch')
X_topics = lda.fit_transform(X_count_v2)

7

In [19]:
feature_names = count_v2.get_feature_names_out()
for topic_idx , topic in enumerate(lda.components_):
    print('Temat: {}'.format(topic_idx+1))
    print('' . join([feature_names[i] for i in topic.argsort()[:-5-1:-1]]))
    print()

Temat: 1
lovegooddayhihappy

Temat: 2
timecomegetyeahgive

Temat: 3
urltgttxtsendstop

Temat: 4
goimlorhomeok

Temat: 5
dontknowimwantlike

Temat: 6
callfreeokphonetext

Temat: 7
callnathatspleaseclaim



In [16]:
print(feature_names)

['10' '100' '1000' '150' '150p' '150ppm' '16' '18' '1st' '2000' '250'
 '2nd' '500' '5000' '750' '800' '8007' '86688' 'able' 'abt' 'account'
 'actually' 'address' 'aft' 'afternoon' 'ah' 'aight' 'already' 'alright'
 'also' 'always' 'amp' 'angry' 'another' 'answer' 'anything' 'anyway'
 'apply' 'ard' 'around' 'ask' 'asked' 'attempt' 'await' 'award' 'awarded'
 'away' 'awesome' 'babe' 'baby' 'back' 'bad' 'beautiful' 'bed' 'believe'
 'best' 'better' 'big' 'birthday' 'bit' 'bonus' 'book' 'bored' 'box' 'boy'
 'break' 'bring' 'brother' 'bt' 'bus' 'busy' 'buy' 'call' 'called'
 'calling' 'came' 'camera' 'cant' 'car' 'card' 'care' 'cash' 'cause'
 'chance' 'change' 'charge' 'chat' 'check' 'claim' 'class' 'close' 'club'
 'co' 'code' 'collect' 'collection' 'colour' 'come' 'coming' 'contact'
 'cool' 'cost' 'could' 'coz' 'credit' 'customer' 'da' 'dad' 'dat' 'date'
 'day' 'de' 'dear' 'delivery' 'den' 'detail' 'didnt' 'dinner' 'dis'
 'doesnt' 'done' 'dont' 'double' 'draw' 'dream' 'drink' 'drive' 'driving'