In [1]:
import pandas as pd

In [2]:
# Load the labelled ham/spam messages; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/Spam.csv')

In [3]:
# Preview the first rows to confirm the label and message columns loaded as expected.
df.head()

Unnamed: 0,Type,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Check dtypes and non-null counts before any text preprocessing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Type     5572 non-null   object
 1   Message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
import spacy
from spacy.lookups import Lookups

nlp = spacy.load("en_core_web_sm")

# Look the stop-word set up once instead of once per token.
stop_words = nlp.Defaults.stop_words


def _lemmatize_without_stop_words(doc):
    """Return the space-joined lemmas of `doc`, skipping tokens whose lowercase form is a stop word."""
    return ' '.join(word.lemma_ for word in doc if word.lower_ not in stop_words)


# nlp.pipe streams the texts through the pipeline in batches, which is much
# faster than calling nlp(message) separately for every row via .apply.
# NOTE: this overwrites df["Message"] in place, so re-running the cell
# lemmatizes already-lemmatized text; reload df first if you need to re-run.
df["Message"] = [
    _lemmatize_without_stop_words(doc) for doc in nlp.pipe(df["Message"])
]

In [6]:
import numpy as np

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words over unigrams and bigrams; min_df=2 drops terms seen in fewer than two messages.
count_vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=2)
bow = count_vectorizer  # both names are kept and refer to the same vectorizer

# .toarray() yields a plain ndarray directly (no intermediate np.matrix).
x_bow = bow.fit_transform(df['Message']).toarray()

# Encode the labels with an explicit mapping. Series.replace relied on implicit
# object -> int downcasting, which pandas has deprecated; without it the target
# would stay object-typed. map + astype(int) always gives an integer target.
y = df['Type'].map({'ham': 0, 'spam': 1})
if y.isna().any():
    raise ValueError("df['Type'] contains labels other than 'ham' and 'spam'")
y = y.astype(int)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over the lemmatized messages, with sublinear term-frequency scaling enabled.
tf_vectorizer = TfidfVectorizer(sublinear_tf=True)

# Densify the sparse TF-IDF matrix into a regular ndarray.
x_tfidf = tf_vectorizer.fit_transform(df["Message"]).toarray()

In [9]:
from sklearn.feature_selection import SelectKBest

# Number of top-scoring columns to keep from each feature matrix.
N_SELECTED_FEATURES = 30

# Each matrix gets its own selector, using SelectKBest's default scoring function.
# NOTE(review): the selectors are fit on every row, including rows that are later
# held out by train_test_split, so downstream scores may be optimistic. Consider
# fitting the selection on the training split only (e.g. inside a Pipeline).
x_bow = SelectKBest(k=N_SELECTED_FEATURES).fit_transform(x_bow, y)
x_tfidf = SelectKBest(k=N_SELECTED_FEATURES).fit_transform(x_tfidf, y)

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import BaggingClassifier

def test(X,y):
    X_train, X_test,y_train,y_test = train_test_split(X, y, test_size = 0.15, random_state = 42)
    bac = BaggingClassifier().fit(X_train, y_train)
    print(classification_report(y_test, bac.predict(X_test)))

In [15]:
# Evaluate the bagging classifier on the selected bag-of-words features.
test(x_bow, y)

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       723
           1       0.96      0.83      0.89       113

    accuracy                           0.97       836
   macro avg       0.97      0.91      0.94       836
weighted avg       0.97      0.97      0.97       836



In [16]:
# Same evaluation on the selected TF-IDF features, for comparison with bag-of-words.
test(x_tfidf, y)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       723
           1       0.94      0.91      0.93       113

    accuracy                           0.98       836
   macro avg       0.97      0.95      0.96       836
weighted avg       0.98      0.98      0.98       836



In [17]:
# Raw term counts used as input to the topic model below.
# NOTE: this rebinds tf_vectorizer, which earlier held the TfidfVectorizer.
tf_vectorizer = CountVectorizer(
    stop_words='english',   # drop scikit-learn's built-in English stop words
    max_features=1000,      # cap the vocabulary size
    min_df=2,               # ignore terms found in fewer than 2 messages
    max_df=0.95,            # ignore terms found in more than 95% of messages
)
tf = tf_vectorizer.fit_transform(df["Message"])

In [18]:
from sklearn.decomposition import LatentDirichletAllocation

# Topic model over the term counts in `tf`; the seed keeps the topics reproducible.
LDA = LatentDirichletAllocation(
    n_components=10,          # library default, spelled out: ten topics
    learning_method='online',
    learning_offset=50.,
    max_iter=20,
    random_state=0,
)
LDA.fit(tf)

In [20]:
# Sanity-check the shape of the transformed matrix: one row per message, one column per topic.
LDA.transform(tf).shape

(5572, 10)

In [21]:
features_names = tf_vectorizer.get_feature_names_out()

# For each topic, print its ten highest-weighted vocabulary terms, strongest first.
for topic_idx, topic in enumerate(LDA.components_):
    # argsort is ascending, so reverse it and keep the first ten indices.
    top_term_ids = topic.argsort()[::-1][:10]
    print("Topic #%d:" % topic_idx)
    print(" ".join(features_names[top_term_ids]))

Topic #0:
good love sorry day later happy dear think tomorrow morning
Topic #1:
ok send oh message right phone watch pick place ll
Topic #2:
lor da wat ask ok finish cos wan dun ur
Topic #3:
gt lt free ur reply msg txt send nokia 150p
Topic #4:
come time great thing leave life like buy friend feel
Topic #5:
know want tell need week pls let wait yeah tone
Topic #6:
yes number stop try com www thank reach claim account
Topic #7:
hi like miss text way hey new say start home
Topic #8:
cash prize claim customer ya service award win ur holiday
Topic #9:
work night sleep late meet fine mean problem day person
