# Лабораторная работа №10. ОСНОВЫ ОБРАБОТКИ ЕСТЕСТВЕННОГО ЯЗЫКА (NLP). ЗАДАЧА ТЕМАТИЧЕСКОГО МОДЕЛИРОВАНИЯ


ЗАДАНИЕ
1. Для выполнения задания используйте датасет с данными о спаме (https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset).
2. ****Самостоятельно реализовать BoW, TF-IDF.
3. Решить задачу классификации с понижением размерности. 

****Использовать самостоятельно реализованные модели из предыдущих ЛР.

4. Решить задачу тематического моделирования с помощью LDA.

## Импорт датасета

In [212]:
import pandas as pd

# Relative location of the SMS spam CSV used by the rest of the notebook.
DATASET_PATH = '../../Datasets/Lab10_Dataset10_Spam.csv'

# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably a saved
# row index -- consider index_col=0 if that column is not needed.
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,Type,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Лемматизация и удаление стоп-слов

In [213]:
import spacy
nlp = spacy.load("en_core_web_sm")

# Resolve the stop-word set once instead of on every token.
spacy_stop_words = nlp.Defaults.stop_words

# Replace every message with its space-joined lemmas, skipping tokens whose
# lower-cased text is a stop word.  nlp.pipe processes the texts in batches,
# which avoids the per-row call overhead of nlp(message) inside apply();
# the produced docs (and therefore the lemmas) are the same.
df['Message'] = [
    ' '.join(
        token.lemma_ for token in doc
        if token.lower_ not in spacy_stop_words
    )
    for doc in nlp.pipe(df['Message'])
]

## Векторизация с помощью BoW

In [214]:
import numpy as np

In [215]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words over unigrams and bigrams with min_df=2; the sparse count
# matrix is then expanded into a dense ndarray.
bow = CountVectorizer(min_df=2, ngram_range=(1, 2))
bow_counts = bow.fit_transform(df['Message'])
x_bow = bow_counts.toarray()

# Encode the class labels as integers: ham -> 0, spam -> 1.
# A dict-based map is used instead of Series.replace(list, list): replacing
# strings with ints depends on implicit object -> int downcasting, which
# pandas 2.2 deprecates (FutureWarning).
# Note: any label other than 'ham'/'spam' becomes NaN rather than passing through.
y = df['Type'].map({'ham': 0, 'spam': 1})

## Векторизация с помощью TF-IDF

In [216]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights with sublinear term-frequency scaling enabled; the sparse
# result is expanded into a dense ndarray.
tf_vectorizer = TfidfVectorizer(sublinear_tf=True)
tfidf_weights = tf_vectorizer.fit_transform(df['Message'])
x_tfidf = tfidf_weights.toarray()


## Уменьшение размерности

In [217]:
from sklearn.feature_selection import SelectKBest

# Reduce each representation to its 30 best-scoring columns, using
# SelectKBest's default scoring function.
# NOTE(review): both selectors are fitted on the whole data set together with
# its labels, i.e. before test_clsf performs the train/test split -- confirm
# that this information leak into the test split is acceptable.
bow_selector = SelectKBest(k=30)
x_bow = bow_selector.fit_transform(x_bow, y)

tfidf_selector = SelectKBest(k=30)
x_tfidf = tfidf_selector.fit_transform(x_tfidf, y)

## Решение задачи классификации

In [218]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

def test_clsf(X, y, model, *, test_size=0.15, random_state=42):
    """Fit ``model`` on a train split and print a report for the held-out split.

    Args:
        X: Feature matrix passed to ``train_test_split``.
        y: Target labels aligned with the rows of ``X``.
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in place.
        test_size: Share of the samples held out for evaluation
            (default 0.15, the value previously hard-coded).
        random_state: Seed for the split (default 42, as before), so repeated
            calls evaluate on the same held-out rows.

    Returns:
        None. The classification report is printed to stdout.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    fitted = model.fit(X_train, y_train)
    print(classification_report(y_test, fitted.predict(X_test)))

In [219]:
from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [220]:
# Named base learners for the stacking ensemble; each one is created with
# its default hyper-parameters.
estimators = [
    ('GradientBoostingClassifier', GradientBoostingClassifier()),
    ('LogisticRegression', LogisticRegression()),
    ('KNeighborsClassifier', KNeighborsClassifier()),
]

In [221]:
# Evaluate a stacking ensemble of the base learners on the reduced BoW features.
stacking_bow = StackingClassifier(estimators=estimators)
test_clsf(x_bow, y, stacking_bow)

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       723
           1       0.94      0.81      0.87       113

    accuracy                           0.97       836
   macro avg       0.96      0.90      0.93       836
weighted avg       0.97      0.97      0.97       836



In [222]:
# Evaluate a stacking ensemble of the base learners on the reduced TF-IDF features.
stacking_tfidf = StackingClassifier(estimators=estimators)
test_clsf(x_tfidf, y, stacking_tfidf)

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       723
           1       0.96      0.88      0.92       113

    accuracy                           0.98       836
   macro avg       0.97      0.94      0.95       836
weighted avg       0.98      0.98      0.98       836



## Решение задачи тематического моделирования с помощью LDA

In [223]:
# Raw term counts that are fed to LDA: max_df=0.95 and min_df=2 bound the
# document frequency of kept terms, the vocabulary is capped at 1000 features,
# and sklearn's built-in English stop-word list is removed.
# NOTE: this rebinds tf_vectorizer, which previously held the TF-IDF vectorizer.
tf_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.95,
)

tf = tf_vectorizer.fit_transform(df['Message'])

In [224]:
from sklearn.decomposition import LatentDirichletAllocation

# LDA trained with the online learning method and a fixed seed; the number of
# topics is left at the library default.
lda_model = LatentDirichletAllocation(
    random_state=0,
    learning_offset=50.,
    learning_method='online',
    max_iter=20,
)
# Kept as an assignment so the cell produces no output of its own.
lda = lda_model.fit(tf)

In [225]:
# Per-message topic distribution; the shape is displayed as the cell result.
doc_topics = lda.transform(tf)
doc_topics.shape

(5572, 10)

In [226]:
feature_names = tf_vectorizer.get_feature_names_out()

# Number of highest-weighted terms printed for each topic.
N_TOP_WORDS = 10

for topic_idx, topic in enumerate(lda.components_):
    # argsort is ascending, so reverse it and keep the first N_TOP_WORDS indices.
    top_indices = topic.argsort()[::-1][:N_TOP_WORDS]
    print(f"Topic #{topic_idx}:")
    print(" ".join(feature_names[i] for i in top_indices))

Topic #0:
good love sorry day later happy dear think tomorrow morning
Topic #1:
ok send oh message right phone watch pick place ll
Topic #2:
lor da wat ask ok finish cos wan dun ur
Topic #3:
gt lt free ur reply msg txt send nokia 150p
Topic #4:
come time great thing leave life like buy friend feel
Topic #5:
know want tell need week pls let wait yeah tone
Topic #6:
yes number stop try com www thank reach claim account
Topic #7:
hi like miss text way hey new say start home
Topic #8:
cash prize claim customer ya service award win ur holiday
Topic #9:
work night sleep late meet fine mean problem day person
