In [53]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

from sklearn.feature_selection import VarianceThreshold, SelectKBest, RFE
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE, Isomap
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report

import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\liza5\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 1. Обработка

In [5]:
data = pd.read_csv('data/spam.csv', encoding='ISO-8859-1')
data.head(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [6]:
data = data.drop('Unnamed: 2', axis=1)

In [7]:
data = data.drop('Unnamed: 3', axis=1)

In [8]:
data = data.drop('Unnamed: 4', axis=1)

In [9]:
data = data.drop_duplicates().reset_index(drop=True)
data.duplicated().sum()

0

In [10]:
data.loc[(data['v1'] == "ham"), 'v1'] = 1
data.loc[(data['v1'] == "spam"), 'v1'] = 0
data['v1'] = data['v1'].astype(int)

In [29]:
X = data['v2']
y = data['v1']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# 2. Реализация BoW, TF-IDF

In [47]:
import re
from collections import Counter

def preprocess_text(text):
    # Простой препроцессинг: приводим к нижнему регистру и удаляем знаки препинания
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    return text

def create_bag_of_words(documents):
    # Список для хранения уникальных слов (токенов)
    vocabulary = set()

    # Список для хранения Bag of Words для каждого документа
    bow_matrix = []

    for document in documents:
        # Предварительная обработка текста
        preprocessed_doc = preprocess_text(document)

        # Разбиваем документ на слова
        words = preprocessed_doc.split()

        # Обновляем словарь
        vocabulary.update(words)

        # Создаем Bag of Words для текущего документа
        word_counts = Counter(words)
        bow_matrix.append(word_counts)

    # Преобразуем словарь в список слов (токенов)
    vocab_list = list(vocabulary)

    # Создаем матрицу Bag of Words
    bow_matrix = [
        [doc.get(word, 0) for word in vocab_list]
        for doc in bow_matrix
    ]

    return vocab_list, bow_matrix

In [55]:
# Создаем Bag of Words
vocabulary, bow_matrix = create_bag_of_words(X)

X_train, X_test, y_train, y_test = train_test_split(bow_matrix, y, test_size=0.2, random_state=42)

In [56]:
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9245647969052224


In [57]:
from sklearn.feature_extraction.text import CountVectorizer
# Создание экземпляра CountVectorizer
vectorizer = CountVectorizer()

# Преобразование текстовых данных в матрицу признаков
X_bow = vectorizer.fit_transform(X)

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)

In [59]:
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9245647969052224


In [60]:
import math
from collections import Counter

def preprocess_text(text):
    # Простой препроцессинг: приводим к нижнему регистру и удаляем знаки препинания
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    return text

def compute_tf(text):
    # Вычисляем Term Frequency (TF)
    words = preprocess_text(text).split()
    word_counts = Counter(words)
    total_words = len(words)
    
    tf = {word: count / total_words for word, count in word_counts.items()}
    return tf

def compute_idf(documents):
    # Вычисляем Inverse Document Frequency (IDF)
    total_documents = len(documents)
    idf = {}

    for document in documents:
        words = set(preprocess_text(document).split())
        for word in words:
            idf[word] = idf.get(word, 0) + 1
    
    idf = {word: math.log(total_documents / (count + 1)) for word, count in idf.items()}
    return idf

def compute_tfidf(documents):
    tfidf_matrix = []

    idf = compute_idf(documents)

    for document in documents:
        tf = compute_tf(document)
        tfidf_vector = {word: tf[word] * idf[word] for word in tf.keys()}
        tfidf_matrix.append(tfidf_vector)

    return tfidf_matrix

In [66]:
 tfidf_matrix = compute_tfidf(X)
for i, tfidf_vector in enumerate(tfidf_matrix):
    print(f"Document {i + 1} TF-IDF Vector: {tfidf_vector}")

Document 1 TF-IDF Vector: {'go': 0.15065501290887515, 'until': 0.2609115007510418, 'jurong': 0.39286436723180473, 'point': 0.2955688597790391, 'crazy': 0.29211921620469145, 'available': 0.28586105905699116, 'only': 0.1646469576584129, 'in': 0.09698692402391401, 'bugis': 0.3235496491758102, 'n': 0.19115233532418496, 'great': 0.1972632169603974, 'world': 0.26272988295958555, 'la': 0.3235496491758102, 'e': 0.20657969586997207, 'buffet': 0.3725911118263965, 'cine': 0.3235496491758102, 'there': 0.16235648085684817, 'got': 0.157387857166188, 'amore': 0.39286436723180473, 'wat': 0.1977657337530725}
Document 2 TF-IDF Vector: {'ok': 0.5035009062807694, 'lar': 0.8188080609116088, 'joking': 1.1007540626901209, 'wif': 0.8757662765319517, 'u': 0.322171924536927, 'oni': 1.1568327687936564}
Document 3 TF-IDF Vector: {'free': 0.0979489251924795, 'entry': 0.33975730581997565, 'in': 0.058779953953887285, '2': 0.0782341504609295, 'a': 0.04795531283012592, 'wkly': 0.1893287706727877, 'comp': 0.18644058340

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Создание экземпляра TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Преобразование текстовых данных в матрицу признаков
X_tfidf = tfidf_vectorizer.fit_transform(X)

# Разделение на тренировочный и тестовый наборы данных
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [24]:
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9129593810444874


# 3. Понижение размерности

In [26]:
def test_class(X, y):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
  bag = BaggingClassifier().fit(X_train, y_train)
  print(classification_report(y_test, bag.predict(X_test)))

In [27]:
# SelectKBest
selector = SelectKBest(k=2)  # количество наиболее значимых признаков
X_SelectKBest_classifier1 = selector.fit_transform(X_bow, y)
X_SelectKBest_classifier2 = selector.fit_transform(X_tfidf, y)

In [28]:
test_class(X_SelectKBest_classifier1, y)

              precision    recall  f1-score   support

           0       0.66      0.60      0.63       117
           1       0.95      0.96      0.96       917

    accuracy                           0.92      1034
   macro avg       0.80      0.78      0.79      1034
weighted avg       0.92      0.92      0.92      1034



In [29]:
test_class(X_SelectKBest_classifier2, y)

              precision    recall  f1-score   support

           0       0.88      0.25      0.39       117
           1       0.91      1.00      0.95       917

    accuracy                           0.91      1034
   macro avg       0.90      0.62      0.67      1034
weighted avg       0.91      0.91      0.89      1034



In [30]:
# RFE
estimator = BaggingClassifier()
selector = RFE(estimator, n_features_to_select=5)  # количество выбранных признаков (n_features_to_select)
X_RFE_classifier1 = selector.fit_transform(X_SelectKBest_classifier1, y)
X_RFE_classifier2 = selector.fit_transform(X_SelectKBest_classifier2, y)

In [31]:
test_class(X_RFE_classifier1, y)

              precision    recall  f1-score   support

           0       0.66      0.60      0.63       117
           1       0.95      0.96      0.96       917

    accuracy                           0.92      1034
   macro avg       0.80      0.78      0.79      1034
weighted avg       0.92      0.92      0.92      1034



In [32]:

test_class(X_RFE_classifier2, y)

              precision    recall  f1-score   support

           0       0.90      0.24      0.38       117
           1       0.91      1.00      0.95       917

    accuracy                           0.91      1034
   macro avg       0.91      0.62      0.67      1034
weighted avg       0.91      0.91      0.89      1034



In [33]:
# Isomap
isomap = Isomap(n_components=10)  # количество компонент (n_components)
X_Isomap_classifier1 = isomap.fit_transform(X_bow)
X_Isomap_classifier2 = isomap.fit_transform(X_tfidf)

  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])
  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])


In [34]:

test_class(X_Isomap_classifier1, y)

              precision    recall  f1-score   support

           0       0.69      0.64      0.66       117
           1       0.95      0.96      0.96       917

    accuracy                           0.93      1034
   macro avg       0.82      0.80      0.81      1034
weighted avg       0.92      0.93      0.93      1034



In [35]:
test_class(X_Isomap_classifier2, y)

              precision    recall  f1-score   support

           0       0.58      0.47      0.52       117
           1       0.93      0.96      0.95       917

    accuracy                           0.90      1034
   macro avg       0.76      0.71      0.73      1034
weighted avg       0.89      0.90      0.90      1034



# 4. LDA

In [2]:
# Создание объекта CountVectorizer для извлечения признаков
vectorizer = CountVectorizer(max_features=1000,   # Ограничение на максимальное количество признаков
                             stop_words='english',  # Игнорирование стоп-слов
                             max_df=0.5,  # Игнорирование терминов, которые появляются в более чем 50% документов
                             min_df=2)  # Игнорирование терминов, которые появляются в менее чем 2 документах

In [30]:
# Преобразование текстовых данных в матрицу признаков
X_features = vectorizer.fit_transform(X)

In [32]:
# Создание модели LDA
num_topics = 10  # Количество тем
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)

In [50]:
# Обучение модели LDA
lda_model.fit(X_features)

# Получение самых вероятных слов для каждой темы
feature_names = vectorizer.get_feature_names_out()
top_words = 10  # Количество верхних слов для вывода

In [55]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\liza5\AppData\Roaming\nltk_data...


True

In [56]:
lemmatizer = WordNetLemmatizer()

for topic_idx, topic in enumerate(lda_model.components_):
    top_feature_indices = topic.argsort()[:-top_words - 1:-1]
    top_features = [lemmatizer.lemmatize(feature_names[i]) for i in top_feature_indices]
    print(f"Тема #{topic_idx + 1}:")
    print(top_features)
    print()

Тема #1:
['know', 'did', 'right', 'yes', 'just', 'tonight', 'say', 'gonna', 'phone', 'like']

Тема #2:
['lor', 'got', 'ok', 'oh', 'wat', 'pls', 'dun', 'wan', 'ask', 'ìï']

Тема #3:
['tell', 'just', 'don', 'yeah', 'sure', 'time', 'got', 'ok', 'going', 'll']

Тема #4:
['like', 'going', 'da', 'way', 'feel', 'think', 'time', 'ya', 'life', 'make']

Тема #5:
['claim', 'prize', 'won', 'number', 'cash', 'www', 'urgent', 'win', 'com', 'txt']

Тема #6:
['good', 'love', 'day', 'ì_', 'hi', 'hope', 'happy', 'morning', 'babe', 'miss']

Тема #7:
['ll', 'just', 'sorry', 'need', 'want', 'come', 'sent', 'dont', 'later', 'time']

Тема #8:
['txt', 'home', 'new', 'ur', 'chat', '150p', 'send', 'stop', 'week', 'free']

Тема #9:
['gt', 'lt', 'ur', 'ok', 'reply', 'msg', 'send', 'sm', 'heart', 'text']

Тема #10:
['free', 'text', 'know', 'mobile', 'stop', 'let', 'phone', 'reply', 'aight', 'txt']

