Выполнил: Матиенко А.П.

Группа: ИУ5-24М

Задание: Необходимо решить задачу классификации текстов, сформировав два варианта векторизации признаков - на основе CountVectorizer и на основе TfidfVectorizer. В качестве классификаторов необходимо использовать два классификатора:

- KNeighborsClassifier

- Complement Naive Bayes

In [1]:
import os
import gzip
import shutil

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import ComplementNB

import warnings
# Silence every warning for the rest of the session so library warnings do not clutter
# the cell outputs. NOTE(review): this also hides warnings that may point at real problems.
warnings.filterwarnings('ignore')

In [2]:
datasets_dirname = './datasets/'
# makedirs(exist_ok=True) is idempotent, so no separate existence check is needed.
os.makedirs(datasets_dirname, exist_ok=True)


def _load_headlines(basename, target):
    """Unpack ``<basename>.gz`` from the datasets directory and load it as a DataFrame.

    The archive is decompressed to a temporary ``<basename>.txt`` next to it, parsed
    with pandas using ``'__label__ '`` as the delimiter and a single ``text`` column
    name, and the temporary file is removed afterwards (also when parsing fails).

    Args:
        basename: File name without extension, relative to ``datasets_dirname``.
        target: Class label written into the ``target`` column of every row.

    Returns:
        DataFrame with the parsed ``text`` column and a constant ``target`` column.
    """
    gz_path = datasets_dirname + basename + '.gz'
    txt_path = datasets_dirname + basename + '.txt'
    with gzip.open(gz_path, 'rb') as f_in, open(txt_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    try:
        # A multi-character delimiter is treated as a regex and is only supported by
        # the python parser; naming the engine explicitly avoids the silent fallback.
        frame = pd.read_csv(txt_path, delimiter='__label__ ', header=None,
                            names=['text'], engine='python')
    finally:
        os.remove(txt_path)
    frame['target'] = target
    return frame


clickbait = _load_headlines('clickbait_data', 1)
non_clickbait = _load_headlines('non_clickbait_data', 0)

df = pd.concat([clickbait, non_clickbait], axis=0)
# Shuffle with a fixed seed so that re-running the notebook reproduces the same row
# order and therefore the same train/test split and metrics.
df = df.sample(frac=1, random_state=1)

df.to_csv(datasets_dirname + 'dataset.csv', index=False)

df

Unnamed: 0,text,target
10419,18 Breathtaking Mexican Treats That Will Make ...,1
15252,Australian federal election announced for Augu...,0
11748,Delayed Endeavour carries Japanese lab to Inte...,0
7242,23 Hilarious Tumblr Posts That Students Will T...,1
13911,At Least 21 Polo Horses Are Dead; Toxin Is Sus...,0
...,...,...
6554,21 Things You Need For Your Baby If You Were I...,1
1718,2007 Twenty20 World Championship: India vs Sco...,0
14469,Can You Pick The Right Guy,1
5226,Questions and Answers With Ramon Scruggs,0


# Feature preparation

In [3]:
# Encode every headline as a TF-IDF vector. The vectorizer is created with default
# parameters and fitted on the whole dataset, i.e. before any train/test split.
headlines = df['text']
tfidfv = TfidfVectorizer()
tfidf_ngram_features = tfidfv.fit_transform(headlines)
tfidf_ngram_features

<32000x22761 sparse matrix of type '<class 'numpy.float64'>'
	with 283201 stored elements in Compressed Sparse Row format>

In [4]:
# Encode every headline as a vector of raw token counts. The vectorizer is created with
# default parameters and fitted on the whole dataset, i.e. before any train/test split.
corpus = df['text']
countvec = CountVectorizer()
countvec_ngram_features = countvec.fit_transform(corpus)
countvec_ngram_features

<32000x22761 sparse matrix of type '<class 'numpy.int64'>'
	with 283201 stored elements in Compressed Sparse Row format>

# KNeighborsClassifier

In [5]:
# TF-IDF features + k-nearest-neighbours classifier (default hyperparameters).
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_ngram_features, df['target'], test_size=0.3, random_state=1
)
model = KNeighborsClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# classification_report lists the classes in sorted label order, so the display names
# must be sorted as well; Series.unique() returns first-appearance order, which could
# attach the "0"/"1" names to the wrong rows.
class_names = [str(label) for label in sorted(y_test.unique())]
print(classification_report(y_test, y_pred, digits=4, target_names=class_names))

              precision    recall  f1-score   support

           0     0.9738    0.9042    0.9377      4844
           1     0.9091    0.9752    0.9410      4756

    accuracy                         0.9394      9600
   macro avg     0.9414    0.9397    0.9393      9600
weighted avg     0.9417    0.9394    0.9393      9600



In [6]:
# Raw count features + k-nearest-neighbours classifier (default hyperparameters).
X_train, X_test, y_train, y_test = train_test_split(
    countvec_ngram_features, df['target'], test_size=0.3, random_state=1
)
model = KNeighborsClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# classification_report lists the classes in sorted label order, so the display names
# must be sorted as well; Series.unique() returns first-appearance order, which could
# attach the "0"/"1" names to the wrong rows.
class_names = [str(label) for label in sorted(y_test.unique())]
print(classification_report(y_test, y_pred, digits=4, target_names=class_names))

              precision    recall  f1-score   support

           0     0.8684    0.7725    0.8177      4844
           1     0.7917    0.8808    0.8339      4756

    accuracy                         0.8261      9600
   macro avg     0.8301    0.8266    0.8258      9600
weighted avg     0.8304    0.8261    0.8257      9600



# Complement Naive Bayes

In [7]:
# TF-IDF features + Complement Naive Bayes classifier (default hyperparameters).
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_ngram_features, df['target'], test_size=0.3, random_state=1
)
model = ComplementNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# classification_report lists the classes in sorted label order, so the display names
# must be sorted as well; Series.unique() returns first-appearance order, which could
# attach the "0"/"1" names to the wrong rows.
class_names = [str(label) for label in sorted(y_test.unique())]
print(classification_report(y_test, y_pred, digits=4, target_names=class_names))

              precision    recall  f1-score   support

           0     0.9875    0.9449    0.9657      4844
           1     0.9462    0.9878    0.9666      4756

    accuracy                         0.9661      9600
   macro avg     0.9669    0.9663    0.9661      9600
weighted avg     0.9670    0.9661    0.9661      9600



In [8]:
# Raw count features + Complement Naive Bayes classifier (default hyperparameters).
X_train, X_test, y_train, y_test = train_test_split(
    countvec_ngram_features, df['target'], test_size=0.3, random_state=1
)
model = ComplementNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# classification_report lists the classes in sorted label order, so the display names
# must be sorted as well; Series.unique() returns first-appearance order, which could
# attach the "0"/"1" names to the wrong rows.
class_names = [str(label) for label in sorted(y_test.unique())]
print(classification_report(y_test, y_pred, digits=4, target_names=class_names))

              precision    recall  f1-score   support

           0     0.9851    0.9525    0.9685      4844
           1     0.9532    0.9853    0.9690      4756

    accuracy                         0.9688      9600
   macro avg     0.9691    0.9689    0.9687      9600
weighted avg     0.9693    0.9688    0.9687      9600



# Выводы:
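1. TfidfVectorizer показал заметно лучший результат с KNeighborsClassifier (accuracy 0.9394 против 0.8261), а с Complement Naive Bayes немного лучше оказался CountVectorizer (accuracy 0.9688 против 0.9661)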

2. Complement Naive Bayes показал лучший результат по сравнению с KNeighborsClassifier

In [9]:
# Remove the combined CSV written during data preparation. The path is built from
# datasets_dirname so it always matches the location the file was written to.
os.remove(datasets_dirname + 'dataset.csv')