# Рубежный контроль №2

## Андреев Алексей ИУ5-23М
## Тема: Методы обработки текстов
## Решение задачи классификации текстов.

- Необходимо сформировать два варианта векторизации признаков - на основе CountVectorizer и на
основе TfidfVectorizer.

- В качестве классификаторов необходимо использовать два классификатора по варианту для Вашей
группы
### Группа: ИУ5-23М
### Классификатор 1: LinearSVC
### Классификатор 2: Multinomial Naive Bayes (MNB)

- Для каждого метода необходимо оценить качество классификации
- Сделать вывод о том, какой вариант векторизации признаков в паре с каким классификатором показал лучшее качество.

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Анализируем датасет и готовим категориальный признак

In [2]:
SUBSAMPLE_SIZE = 10000

df_fake = pd.read_csv('fake-and-real-news/Fake.csv', encoding='utf-8')[:SUBSAMPLE_SIZE]
df_fake['target'] = np.zeros(df_fake.shape[0], dtype=np.int8)
df_true = pd.read_csv('fake-and-real-news/True.csv', encoding='utf-8')[:SUBSAMPLE_SIZE]
df_true['target'] = np.ones(df_true.shape[0], dtype=np.int8)
df = pd.concat((df_fake, df_true), axis=0)
df.sample(frac=1).reset_index(drop=True)
df

Unnamed: 0,title,text,subject,date,target
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
9995,Obama says Clinton never jeopardized national ...,WASHINGTON (Reuters) - U.S. President Barack O...,politicsNews,"April 10, 2016",1
9996,U.S. plans to curb tax 'inversions' could hit ...,LONDON (Reuters) - Planned changes that Presid...,politicsNews,"April 11, 2016",1
9997,U.S. Democrat Clinton downplays chance of cont...,WASHINGTON (Reuters) - Democratic front-runner...,politicsNews,"April 10, 2016",1
9998,Boston Globe denounces Trump candidacy in 'fro...,(Reuters) - Headlines screaming “Deportations ...,politicsNews,"April 10, 2016",1


In [3]:
df.target.value_counts()

0    10000
1    10000
Name: target, dtype: int64

In [4]:
df.subject.value_counts()

politicsNews    10000
News             9050
politics          950
Name: subject, dtype: int64

In [5]:
from category_encoders import TargetEncoder
from sklearn.preprocessing import StandardScaler, LabelEncoder

encoder = LabelEncoder()
scaler = StandardScaler()
df['subject'] = encoder.fit_transform(df.subject)

In [6]:
df.subject.unique()

array([0, 1, 2])

In [7]:
(df.subject - df.target).sum()

10950

In [8]:
df.drop(columns=['date'], inplace=True)

## Делим данные на две выборки train и test

In [9]:
from sklearn.model_selection import train_test_split

X = df[[i for i in df.columns if i !='target']]
y = df.target

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

x_train.shape, y_train.shape, x_test.shape, y_test.shape

((16000, 3), (16000,), (4000, 3), (4000,))

## Предобрабатываем текстовые данные

In [10]:
for title in df.title:
  print(title, type(title))
  break

 Donald Trump Sends Out Embarrassing New Year’s Eve Message; This is Disturbing <class 'str'>


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import re

def preproc_func(title):
  # make more preprocessing if it well be needed!
  if isinstance(title, str):
      title = re.sub('[^a-zA-Z0-9]', ' ', title)
      return title.lower().strip()
  else:
    return ''

def get_tfidf_matrix(df, column, preproc_function, vectorizer=None):
  """
    returns matrix, trained vectorizer
  """
  processed_col = df[column].apply(preproc_function)
  if vectorizer is None:
    vectorizer = TfidfVectorizer()
    vectorizer.fit(processed_col)

  matrix = vectorizer.transform(processed_col)
  
  return matrix, vectorizer

def get_count_matrix(df, column, preproc_function, vectorizer=None):
  """
    returns matrix, trained vectorizer
  """
  processed_col = df[column].apply(preproc_function)
  if vectorizer is None:
    vectorizer = CountVectorizer()
    vectorizer.fit(processed_col)

  matrix = vectorizer.transform(processed_col)
  return matrix, vectorizer


train_title_matrix_tfidf, tfidf_vectorizer = get_tfidf_matrix(x_train, 'title', preproc_func)
test_title_matrix_tfidf, tfidf_vectorizer = get_tfidf_matrix(x_test, 'title',
                                                       preproc_func, vectorizer=tfidf_vectorizer)

train_title_matrix_count, count_vectorizer = get_count_matrix(x_train, 'title', preproc_func)
test_title_matrix_count, count_vectorizer = get_count_matrix(x_test, 'title',
                                                       preproc_func, vectorizer=count_vectorizer)

train_title_matrix_tfidf.shape, test_title_matrix_tfidf.shape, train_title_matrix_count.shape, test_title_matrix_count.shape

((16000, 13240), (4000, 13240), (16000, 13240), (4000, 13240))

In [14]:
train_text_matrix_tfidf, tfidf_text_vectorizer = get_tfidf_matrix(x_train, 'text', preproc_func)
test_text_matrix_tfidf, tfidf_text_vectorizer = get_tfidf_matrix(x_test, 'text',
                                                       preproc_func, vectorizer=tfidf_text_vectorizer)

train_text_matrix_count, count_text_vectorizer = get_tfidf_matrix(x_train, 'text', preproc_func)
test_text_matrix_count, count_text_vectorizer = get_tfidf_matrix(x_test, 'text',
                                                       preproc_func, vectorizer=count_text_vectorizer)

train_text_matrix_tfidf.shape, test_text_matrix_tfidf.shape

((16000, 73574), (4000, 73574))

In [15]:
from scipy import sparse

subject_train_sparse = sparse.csr_matrix(np.array(x_train.subject).reshape(-1, 1))
subject_test_sparse = sparse.csr_matrix(np.array(x_test.subject).reshape(-1, 1))

texts_tfidf_train_matrix = sparse.hstack((train_text_matrix_tfidf,train_title_matrix_tfidf, subject_train_sparse))
texts_count_train_matrix = sparse.hstack((train_text_matrix_count,train_title_matrix_count, subject_train_sparse))
texts_tfidf_test_matrix = sparse.hstack((test_text_matrix_tfidf,test_title_matrix_tfidf, subject_test_sparse))
texts_count_test_matrix = sparse.hstack((test_text_matrix_count,test_title_matrix_count, subject_test_sparse))

texts_tfidf_train_matrix.shape

(16000, 86815)

## LinearSVC with CountVectorizer

In [40]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV


parameters = {'C': np.arange(0.01,100,10)}

linear_svc = LinearSVC()

linear_svc_grid_count_clf = GridSearchCV(linear_svc, parameters, verbose=4, scoring='f1_macro', n_jobs=-1, cv=3)
linear_svc_grid_count_clf.fit(texts_count_train_matrix, y_train)

pd.DataFrame(linear_svc_grid_count_clf.cv_results_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.248927,0.020575,0.016852,0.001277,0.01,{'C': 0.01},0.999813,0.999812,0.999812,0.999812,1.657489e-08,10
1,0.990013,0.296787,0.015509,0.000794,10.01,{'C': 10.01},1.0,1.0,1.0,1.0,0.0,1
2,1.180797,0.348384,0.017449,0.00122,20.01,{'C': 20.01},1.0,1.0,1.0,1.0,0.0,1
3,0.900267,0.419103,0.014942,0.00223,30.01,{'C': 30.01},1.0,1.0,1.0,1.0,0.0,1
4,0.778638,0.436642,0.014977,0.00038,40.01,{'C': 40.01},1.0,1.0,1.0,1.0,0.0,1
5,0.782315,0.603195,0.013039,0.003716,50.01,{'C': 50.01},1.0,1.0,1.0,1.0,0.0,1
6,0.697983,0.449111,0.012488,0.00381,60.01,{'C': 60.01},1.0,1.0,1.0,1.0,0.0,1
7,0.6373,0.322011,0.015083,0.001177,70.01,{'C': 70.01},1.0,1.0,1.0,1.0,0.0,1
8,0.670612,0.336999,0.011951,0.00453,80.01,{'C': 80.01},1.0,1.0,1.0,1.0,0.0,1
9,0.542119,0.219744,0.009056,0.002876,90.01,{'C': 90.01},1.0,1.0,1.0,1.0,0.0,1


In [48]:
best_linear_svc_count_clf = LinearSVC(C=0.01)
best_linear_svc_count_clf.fit(texts_count_train_matrix, y_train)

pred = best_linear_svc_count_clf.predict(texts_count_test_matrix)
best_linear_svc_count = classification_report(y_test, pred, digits=4, output_dict=True)

print(classification_report(y_test, pred, digits=4))

              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000      2000
           1     1.0000    1.0000    1.0000      2000

    accuracy                         1.0000      4000
   macro avg     1.0000    1.0000    1.0000      4000
weighted avg     1.0000    1.0000    1.0000      4000



## LinearSVC with TfidfVectorizer

In [51]:
parameters = {'C': np.arange(0.01,100,10)}

linear_svc = LinearSVC()

linear_svc_grid_tfidf_clf = GridSearchCV(linear_svc, parameters, verbose=4, scoring='f1_macro', n_jobs=-1, cv=3)
linear_svc_grid_tfidf_clf.fit(texts_tfidf_train_matrix, y_train)

pd.DataFrame(linear_svc_grid_tfidf_clf.cv_results_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.429136,0.032973,0.030842,0.000649,0.01,{'C': 0.01},1.0,1.0,1.0,1.0,0.0,1
1,0.932864,0.083978,0.015252,0.000834,10.01,{'C': 10.01},1.0,1.0,1.0,1.0,0.0,1
2,1.176466,0.152602,0.019119,0.001311,20.01,{'C': 20.01},1.0,1.0,1.0,1.0,0.0,1
3,1.299925,0.46557,0.016489,0.000594,30.01,{'C': 30.01},1.0,1.0,1.0,1.0,0.0,1
4,1.148327,0.352092,0.016069,0.00065,40.01,{'C': 40.01},1.0,1.0,1.0,1.0,0.0,1
5,1.068797,0.354642,0.015721,0.000564,50.01,{'C': 50.01},1.0,1.0,1.0,1.0,0.0,1
6,1.040896,0.356669,0.013073,0.004072,60.01,{'C': 60.01},1.0,1.0,1.0,1.0,0.0,1
7,0.843742,0.378233,0.017241,0.000843,70.01,{'C': 70.01},1.0,1.0,1.0,1.0,0.0,1
8,1.165631,0.128912,0.009346,0.003789,80.01,{'C': 80.01},1.0,1.0,1.0,1.0,0.0,1
9,0.723026,0.106263,0.011131,0.002703,90.01,{'C': 90.01},1.0,1.0,1.0,1.0,0.0,1


In [54]:
best_linear_svc_tfidf_clf = LinearSVC(C=0.01)
best_linear_svc_tfidf_clf.fit(texts_tfidf_train_matrix, y_train)

pred = best_linear_svc_tfidf_clf.predict(texts_tfidf_test_matrix)
best_linear_svc_tfidf = classification_report(y_test, pred, digits=4, output_dict=True)

print(classification_report(y_test, pred, digits=4))

              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000      2000
           1     1.0000    1.0000    1.0000      2000

    accuracy                         1.0000      4000
   macro avg     1.0000    1.0000    1.0000      4000
weighted avg     1.0000    1.0000    1.0000      4000



## Multinomial Naive Bayes with CountVectorizer

In [56]:
from sklearn.naive_bayes import MultinomialNB

parameters = {'alpha': [0, 0.5, 1, 2, 4]}

multi_clf = MultinomialNB()

multi_grid_count_clf = GridSearchCV(multi_clf, parameters, verbose=4, scoring='f1_macro', n_jobs=-1, cv=3)

multi_grid_count_clf.fit(texts_count_train_matrix, y_train)

pd.DataFrame(multi_grid_count_clf.cv_results_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.093609,0.011727,0.035755,0.010381,0.0,{'alpha': 0},0.928691,0.930947,0.926784,0.928808,0.001702,5
1,0.100045,0.019708,0.030619,0.002496,0.5,{'alpha': 0.5},0.979939,0.979184,0.980686,0.979936,0.000613,4
2,0.1113,0.010561,0.030785,0.0035,1.0,{'alpha': 1},0.981439,0.980497,0.981811,0.981249,0.000553,3
3,0.094566,0.003825,0.036468,0.007296,2.0,{'alpha': 2},0.982939,0.98256,0.983311,0.982937,0.000307,2
4,0.064174,0.005111,0.012775,0.000305,4.0,{'alpha': 4},0.984439,0.983497,0.985186,0.984374,0.000691,1


In [63]:
multi_clf = MultinomialNB(alpha=4)

multi_clf.fit(texts_count_train_matrix, y_train)
multi_pred = multi_clf.predict(X=texts_count_test_matrix)

best_multi_count = classification_report(y_test, multi_pred, digits=4, output_dict=True)

print(classification_report(y_test, bayes_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9914    0.9780    0.9846      2000
           1     0.9783    0.9915    0.9849      2000

    accuracy                         0.9848      4000
   macro avg     0.9848    0.9848    0.9847      4000
weighted avg     0.9848    0.9848    0.9847      4000



## Multinomial Naive Bayes with TfidfVectorizer

In [59]:
parameters = {'alpha': [0, 0.5, 1, 2, 4]}

multi_clf = MultinomialNB()

multi_grid_tfidf_clf = GridSearchCV(multi_clf, parameters, verbose=4, scoring='f1_macro', n_jobs=-1, cv=3)

multi_grid_tfidf_clf.fit(texts_tfidf_train_matrix, y_train)

pd.DataFrame(multi_grid_tfidf_clf.cv_results_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.111577,0.001251,0.026264,0.000476,0.0,{'alpha': 0},0.946349,0.949166,0.946147,0.947221,0.001378,5
1,0.111472,0.003779,0.030949,0.005987,0.5,{'alpha': 0.5},0.984249,0.98068,0.985184,0.983371,0.001941,1
2,0.103955,0.003717,0.031454,0.001466,1.0,{'alpha': 1},0.982935,0.978427,0.983683,0.981681,0.002322,2
3,0.108076,0.00472,0.026437,0.001488,2.0,{'alpha': 2},0.979557,0.975045,0.982369,0.97899,0.003017,3
4,0.102255,0.002252,0.017292,0.004102,4.0,{'alpha': 4},0.973547,0.968843,0.976736,0.973042,0.003242,4


In [61]:
multi_clf_tfidf = MultinomialNB(alpha=0.5)

multi_clf_tfidf.fit(texts_tfidf_train_matrix, y_train)
multi_pred_tfidf = multi_clf_tfidf.predict(X=texts_tfidf_test_matrix)

best_multi_tfidf = classification_report(y_test, multi_pred_tfidf, digits=4, output_dict=True)

print(classification_report(y_test, multi_pred_tfidf, digits=4))

              precision    recall  f1-score   support

           0     0.9974    0.9735    0.9853      2000
           1     0.9741    0.9975    0.9857      2000

    accuracy                         0.9855      4000
   macro avg     0.9858    0.9855    0.9855      4000
weighted avg     0.9858    0.9855    0.9855      4000



## Итоги

In [66]:
models = ['Multinomial CountVectorizer', 'Multinomial TfidfVectorizer', 'LinearSVC CountVectorizer', 'LinearSVC TfidfVectorizer']
f1 = []
precision = []
recall = []

for enum, i in enumerate([best_multi_count, best_multi_tfidf, best_linear_svc_count, best_linear_svc_tfidf]):
  # print(enum)
  f1.append(i['macro avg']['f1-score'])
  precision.append(i['macro avg']['precision'])
  recall.append(i['macro avg']['recall'])

pd.DataFrame({'labels':models, 'f1-score macro avg': f1, 'precision macro': precision, 'recall_macro': recall}).sort_values(by='f1-score macro avg', ascending=False).reset_index(drop=True)

Unnamed: 0,labels,f1-score macro avg,precision macro,recall_macro
0,LinearSVC CountVectorizer,1.0,1.0,1.0
1,LinearSVC TfidfVectorizer,1.0,1.0,1.0
2,Multinomial TfidfVectorizer,0.985498,0.98578,0.9855
3,Multinomial CountVectorizer,0.984749,0.984838,0.98475
