# Классификация мусора по комментариям (образование)

In [17]:
import re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.svm import SVC
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


Подготовка данных

In [18]:
df = pd.read_csv('data/data.csv', quotechar='"')

In [19]:
df.head()

Unnamed: 0,text,relevant
0,"Единственный в стране, нисколько не сомневаюсь...",1
1,Замечательная женщина и преподаватель!🌺,1
2,"Пересдача,курсовая ,удача 😁не все так плохо в...",1
3,Для потенциальных абитуриентов (студентов) пол...,1
4,"Рейтинг Вуза это конечно здорово, но 5 бюджетн...",1


Предобработка данных

In [20]:
def cleanHtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext


def cleanPunc(sentence):
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    return cleaned


def keepAlpha(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^а-я А-Я]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

In [21]:
text = 'text'

df[text] = df[text].str.lower()
df[text] = df[text].apply(cleanHtml)
df[text] = df[text].apply(cleanPunc)
df[text] = df[text].apply(keepAlpha)
df.head()

Unnamed: 0,text,relevant
0,единственный в стране нисколько не сомневаюсь ...,1
1,замечательная женщина и преподаватель,1
2,пересдача курсовая удача не все так плохо вро...,1
3,для потенциальных абитуриентов студентов получ...,1
4,рейтинг вуза это конечно здорово но бюджетны...,1


Удаление стопслов, стемминг

In [22]:
stop_words = set(stopwords.words('russian'))
stop_words.update(['ок','один','два','три','четыре','пять','шесть','семь','восемь','девять','десять','может','также'])

def removeStopWords(sentence):
    re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)
    return re_stop_words.sub(" ", sentence)

df[text] = df[text].apply(removeStopWords)
df.head()

Unnamed: 0,text,relevant
0,единственный стране нисколько сомневаюсь р...,1
1,замечательная женщина преподаватель,1
2,пересдача курсовая удача плохо вроде бы,1
3,потенциальных абитуриентов студентов получить...,1
4,рейтинг вуза это здорово бюджетных мест п...,1


In [23]:
stemmer = SnowballStemmer('russian')
def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence

df[text] = df[text].apply(stemming)
df.head()

Unnamed: 0,text,relevant
0,единствен стран нискольк сомнева ректор универ...,1
1,замечательн женщин преподавател,1
2,пересдач курсов удач плох врод бы,1
3,потенциальн абитуриент студент получ информац ...,1
4,рейтинг вуз эт здоров бюджетн мест поступлен н...,1


In [24]:
train, test = train_test_split(df, test_size=0.25, random_state=0, shuffle=True)

In [25]:
train_text = train[text]
test_text = test[text]

y_train = train['relevant']
y_test = test['relevant']

In [26]:
train_text.shape, test_text.shape

((32664,), (10889,))

In [27]:
vectorizer = TfidfVectorizer(strip_accents='unicode')
vectorizer.fit(train_text)

Векторизация текстов

In [28]:
X_train = vectorizer.transform(train_text)
X_test = vectorizer.transform(test_text)

In [29]:
def calculate_metrics(y_pred, y_test):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Print the results
    print('Test accuracy is {}'.format(accuracy))
    print('Test precision is {}'.format(precision))
    print('Test recall is {}'.format(recall))
    print('Test f1 is {}'.format(f1))

def test_model(model, X_train, y_train, X_test, y_test):
    fitted_model = model.fit(X_train, y_train)
    y_pred = fitted_model.predict(X_test)
    calculate_metrics(y_pred, y_test)

Метод опорных векторов

In [30]:
classificatorSVC = SVC(kernel='linear')
test_model(classificatorSVC, X_train, y_train, X_test, y_test)

Test accuracy is 0.7593902103039765
Test precision is 0.7928725456815344
Test recall is 0.8978280426680375
Test f1 is 0.8420925747348119


Дерево решений

In [None]:
classificatorDT = DecisionTreeClassifier(class_weight='balanced')
test_model(classificatorDT, X_train, y_train, X_test, y_test)

Test accuracy is 0.6878501239783268
Test precision is 0.8078111829165496
Test recall is 0.7389795656085336
Test f1 is 0.7718638834821129


Логит. Регрессия

In [None]:
classificatorLR = LogisticRegression(class_weight='balanced')
test_model(classificatorLR, X_train, y_train, X_test, y_test)

Test accuracy is 0.7252272935990449
Test precision is 0.8708378503949202
Test recall is 0.7226577560724843
Test f1 is 0.789858126141312


Градиентный бустинг (Адабуст)

In [None]:
classificatorADA = AdaBoostClassifier()
test_model(classificatorADA, X_train, y_train, X_test, y_test)



Test accuracy is 0.7324823216089632
Test precision is 0.8008652657601978
Test recall is 0.8326693227091634
Test f1 is 0.816457690126646


Случайный лес

In [None]:
classificatorRF = RandomForestClassifier(class_weight='balanced')
test_model(classificatorRF, X_train, y_train, X_test, y_test)

Test accuracy is 0.7588391955184131
Test precision is 0.7885368857046905
Test recall is 0.9052820974167844
Test f1 is 0.8428862031829604


Градиентный бустинг

In [None]:
classificatorGB = GradientBoostingClassifier()
test_model(classificatorGB, X_train, y_train, X_test, y_test)

Test accuracy is 0.7217375332904766
Test precision is 0.721574479992538
Test recall is 0.994216681660455
Test f1 is 0.8362339206572262


К - соседей

In [None]:
classificatorKN = KNeighborsClassifier()
test_model(classificatorKN, X_train, y_train, X_test, y_test)

Test accuracy is 0.33428230324180364
Test precision is 0.9182389937106918
Test recall is 0.07505462022876237
Test f1 is 0.13876678151360342


Проведем кросс-валидацию двух лучших моделей

In [9]:
def cross_validate(model, X_train, X_test, y_train, y_test) -> None:

    """
    Performs 5-fold cross-validation for a classification model, using TF-IDF for text vectorization.

    Args:
        model: The classification model to be trained.
        X_train: Array of training data (text data).
        X_test: Array of test data (text data).
        y_train: Vector of class labels for the training data.
        y_test: Vector of class labels for the test data.

    Returns:
        None. Prints the cross-validation results to the console.
    """

    # Define per-fold score containers
    f1_per_fold = []
    recall_score_per_fold = []
    precison_score_per_fold = []

    # Merge inputs and targets
    inputs = np.concatenate((X_train, X_test), axis=0)
    targets = np.concatenate((y_train, y_test), axis=0)

    # Define the K-fold Cross Validator
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # K-fold Cross Validation model evaluation
    fold_no = 1
    for train, test in kfold.split(inputs, targets):
        vectorizer = TfidfVectorizer(strip_accents='unicode')
        vectorizer.fit(inputs[train])
        X_train = vectorizer.transform(inputs[train])
        X_test = vectorizer.transform(inputs[test])

        y_train = targets[train]
        y_test = targets[test]

        # Generate a print
        print('------------------------------------------------------------------------')
        print(f'Training for fold {fold_no} ...')

        # Fit data to model
        model_fitted = model.fit(X_train, y_train)

        # Generate generalization metrics
        y_pred = model_fitted.predict(X_test)

        f1_per_fold.append(f1_score(y_test, y_pred))
        precison_score_per_fold.append(precision_score(y_test, y_pred))
        recall_score_per_fold.append(recall_score(y_test, y_pred))

        # Increase fold number
        fold_no = fold_no + 1

    # == Provide average scores ==
    print('------------------------------------------------------------------------')
    print('Score per fold')
    for i in range(0, len(f1_per_fold)):
        print('------------------------------------------------------------------------')
        print(
            f'''
            > Fold {i+1}
            - F1: {f1_per_fold[i]}
            - Precision: {precison_score_per_fold[i]}
            - Recall: {recall_score_per_fold[i]}
            ''')
    print('------------------------------------------------------------------------')
    print('Average scores for all folds:')
    print(f'> F1: {np.mean(f1_per_fold)} (+- {np.std(f1_per_fold)})')
    print(f'> Precision: {np.mean(precison_score_per_fold)} (+- {np.std(precison_score_per_fold)})')
    print(f'> Recall: {np.mean(recall_score_per_fold)} (+- {np.std(recall_score_per_fold)})')
    print('------------------------------------------------------------------------')

In [10]:
classificatorSVC = SVC(kernel='linear')
classificatorRF = RandomForestClassifier(class_weight='balanced')

In [11]:
texts = df['text'].astype(str)
relevance = df['relevant']
X_train, X_test, y_train, y_test = train_test_split(texts, relevance, test_size=0.25, random_state=42)


In [12]:
cross_validate(classificatorSVC, X_train, X_test, y_train, y_test)

------------------------------------------------------------------------
Training for fold 1 ...
------------------------------------------------------------------------
Training for fold 2 ...
------------------------------------------------------------------------
Training for fold 3 ...
------------------------------------------------------------------------
Training for fold 4 ...
------------------------------------------------------------------------
Training for fold 5 ...
------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------

            > Fold 1
            - F1: 0.8418924988770774
            - Precision: 0.7894145725115822
            - Recall: 0.9018444266238973
            
------------------------------------------------------------------------

            > Fold 2
            - F1: 0.8448755545529739
            - Precision: 0.7954127141441314
            - Recall:

In [13]:
cross_validate(classificatorRF, X_train, X_test, y_train, y_test)


------------------------------------------------------------------------
Training for fold 1 ...
------------------------------------------------------------------------
Training for fold 2 ...
------------------------------------------------------------------------
Training for fold 3 ...
------------------------------------------------------------------------
Training for fold 4 ...
------------------------------------------------------------------------
Training for fold 5 ...
------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------

            > Fold 1
            - F1: 0.8420267144242967
            - Precision: 0.7873290538654758
            - Recall: 0.9048917401764234
            
------------------------------------------------------------------------

            > Fold 2
            - F1: 0.8429690414024618
            - Precision: 0.7881154972799553
            - Recall:

В итоге наилучшей моделью (из классических) для прогнозирования релевантности текстов стала модель классификатора на основе метода опорных векторов. Она имеет наивысший f1, а также высокий recall около 0.9, что отлично подходит для нашей задачи.


Эту модель можно улучшить, путем подбора гиперпараметров (например методом поиска по сетке). Однако за неимением вычислительных мощностей остановимся на базовом варианте модели.