In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

import random
import numpy as np
from tqdm import tqdm
tqdm.pandas()
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=False)

from utils import read_csv_data
from cleaner import clean

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [2]:
def seed_all(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)

seed_all(42)

In [3]:
df: pd.DataFrame = read_csv_data('../../data/dataset.csv')

In [4]:
len(df)

764

In [5]:
df['class'] = df['class'].astype('category')
df = df.drop(df[df['class'] == 'other'].index)

encoder = LabelEncoder()
encoder.fit(df['class'])

df['class'] = encoder.transform(df['class'])

In [6]:
df['text'] = df['text'].parallel_apply(clean)

In [7]:
X = df['text']
y = df['class']

In [8]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.1, random_state=42)

In [9]:
TfIdf = TfidfVectorizer(max_features=500)
TfIdf.fit(X)
X_train_vectorized = TfIdf.transform(X_train)
X_test_vectorized = TfIdf.transform(X_test)

In [22]:
print(TfIdf.vocabulary_)

{'прийти': 328, 'вчера': 65, 'завтра': 136, 'сегодня': 383, 'ждать': 126, 'угол': 456, 'общий': 260, 'солнце': 403, 'проблема': 339, 'наркотик': 235, 'сказать': 394, 'власть': 47, 'жена': 127, 'улица': 459, 'ты': 454, 'выходить': 70, 'налоговый': 233, 'утро': 465, 'делать': 95, 'нравиться': 255, 'принести': 331, 'хотеть': 480, 'домой': 110, 'гореть': 81, 'энергия': 497, 'покупать': 305, 'просто': 347, 'женщина': 131, 'простой': 348, 'красивый': 185, 'открывать': 273, 'такой': 441, 'ответ': 268, 'заваляться': 135, 'верхний': 35, 'продавец': 343, 'полка': 309, 'сторона': 422, 'казаться': 170, 'посмотреть': 318, 'держать': 99, 'один': 261, 'еврейский': 119, 'сильно': 393, 'пора': 315, 'день': 97, 'спать': 406, 'коньяк': 182, 'студент': 426, 'это': 498, 'россия': 369, 'равно': 356, 'нужный': 256, 'заходить': 154, 'обращаться': 258, 'спасибо': 405, 'телефон': 443, 'мужик': 221, 'суп': 432, 'дорогой': 112, 'первый': 289, 'женский': 130, 'номер': 253, 'третий': 450, 'второй': 64, 'единственны

In [23]:
linear_param_grid = {
    'dual': [False, True],
    'C': [0.001, 0.1, 0.5, 1, 5, 10, 100]
}

grid = GridSearchCV(svm.LinearSVC(), linear_param_grid , scoring='f1_micro', refit=True , verbose=3)
grid.fit(X_train_vectorized, y_train)

Fitting 5 folds for each of 14 candidates, totalling 70 fits
[CV 1/5] END ...............C=0.001, dual=False;, score=0.635 total time=   0.0s
[CV 2/5] END ...............C=0.001, dual=False;, score=0.635 total time=   0.0s
[CV 3/5] END ...............C=0.001, dual=False;, score=0.642 total time=   0.0s
[CV 4/5] END ...............C=0.001, dual=False;, score=0.642 total time=   0.0s
[CV 5/5] END ...............C=0.001, dual=False;, score=0.632 total time=   0.0s
[CV 1/5] END ................C=0.001, dual=True;, score=0.635 total time=   0.0s
[CV 2/5] END ................C=0.001, dual=True;, score=0.635 total time=   0.0s
[CV 3/5] END ................C=0.001, dual=True;, score=0.642 total time=   0.0s
[CV 4/5] END ................C=0.001, dual=True;, score=0.642 total time=   0.0s
[CV 5/5] END ................C=0.001, dual=True;, score=0.632 total time=   0.0s
[CV 1/5] END .................C=0.1, dual=False;, score=0.635 total time=   0.0s
[CV 2/5] END .................C=0.1, dual=False;



In [24]:
grid.best_score_

0.639407894736842

In [25]:
grid.best_estimator_

In [26]:
svc_param_grid = {
    'kernel': ('linear', 'rbf', 'poly'),
    'C': [0.001, 0.1, 0.5, 1, 5, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001], 
    'degree' : [1, 2, 3, 4, 5, 6]
}

grid = GridSearchCV(svm.SVC() , svc_param_grid, scoring='f1_micro', refit=True , verbose=3)
grid.fit(X_train_vectorized, y_train)

Fitting 5 folds for each of 504 candidates, totalling 2520 fits
[CV 1/5] END C=0.001, degree=1, gamma=1, kernel=linear;, score=0.635 total time=   0.0s
[CV 2/5] END C=0.001, degree=1, gamma=1, kernel=linear;, score=0.635 total time=   0.0s
[CV 3/5] END C=0.001, degree=1, gamma=1, kernel=linear;, score=0.642 total time=   0.0s
[CV 4/5] END C=0.001, degree=1, gamma=1, kernel=linear;, score=0.642 total time=   0.0s
[CV 5/5] END C=0.001, degree=1, gamma=1, kernel=linear;, score=0.632 total time=   0.0s
[CV 1/5] END C=0.001, degree=1, gamma=1, kernel=rbf;, score=0.635 total time=   0.0s
[CV 2/5] END C=0.001, degree=1, gamma=1, kernel=rbf;, score=0.635 total time=   0.0s
[CV 3/5] END C=0.001, degree=1, gamma=1, kernel=rbf;, score=0.642 total time=   0.0s
[CV 4/5] END C=0.001, degree=1, gamma=1, kernel=rbf;, score=0.642 total time=   0.0s
[CV 5/5] END C=0.001, degree=1, gamma=1, kernel=rbf;, score=0.632 total time=   0.0s
[CV 1/5] END C=0.001, degree=1, gamma=1, kernel=poly;, score=0.635 tota

In [27]:
grid.best_score_

0.6499122807017544

In [28]:
kfold = StratifiedKFold(5, shuffle=True, random_state=42)
X_np = X.to_numpy()
y_np = y.to_numpy()
for train, test in kfold.split(X_np, y_np):
	print('train: %s, test: %s' % (len(X_np[train]), len(y_np[test])))

train: 424, test: 106
train: 424, test: 106
train: 424, test: 106
train: 424, test: 106
train: 424, test: 106


In [29]:
def search(X, y, max_features_range: tuple[int, int, int], test_size: float, scoring: str):
    if test_size != 0:
        X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=test_size, random_state=42)
    else:
        X_train = X
        y_train = y

    linear_param_grid = {
        'dual': [False, True],
        'C': [0.001, 0.1, 0.5, 1, 5, 10, 100],
        
    }

    svc_param_grid = {
        'kernel': ('linear', 'rbf', 'poly'),
        'C': [0.001, 0.1, 0.5, 1, 5, 10, 100],
        'gamma': [1, 0.1, 0.01, 0.001], 
        'degree' : [1, 2, 3, 4, 5, 6]
    }

    start, end, step = max_features_range

    linear_best = (0,)
    svc_best = (0,)

    # kfold = StratifiedKFold(5, shuffle=True, random_state=42)
    # X_np = X.to_numpy()
    # y_np = y.to_numpy()

    for max_features in range(start, end, step):
        # for train, _ in kfold.split(X_np, y_np):
        # X_train = X_np[train]
        # y_train = y_np[train]

        TfIdf = TfidfVectorizer(max_features=max_features)
        TfIdf.fit(X_train)
        X_train_vectorized = TfIdf.transform(X_train)

        # scoring = {'micro': 'f1_micro', 'macro': 'f1_macro'}

        linear_grid = GridSearchCV(svm.LinearSVC(), linear_param_grid, cv=5, scoring=scoring, refit=True , verbose=0)
        linear_grid.fit(X_train_vectorized, y_train)
        if linear_best[0] < linear_grid.best_score_:
            linear_best = (linear_grid.best_score_, linear_grid, max_features)

        svc_grid = GridSearchCV(svm.SVC() , svc_param_grid, scoring=scoring, cv=5, refit=True , verbose=0)
        svc_grid.fit(X_train_vectorized, y_train)
        if svc_best[0] < svc_grid.best_score_:
            svc_best = (svc_grid.best_score_, svc_grid, max_features)
    
    return (linear_best, svc_best)

In [21]:
linear_best, svc_best = search(X, y, (500, 5000, 500), 0.0, 'f1_micro')

ValueError: 
All the 70 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
70 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Vill\anaconda3\envs\ods\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Vill\anaconda3\envs\ods\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
TypeError: fit() got an unexpected keyword argument 'max_iter'


In [111]:
linear_best

(0.7169811320754716,
 GridSearchCV(cv=5, estimator=LinearSVC(),
              param_grid={'C': [0.001, 0.1, 0.5, 1, 5, 10, 100],
                          'dual': [False, True]},
              scoring='f1_micro'),
 1000)

In [112]:
svc_best

(0.7132075471698113,
 GridSearchCV(cv=5, estimator=SVC(),
              param_grid={'C': [0.001, 0.1, 0.5, 1, 5, 10, 100],
                          'degree': [1, 2, 3, 4, 5, 6],
                          'gamma': [1, 0.1, 0.01, 0.001],
                          'kernel': ('linear', 'rbf', 'poly')},
              scoring='f1_micro'),
 500)

In [106]:
linear_best, svc_best = search(X, y, (500, 5000, 500), 0.0, 'f1_macro')



In [107]:
linear_best

(0.37600118509401215,
 GridSearchCV(estimator=LinearSVC(),
              param_grid={'C': [0.001, 0.1, 0.5, 1, 5, 10, 100],
                          'dual': [False, True]},
              scoring='f1_macro'),
 500)

In [108]:
svc_best

(0.3843900874976397,
 GridSearchCV(estimator=SVC(),
              param_grid={'C': [0.001, 0.1, 0.5, 1, 5, 10, 100],
                          'degree': [1, 2, 3, 4, 5, 6],
                          'gamma': [1, 0.1, 0.01, 0.001],
                          'kernel': ('linear', 'rbf', 'poly')},
              scoring='f1_macro'),
 500)

In [11]:
# X_train = pd.read_csv('../../data/train.csv')
# y_train = X_train['class']
# X_train.drop(columns=['class'], inplace=True)
# X_train['text'] = X_train['text'].parallel_apply(clean)

# X_test = pd.read_csv('../../data/test.csv')
# y_test = X_test['class']
# X_test.drop(columns=['class'], inplace=True)
# X_test['text'] = X_test['text'].parallel_apply(clean)

129    0
514    0
377    0
638    1
259    0
      ..
102    0
154    0
380    0
615    0
148    1
Name: class, Length: 477, dtype: int32

In [10]:
from sklearn.pipeline import Pipeline

svm_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', svm.LinearSVC(random_state=42))
])

param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (2, 3), (1, 3)],
    'svm__dual': [False, True],
    'svm__C': [0.001, 0.1, 0.5, 1, 5, 10, 100],
    'svm__max_iter': [5000]
}

gs_svm = GridSearchCV(
    svm_pipeline, param_grid, cv=5, verbose=2, scoring='f1_micro', refit=True)
gs_svm = gs_svm.fit(X_train, y_train)

# 'kernel': ('linear', 'rbf', 'poly'),
# 'C': [0.001, 0.1, 0.5, 1, 5, 10, 100],
# 'gamma': [1, 0.1, 0.01, 0.001], 
# 'degree' : [1, 2, 3, 4, 5, 6]

Fitting 5 folds for each of 56 candidates, totalling 280 fits
[CV] END svm__C=0.001, svm__dual=False, svm__max_iter=5000, tfidf__ngram_range=(1, 1); total time=   0.0s
[CV] END svm__C=0.001, svm__dual=False, svm__max_iter=5000, tfidf__ngram_range=(1, 1); total time=   0.0s
[CV] END svm__C=0.001, svm__dual=False, svm__max_iter=5000, tfidf__ngram_range=(1, 1); total time=   0.0s
[CV] END svm__C=0.001, svm__dual=False, svm__max_iter=5000, tfidf__ngram_range=(1, 1); total time=   0.0s
[CV] END svm__C=0.001, svm__dual=False, svm__max_iter=5000, tfidf__ngram_range=(1, 1); total time=   0.0s
[CV] END svm__C=0.001, svm__dual=False, svm__max_iter=5000, tfidf__ngram_range=(1, 2); total time=   0.0s
[CV] END svm__C=0.001, svm__dual=False, svm__max_iter=5000, tfidf__ngram_range=(1, 2); total time=   0.0s
[CV] END svm__C=0.001, svm__dual=False, svm__max_iter=5000, tfidf__ngram_range=(1, 2); total time=   0.0s
[CV] END svm__C=0.001, svm__dual=False, svm__max_iter=5000, tfidf__ngram_range=(1, 2); tot

In [11]:
gs_svm.best_score_

0.6414912280701754

In [14]:
from sklearn.pipeline import Pipeline

svm_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', svm.SVC(random_state=42))
])

param_grid = {
    'tfidf__min_df': [1],
    'tfidf__max_df': [1, 2],
    'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2), (1, 3), (2, 3), (3, 3)],
    'svm__kernel': ('linear', 'rbf', 'poly'),
    'svm__C': [0.001, 0.1, 0.5, 1, 5, 10, 100],
    'svm__gamma': [1, 0.1, 0.01, 0.001], 
    'svm__degree': [1, 2, 3, 4, 5, 6],
    'svm__decision_function_shape': ['ovr', 'ovo'],
    'svm__max_iter': [5000]
}

gs_svm = GridSearchCV(svm_pipeline, param_grid, cv=5, verbose=0, scoring='f1_micro', refit=True)
gs_svm = gs_svm.fit(X_train, y_train)

In [18]:
gs_svm.best_score_

0.6373245614035088

In [19]:
gs_svm.best_estimator_

In [23]:
gs_svm.best_estimator_.steps[1][1].__dict__

{'decision_function_shape': 'ovr',
 'break_ties': False,
 'kernel': 'linear',
 'degree': 1,
 'gamma': 1,
 'coef0': 0.0,
 'tol': 0.001,
 'C': 0.001,
 'nu': 0.0,
 'epsilon': 0.0,
 'shrinking': True,
 'probability': False,
 'cache_size': 200,
 'class_weight': None,
 'verbose': False,
 'max_iter': 5000,
 'random_state': 42,
 '_sparse': True,
 'n_features_in_': 1952,
 'class_weight_': array([1., 1., 1.]),
 'classes_': array([0, 1, 2]),
 '_gamma': 1,
 'support_': array([212, 216, 218, 219, 220, 221, 223, 224, 226, 227, 231, 232, 233,
        237, 238, 239, 241, 244, 245, 246, 248, 249, 251, 254, 255, 256,
        261, 264, 265, 266, 267, 270, 271, 272, 274, 275, 277, 279, 280,
        283, 284, 285, 287, 293, 295, 296, 297, 298, 299, 300, 302, 304,
        305, 307, 308, 311, 312, 314, 317, 318, 319, 320, 321, 322, 323,
        325, 326, 327, 328, 329, 330, 332, 334, 335, 337, 338, 339, 340,
        341, 342, 344, 345, 346, 347, 348, 350, 351, 352, 353, 354, 355,
        357, 358, 360, 362, 

In [24]:
gs_svm.best_estimator_.steps[0][1].__dict__

{'input': 'content',
 'encoding': 'utf-8',
 'decode_error': 'strict',
 'strip_accents': None,
 'preprocessor': None,
 'tokenizer': None,
 'analyzer': 'word',
 'lowercase': True,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'stop_words': None,
 'max_df': 1,
 'min_df': 1,
 'max_features': None,
 'ngram_range': (1, 1),
 'vocabulary': None,
 'binary': False,
 'dtype': numpy.float64,
 'norm': 'l2',
 'use_idf': True,
 'smooth_idf': True,
 'sublinear_tf': False,
 '_tfidf': TfidfTransformer(),
 'fixed_vocabulary_': False,
 '_stop_words_id': 140703619202264,
 'stop_words_': {'10',
  '100',
  '12',
  '15',
  '2000',
  '30',
  '500',
  '90',
  'автомат',
  'автомобиль',
  'ад',
  'адвокат',
  'административный',
  'алкоголик',
  'аня',
  'апрель',
  'аптека',
  'ах',
  'баба',
  'бабка',
  'бабуля',
  'бабушка',
  'багажник',
  'бакс',
  'бегать',
  'бедный',
  'бежать',
  'белый',
  'берег',
  'бессмысленный',
  'билет',
  'бить',
  'бл',
  'благодаря',
  'блин',
  'блондинка',
  'бог',
  'богатый',


In [29]:
y_test.tolist()

[1,
 0,
 0,
 2,
 0,
 0,
 2,
 1,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 2,
 2,
 0,
 0,
 0,
 2,
 2,
 0,
 0,
 0,
 2,
 0,
 0,
 2,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 2,
 1,
 0,
 2,
 0,
 0,
 2,
 2,
 0,
 1,
 0,
 1,
 1,
 2,
 0]

In [31]:
y_pred = gs_svm.best_estimator_.predict(X_test.tolist())

In [51]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0])

In [32]:
f1_score(y_test, y_pred, average='micro')

0.5660377358490566

In [39]:
df.iloc[6]

text     сложный усложнить простой просто женщина
class                                           0
Name: 11, dtype: object

In [40]:
dfRaw: pd.DataFrame = read_csv_data('../../data/dataset.csv')

In [48]:
dfRaw.iloc[469]['text']

'Если еврей сказал, что не брал, значит он не отдаст.'

In [47]:
X_test[469]

'сказать значит отдать брать еврей'

In [50]:
gs_svm.best_estimator_.predict([X_test[469]])

array([0])