# Classic NLP Cheat Sheet (Pre-Transformers)
Русский и английский NLP: TF-IDF, SVM, NB

In [None]:

# Base imports
import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, classification_report


## Russian preprocessing

In [None]:

from razdel import tokenize
import pymorphy2

# Create the morphological analyzer once at module level; lemmatize_ru()
# reuses this single instance for every token it normalizes.
morph = pymorphy2.MorphAnalyzer()

def lemmatize_ru(text):
    """Return *text* as a single space-separated string of lowercase lemmas.

    The text is split with razdel's ``tokenize`` and every token is
    lowercased. Tokens that are not purely alphabetic are dropped; each
    remaining token is replaced by the ``normal_form`` of the first parse
    candidate returned by the module-level ``morph`` analyzer.
    """
    lemmas = []
    for token in tokenize(text):
        word = token.text.lower()
        if not word.isalpha():
            # Anything that is not letters only (punctuation, digits,
            # hyphenated or mixed tokens) is left out of the result.
            continue
        lemmas.append(morph.parse(word)[0].normal_form)
    return " ".join(lemmas)


## TF-IDF setups

In [None]:

# Word-level TF-IDF over unigrams and bigrams.
# min_df=5 drops terms found in fewer than 5 documents; max_df=0.9 drops
# terms found in more than 90% of documents.
word_tfidf = TfidfVectorizer(analyzer="word", ngram_range=(1, 2), min_df=5, max_df=0.9)

# Character-level TF-IDF over 3- to 5-character n-grams taken from the raw
# character stream (analyzer="char", so n-grams may span word boundaries).
# min_df=5 drops n-grams found in fewer than 5 documents; max_df=0.9 drops
# n-grams found in more than 90% of documents.
char_tfidf = TfidfVectorizer(analyzer="char", ngram_range=(3, 5), min_df=5, max_df=0.9)


## Pipelines

In [None]:

# Character n-gram TF-IDF features fed into a linear SVM.
# C=0.1 regularizes more strongly than the library default of 1.0, and
# class_weight="balanced" weights classes inversely to their frequency.
svm_pipeline = Pipeline(
    steps=[
        ("tfidf", char_tfidf),
        ("clf", LinearSVC(C=0.1, class_weight="balanced")),
    ]
)

# Word n-gram TF-IDF features fed into logistic regression.
# max_iter=1000 raises the solver's iteration cap; class_weight="balanced"
# weights classes inversely to their frequency.
logreg_pipeline = Pipeline(
    steps=[
        ("tfidf", word_tfidf),
        ("clf", LogisticRegression(max_iter=1000, class_weight="balanced")),
    ]
)

# Multinomial Naive Bayes on raw term counts (CountVectorizer, not TF-IDF)
# over word unigrams and bigrams that occur in at least 5 documents.
# Note that the vectorizer step is named "tf" here, not "tfidf".
nb_pipeline = Pipeline(
    steps=[
        ("tf", CountVectorizer(ngram_range=(1, 2), min_df=5)),
        ("clf", MultinomialNB()),
    ]
)


## Training & Evaluation

In [None]:

# Example (X: iterable of raw text documents, y: the matching class labels):
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# svm_pipeline.fit(X_train, y_train)
# y_pred = svm_pipeline.predict(X_test)
# print(f1_score(y_test, y_pred, average='macro'))
