# Build plain Random Forest classifier on Tf-Idf

In [137]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report

# https://www.kaggle.com/code/neerajmohan/nlp-text-classification-using-tf-idf-features

t = pd.read_excel(f'{sdir}t4m_2022_05_05__21_59_v3.xlsx')

df = t[['text', 'class_name']].copy()
df['label'] = df['class_name'].map(cls2idx)
df['label'].value_counts()

tmp = df[['text','label']].copy()

data2 = tmp.copy()


data2 = data2[['text', 'label']].set_axis(['headline', 'label'], axis=1)
data2=Dataset.from_pandas(data2)

# 80% train, 20% test + validation
train_testvalid = data2.train_test_split(test_size=0.3,seed=15)

# Split the 10% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(test_size=0.5,seed=15)

# gather everything to have a single DatasetDict
data2 = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train'],
                   })

tfidf_vectorizer = TfidfVectorizer() 

Xy_train = train_testvalid['train'].to_pandas()
Xy_test = test_valid['test'].to_pandas()
Xy_val = test_valid['train'].to_pandas()

X_train, y_train = Xy_train['headline'], Xy_train['label']
X_val, y_val = Xy_val['headline'], Xy_val['label']
X_test, y_test = Xy_test['headline'], Xy_test['label']

X_train = pd.concat([X_train, X_val], ignore_index=True)
y_train = y_train.values.tolist() + y_val.values.tolist()

tfidf_train_vectors = tfidf_vectorizer.fit_transform(X_train)

tfidf_test_vectors = tfidf_vectorizer.transform(X_test)

In [138]:
classifier = RandomForestClassifier()

classifier.fit(tfidf_train_vectors,y_train)

In [139]:
y_pred = classifier.predict(tfidf_test_vectors)
print(classification_report(y_test,y_pred, target_names=[v.replace('_', ' ') for i,v in idx2cls.items()]))

                                           precision    recall  f1-score   support

                               обновление       0.75      0.60      0.67        25
                                   другое       0.55      0.70      0.62        47
                                     цена       0.76      0.65      0.70        20
                               лояльность       0.75      0.87      0.81        46
                          создание заказа       0.38      0.50      0.43        28
                             обслуживание       0.45      0.36      0.40        28
                 долгое ожидание доставки       0.60      0.50      0.55        24
                       глюки баги тормоза       0.53      0.56      0.55        48
                                   купоны       0.65      0.76      0.70        17
                           доставка общее       0.70      0.69      0.70        45
                                  аккаунт       0.81      0.71      0.76        24
   

# Build CatBoostClassifier on Tf-Idf

In [140]:
from catboost import CatBoostClassifier
from sklearn.utils.class_weight import compute_class_weight

tfidf_vectorizer = TfidfVectorizer() 

Xy_train = train_testvalid['train'].to_pandas()

###############

Xy = Xy_train.copy()
target = 'headline'

cnts = pd.DataFrame(Xy['label'].value_counts()).reset_index().set_axis(['cname', 'cnt'], axis=1).\
    assign(needed=lambda row: row['cnt'].max()-row['cnt'])

rows2add = []
for ind, row in cnts.iterrows():
    smpl = Xy.query(f'label == {row["cname"]}').sample(row['needed'], replace=True)
    for ind2, row2 in smpl.iterrows():
        rows2add.append([random_insert2str(row2[target]), row2['label']])

Xy = pd.concat([Xy, pd.DataFrame(rows2add, columns = Xy.columns)]).sample(frac=1)

Xy_train = Xy.copy()

###############


Xy_test = test_valid['test'].to_pandas()
Xy_val = test_valid['train'].to_pandas()

X_train, y_train = Xy_train['headline'], Xy_train['label']
X_val, y_val = Xy_val['headline'], Xy_val['label']
X_test, y_test = Xy_test['headline'], Xy_test['label']

tfidf_train_vectors = tfidf_vectorizer.fit_transform(X_train)
tfidf_test_vectors = tfidf_vectorizer.transform(X_test)
tfidf_val_vectors = tfidf_vectorizer.transform(X_val)

classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))

params_cat = {
      'eval_metric':'TotalF1', 
      'iterations':5000,
      'verbose':500,
      'early_stopping_rounds':550,
      'border_count': 254,
      'use_best_model':True,
      'class_weights':class_weights,
    'task_type':'CPU'
  }


model = CatBoostClassifier(**params_cat)
preds = model.fit(tfidf_train_vectors, y_train,
                 eval_set=(tfidf_val_vectors, y_val)
                 ).predict(tfidf_test_vectors)

print(classification_report(y_test, preds, target_names=[v.replace('_', ' ') for i,v in idx2cls.items()]))

Learning rate set to 0.059766
0:	learn: 0.2616644	test: 0.2360951	best: 0.2360951 (0)	total: 121ms	remaining: 10m 7s
500:	learn: 0.7799458	test: 0.6024453	best: 0.6041279 (498)	total: 21s	remaining: 3m 8s
1000:	learn: 0.8255710	test: 0.6071514	best: 0.6108478 (949)	total: 41.7s	remaining: 2m 46s
1500:	learn: 0.8532676	test: 0.6161412	best: 0.6186343 (1375)	total: 1m 2s	remaining: 2m 25s
2000:	learn: 0.8677961	test: 0.6188620	best: 0.6188620 (1989)	total: 1m 23s	remaining: 2m 4s
2500:	learn: 0.8800350	test: 0.6189548	best: 0.6205092 (2312)	total: 1m 43s	remaining: 1m 43s
3000:	learn: 0.8954127	test: 0.6204597	best: 0.6213445 (2838)	total: 2m 4s	remaining: 1m 22s
Stopped by overfitting detector  (550 iterations wait)

bestTest = 0.6213444566
bestIteration = 2838

Shrink model to first 2839 iterations.
                                           precision    recall  f1-score   support

                               обновление       0.69      0.88      0.77        25
                      

# Prepare for random insert

In [1]:
rand_ins = [
    'app store',
    'google play',
    'burger king',
#     'macdonalds',
#     'rostics',
#     'kfc',
    'ресторан стоит',
    'телефон работает',
    'еду кушают',
    'ночью спят',
    'днем работают',
    'обед днем',
    'завтрак утром',
    'ужин вечером'
]


In [48]:
def random_insert2str(s0):
    
    s = s0.strip()
    spaces = [i for i, ltr in enumerate(s) if ltr == ' ']
    inserts = []
    if len(spaces) > 0:
        inserts=np.random.choice(rand_ins, len(spaces)//5 + 1)
    else:
        return s + ' ' + np.random.choice(rand_ins, 1)[0]

    spaces2ins = np.random.choice(spaces, len(inserts))

    ss = ''
    i0 = 0
    for ind, i in enumerate(spaces2ins):
        ss += s[i0:i] + f' {inserts[ind]}'
        i0 = i
    if i0 < len(s):
        ss += s[i0:]
    return ss

# Load file and define function to compare F1 with different data

In [110]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report
from datasets import load_dataset,Dataset,DatasetDict, load_metric
import numpy as np
import pandas as pd

# https://www.kaggle.com/code/neerajmohan/nlp-text-classification-using-tf-idf-features

t = pd.read_excel(f'{sdir}t4m_2022_05_05__21_59_v3.xlsx')
cnames = list(set(t['class_name'].values))
idx2cls = {i:v for i,v in enumerate(cnames)}
cls2idx = {v:i for i,v in enumerate(cnames)}

df = t[['content', 'text_1', 'text_2', 'text_3_0', 'text_3', 'text_5', 'text_6', 'class_name']].copy()
df['label'] = df['class_name'].map(cls2idx)
df['label'].value_counts()

tmp = df[['content', 'text_1', 'text_2', 'text_3_0', 'text_3', 'text_5', 'text_6','label']].copy()


data2 = tmp.copy()


data2 = data2[['content', 'text_1', 'text_2', 'text_3_0', 'text_3', 'text_5', 'text_6', 'label']].copy()
data2=Dataset.from_pandas(data2)

# 80% train, 20% test + validation
train_testvalid = data2.train_test_split(test_size=0.3,seed=15)

# Split the 10% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(test_size=0.5,seed=15)

# gather everything to have a single DatasetDict
data2 = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train'],
                   })

# tfidf_vectorizer = TfidfVectorizer() 

Xy_train = train_testvalid['train'].to_pandas()
Xy_test = test_valid['test'].to_pandas()
Xy_val = test_valid['train'].to_pandas()


def score_text(target, out=False, random_insert=False):
    X_train, y_train = Xy_train[target], Xy_train['label']
    X_val, y_val = Xy_val[target], Xy_val['label']
    X_test, y_test = Xy_test[target], Xy_test['label']

    X_train = pd.concat([X_train, X_val], ignore_index=True)
    y_train = y_train.values.tolist() + y_val.values.tolist()
    

    ##################################

    if random_insert:
        Xy = pd.DataFrame({target:X_train.values.tolist(), 'label':y_train})

        cnts = pd.DataFrame(Xy['label'].value_counts()).reset_index().set_axis(['cname', 'cnt'], axis=1).\
            assign(needed=lambda row: row['cnt'].max()-row['cnt'])

        rows2add = []
        for ind, row in cnts.iterrows():
            smpl = Xy.query(f'label == {row["cname"]}').sample(row['needed'], replace=True)
            for ind2, row2 in smpl.iterrows():
                rows2add.append([random_insert2str(row2[target]), row2['label']])

        Xy = pd.concat([Xy, pd.DataFrame(rows2add, columns = Xy.columns)]).sample(frac=1)

        X_train = Xy[target]
        y_train = Xy['label'].values.tolist()

    ##################################

    tfidf_vectorizer = TfidfVectorizer() 
    
    tfidf_train_vectors = tfidf_vectorizer.fit_transform(X_train)

    tfidf_test_vectors = tfidf_vectorizer.transform(X_test)

    classifier = RandomForestClassifier()

    classifier.fit(tfidf_train_vectors,y_train)

    y_pred = classifier.predict(tfidf_test_vectors)
    if out: print(classification_report(y_test,y_pred))
    
    return classification_report(y_test,y_pred, output_dict=True, target_names=list(cls2idx.keys()))

def score_text2(target, n=15, random_insert=False):
    res = []
    for i in range(n):
        res.append(score_text(target, random_insert=random_insert)['macro avg']['f1-score'])
        
    return np.mean(np.array(res)), np.std(np.array(res))

# Find out F1 increment for each data preparation step

In [107]:
for param in ['content', 'text_1', 'text_2', 'text_3_0', 'text_3', 'text_5', 'text_6']:
    f,s = score_text2(param, n=25)
    print(f'testing {param}: F1 = {f:.3f} ± {s:.3f}')

testing content: F1 = 0.628 ± 0.011
testing text_1: F1 = 0.630 ± 0.011
testing text_2: F1 = 0.627 ± 0.009
testing text_3_0: F1 = 0.644 ± 0.010
testing text_3: F1 = 0.641 ± 0.009
testing text_5: F1 = 0.645 ± 0.009


# Random insert increment

In [115]:
param = 'text_5'
f,s = score_text2(param, n=5, random_insert=True)
print(f'testing {param}: F1 = {f:.3f} ± {s:.3f}')

testing text_5: F1 = 0.651 ± 0.009


# Random insert increment wo business replacements

In [111]:
param = 'text_6'
f,s = score_text2(param, n=5, random_insert=True)
print(f'testing {param}: F1 = {f:.3f} ± {s:.3f}')

testing text_6: F1 = 0.651 ± 0.003
