In [40]:
import pickle
import pandas as pd

# Directory that holds the model pointer file and the clustered messages.
data_path = '../data/'

# The first line of last_model.txt is the path of the pickled vectorizer,
# given relative to the parent directory.
with open(data_path + 'last_model.txt', 'r') as pointer_file:
    vectorizer_file = pointer_file.readline().rstrip('\n')

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load vectorizer files that were produced by a trusted run.
with open('../' + vectorizer_file, 'rb') as pickled_vectorizer:
    vectorizer = pickle.load(pickled_vectorizer)

data = pd.read_csv(data_path + 'messages_with_cluster.csv')
data.head(10)

Unnamed: 0,message,cluster
0,хай,0
1,сделать свой стиль,22
2,скоро удалять приложение нафиг,25
3,окей,0
4,работать,29
5,сделать сеть недавно,28
6,привет,7
7,сало,0
8,спасибо помогать,0
9,здравствовать грузиться список диалог,0


In [42]:
# Remove every row that has a missing value in any column (in place),
# then show the per-column count of remaining missing values.
data.dropna(axis=0, how='any', inplace=True)
data.isnull().sum()

message    0
cluster    0
dtype: int64

In [43]:
# Hard-coded class label for each group of cluster ids. Any cluster id that is
# not listed here is labelled 'undefined' below.
class_to_clusters = {
    'hi': (1, 5, 7),
    'greetings': (2,),
    'encryption': (4,),
    'thanks': (3, 11, 12, 13, 15, 20),
    'music': (8,),
    'suggestion': (14,),
    'question': (16,),
    'my_profile': (22,),
    'no_answer_needed': (6, 26),
    'offline': (28,),
}

# Flatten to the cluster id -> class label lookup used for the mapping.
cluster_to_class = {
    cluster_id: label
    for label, cluster_ids in class_to_clusters.items()
    for cluster_id in cluster_ids
}

# Replace the numeric cluster column with its class label.
data['class'] = data['cluster'].apply(lambda cl_i: cluster_to_class.get(cl_i, 'undefined'))
data.drop(columns=['cluster'], inplace=True)
data.head(10)

Unnamed: 0,message,class
0,хай,undefined
1,сделать свой стиль,my_profile
2,скоро удалять приложение нафиг,undefined
3,окей,undefined
4,работать,undefined
5,сделать сеть недавно,offline
6,привет,hi
7,сало,undefined
8,спасибо помогать,undefined
9,здравствовать грузиться список диалог,undefined


In [44]:
from sklearn.utils import class_weight
import numpy as np

# Per-class weights computed in scikit-learn's 'balanced' mode, in the same
# order as `unique_classes`.
# `classes` and `y` are keyword-only in current scikit-learn releases, so they
# are passed by name; the positional form raises a TypeError there.
unique_classes = np.unique(data['class'])
class_weights = list(class_weight.compute_class_weight(
    'balanced',
    classes=unique_classes,
    y=data['class'],
))
unique_classes, class_weights

(array(['encryption', 'greetings', 'hi', 'music', 'my_profile',
        'no_answer_needed', 'offline', 'question', 'suggestion', 'thanks',
        'undefined'], dtype=object),
 [23.106060606060606,
  3.080808080808081,
  2.390282131661442,
  7.702020202020202,
  6.301652892561983,
  7.29665071770335,
  23.106060606060606,
  17.329545454545453,
  27.727272727272727,
  1.3726372637263726,
  0.11207466745057691])

In [45]:
# One indicator column per class. The 'undefined' column is removed, so rows of
# that class are encoded as all zeros (like drop_first, but dropping 'undefined').
class_ohe = (
    pd.get_dummies(data['class'], prefix='class')
    .drop(columns=['class_undefined'])
)
class_ohe.head(10)

Unnamed: 0,class_encryption,class_greetings,class_hi,class_music,class_my_profile,class_no_answer_needed,class_offline,class_question,class_suggestion,class_thanks
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,1,0,0,0
6,0,0,1,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0


In [46]:
# Turn the message texts into feature vectors with the loaded vectorizer.
messages = data['message']
tfidf_vectors = vectorizer.transform(messages)
tfidf_vectors

<1525x2138 sparse matrix of type '<class 'numpy.float64'>'
	with 6963 stored elements in Compressed Sparse Row format>

In [47]:
from sklearn.decomposition import PCA

# A fractional n_components asks PCA to keep enough components to explain that
# share (here 99%) of the variance.
# PCA needs dense input. .toarray() yields a plain ndarray, whereas .todense()
# returns np.matrix, which recent scikit-learn releases refuse to accept.
pca = PCA(n_components=0.99)
data_pca = pca.fit_transform(tfidf_vectors.toarray())
data_pca.shape

(1525, 950)

In [49]:
# Features and one-hot labels side by side: the row counts must agree.
(data_pca.shape, class_ohe.shape)

((1525, 950), (1525, 10))

In [104]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.neighbors import KNeighborsClassifier

# Hyper-parameter grids for the candidate models. Only `knn_params` is searched
# below; the other grids are kept so a different estimator can be swapped in.
xgb_params = {
    'random_state': [289],
    'n_estimators': [50, 100, 200, 400, 500],
    # XGBClassifier's tree-depth parameter is `max_depth`; a `depth` key would
    # not control the tree depth.
    'max_depth': [None, 1, 2, 3],
    # `sample_weight` is an argument of fit(), not an estimator parameter, so
    # it cannot be searched over here. Pass per-sample weights to
    # gs.fit(..., sample_weight=...) if weighting is needed.
}

et_params = {
    'random_state': [289],
    'n_estimators': [50, 100],
    'max_depth': [None, 6, 7, 8],
    'class_weight': [None, 'balanced'],
}

lr_params = {
    'random_state': [289],
    'class_weight': ['balanced', None],
    'C': [0.01, 0.1, 1, 10],
    'solver': ['lbfgs']
}

knn_params = {
    'n_neighbors': [1, 3, 5],
    'weights': ['uniform', 'distance'],
    'n_jobs': [-1],
    'p': [1, 2]
}

# 5-fold grid search scored with macro-averaged F1 over the one-hot label columns.
gs = GridSearchCV(
    KNeighborsClassifier(),
    knn_params,
    scoring=make_scorer(f1_score, average='macro'),
    cv=5,
    n_jobs=-1,
    verbose=10,
)
gs.fit(data_pca, class_ohe)

# cv_results_ is a dict of equal-length arrays, so it converts to a frame
# directly. A stable sort keeps tied ranks in their original grid order.
results = pd.DataFrame(gs.cv_results_).sort_values(by='rank_test_score', kind='mergesort')

# best_index_ is the position of the winning candidate in cv_results_, which is
# also its row label in `results` (the sort keeps the original index labels).
best_params_row = results.loc[[gs.best_index_]]
mean = best_params_row['mean_test_score'].iloc[0]
std = best_params_row['std_test_score'].iloc[0]
best_params = gs.best_params_
print(f'{mean:.3f} ({std:.4f}) with {best_params}')

# With the default refit=True the search has already retrained the best
# configuration on the full data, so no manual refit is needed.
# The name `xgb` is kept (although the model is a KNN) because the prediction
# cell refers to it.
xgb = gs.best_estimator_
xgb

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   20.7s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   22.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   22.8s finished


0.780 (0.1271) with {'n_jobs': -1, 'n_neighbors': 3, 'p': 2, 'weights': 'distance'}


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=3, p=2,
                     weights='distance')

In [105]:
# Grid-search results table, sorted by rank_test_score (best candidates first).
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_jobs,param_n_neighbors,param_p,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,0.280907,0.052682,0.718993,0.044535,-1,3,2,distance,"{'n_jobs': -1, 'n_neighbors': 3, 'p': 2, 'weig...",0.937302,0.56381,0.739316,0.865714,0.795833,0.780395,0.12707,1
2,0.338828,0.139218,1.045937,0.139854,-1,1,2,uniform,"{'n_jobs': -1, 'n_neighbors': 1, 'p': 2, 'weig...",0.951261,0.530476,0.721581,0.865714,0.786185,0.771043,0.142821,2
3,0.44179,0.111495,0.980833,0.09115,-1,1,2,distance,"{'n_jobs': -1, 'n_neighbors': 1, 'p': 2, 'weig...",0.951261,0.530476,0.721581,0.865714,0.786185,0.771043,0.142821,2
6,0.368693,0.120702,0.993048,0.232652,-1,3,2,uniform,"{'n_jobs': -1, 'n_neighbors': 3, 'p': 2, 'weig...",0.851587,0.56381,0.739316,0.854603,0.795833,0.76103,0.107271,4
11,0.487533,0.157955,0.868316,0.188605,-1,5,2,distance,"{'n_jobs': -1, 'n_neighbors': 5, 'p': 2, 'weig...",0.923016,0.566667,0.697778,0.815714,0.740342,0.748703,0.118892,5
0,0.203471,0.110528,0.880117,0.031403,-1,1,1,uniform,"{'n_jobs': -1, 'n_neighbors': 1, 'p': 1, 'weig...",0.857778,0.561111,0.683229,0.832436,0.734664,0.733844,0.107219,6
1,0.266097,0.096204,0.854876,0.293061,-1,1,1,distance,"{'n_jobs': -1, 'n_neighbors': 1, 'p': 1, 'weig...",0.857778,0.561111,0.683229,0.832436,0.734664,0.733844,0.107219,6
5,0.352,0.042031,1.142842,0.219687,-1,3,1,distance,"{'n_jobs': -1, 'n_neighbors': 3, 'p': 1, 'weig...",0.84619,0.597143,0.686848,0.784678,0.750229,0.733018,0.085284,8
10,0.376827,0.038494,1.193406,0.173822,-1,5,2,uniform,"{'n_jobs': -1, 'n_neighbors': 5, 'p': 2, 'weig...",0.837302,0.566667,0.697778,0.804603,0.726056,0.726481,0.094603,9
9,0.279218,0.079399,0.667333,0.079245,-1,5,1,distance,"{'n_jobs': -1, 'n_neighbors': 5, 'p': 1, 'weig...",0.839957,0.597143,0.670975,0.754678,0.665705,0.705692,0.083683,10


In [106]:
# Hand-written messages used as a quick sanity check of the fitted pipeline.
# NOTE(review): the messages in `data` appear to be lemmatised (e.g.
# 'здравствовать'), while these are raw sentences. Confirm whether the
# vectorizer applies the same preprocessing before trusting these predictions.
real = [
    'привет', 
    'здравствуйте',
    'спасибо',
    'а как посмотреть свой профиль?',
    'а как посмотреть свою страницу?'
]

tf = vectorizer.transform(real)
# .toarray() gives a plain ndarray; .todense() returns np.matrix, which recent
# scikit-learn releases refuse to accept.
pc = pca.transform(tf.toarray())
pred = xgb.predict(pc)

# Each prediction row is a one-hot style vector over the class columns.
# A row that sums to zero has no class switched on and is reported as 'undefined'.
pred_class = np.argmax(pred, axis=1)
pred_sum = np.sum(pred, axis=1)

class_names = class_ohe.columns.to_numpy()  # hoisted: does not change per row
for row_sum, class_idx in zip(pred_sum, pred_class):
    print('undefined' if row_sum == 0 else class_names[class_idx])

class_hi
undefined
class_thanks
class_my_profile
undefined
