In [1]:
import pandas as pd
import seaborn as sns
import re
import numpy as np
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
#from sklearn.neural_network import MLPClassifier

%matplotlib inline

In [2]:
from gensim.models import Word2Vec  



In [2]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [3]:
from sklearn.model_selection import train_test_split
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tim\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tim\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, classification_report

In [5]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_sub = pd.read_csv('sample_submission.csv')

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32800 entries, 0 to 32799
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      32800 non-null  int64 
 1   text    32709 non-null  object
 2   label   32800 non-null  object
dtypes: int64(1), object(2)
memory usage: 768.9+ KB


In [6]:
train = train.dropna()
test = test.dropna()

In [7]:
train['text'] = train['text'].astype('str')

In [8]:
print('Data cleaning in progress...')

Data cleaning in progress...


In [9]:
train['text_clean'] = train['text'].apply(nltk.word_tokenize)
print('Tokenization complete.')

Tokenization complete.


In [10]:
stop_words=set(nltk.corpus.stopwords.words("russian"))
train['text_clean'] = train['text_clean'].apply(lambda x: [item for item in x if item not in stop_words])
print('Stop words removed.')

Stop words removed.


In [11]:
train.head()

Unnamed: 0,id,text,label,text_clean
0,0,Как отключить тариф?,FAQ - тарифы и услуги,"[Как, отключить, тариф, ?]"
1,1,тариф,мобильная связь - тарифы,[тариф]
2,2,тариф,мобильная связь - тарифы,[тариф]
3,3,Здрасте я хотел получить золотую карту,FAQ - тарифы и услуги,"[Здрасте, хотел, получить, золотую, карту]"
4,4,Золотую карту,FAQ - тарифы и услуги,"[Золотую, карту]"


In [12]:
lem = nltk.stem.wordnet.WordNetLemmatizer()
train['text_clean'] = train['text_clean'].apply(lambda x: [lem.lemmatize(item, pos='v') for item in x])
print('Lemmatization complete.\nData cleaning complete.\n')

Lemmatization complete.
Data cleaning complete.



In [13]:
map_label = {'FAQ - тарифы и услуги': 0,
            'мобильная связь - тарифы': 1,
            'Мобильный интернет': 2,
            'FAQ - интернет': 3,
            'тарифы - подбор': 4,
            'Баланс': 5,
            'Мобильные услуги': 6,
            'Оплата': 7,
            'Личный кабинет': 8,
            'SIM-карта и номер': 9,
            'Роуминг': 10,
            'запрос обратной связи': 11,
            'Устройства': 12,
            'мобильная связь - зона обслуживания': 13}

In [14]:
train['label'] = train['label'].map(map_label)

## TF-IDF

In [17]:
def Vectorize(vec, X_train, X_test):    
    
    X_train_vec = vec.fit_transform(X_train)
    X_test_vec = vec.transform(X_test)
    
    print('Vectorization complete.\n')
    
    return X_train_vec, X_test_vec


In [18]:
X_train, X_test, y_train, y_test = train_test_split(train['text_clean'], train['label'], test_size=0.2, shuffle=True)
X_train = X_train.apply(lambda x: ' '.join(x))
X_test = X_test.apply(lambda x: ' '.join(x))
X_train_vec, X_test_vec = Vectorize(TfidfVectorizer(), X_train, X_test)

Vectorization complete.



In [25]:
models = {
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier()
}

In [26]:
params = {'Naive Bayes': { 'alpha': [0.5, 1], 'fit_prior': [True, False] },
         'Random Forest': { 'n_estimators': [1000], 'min_samples_split': [2] }
         }

In [27]:
def ML_modeling(models, params, X_train, X_test, y_train, y_test):    
    
    if not set(models.keys()).issubset(set(params.keys())):
        raise ValueError('Some estimators are missing parameters')

    for key in models.keys():
    
        model = models[key]
        param = params[key]
        gs = GridSearchCV(model, param, cv=5, error_score=0, refit=True, n_jobs=4)
        gs.fit(X_train, y_train)
        y_pred = gs.predict(X_test)
        
        # Print scores for the classifier
        print(key, ':', gs.best_params_)
        print("Accuracy: %1.3f \tPrecision: %1.3f \tRecall: %1.3f \t\tF1: %1.3f\n" % (accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='macro'), recall_score(y_test, y_pred, average='macro'), f1_score(y_test, y_pred, average='macro')))

In [28]:
ML_modeling(models, params, X_train_vec, X_test_vec, y_train, y_test)

Naive Bayes : {'alpha': 1, 'fit_prior': False}
Accuracy: 0.758 	Precision: 0.716 	Recall: 0.676 		F1: 0.691

Random Forest : {'min_samples_split': 2, 'n_estimators': 1000}
Accuracy: 0.794 	Precision: 0.781 	Recall: 0.713 		F1: 0.738



In [21]:
boost_classifier = GradientBoostingClassifier()
param_boost = {'learning_rate': [0.05], 'min_samples_split': [2]}
boost_clf_grid = GridSearchCV(boost_classifier, param_grid=param_boost, cv=5, n_jobs=4)

In [22]:
boost_clf_grid.fit(X_train_vec, y_train)

GridSearchCV(cv=5, estimator=GradientBoostingClassifier(), n_jobs=4,
             param_grid={'learning_rate': [0.05], 'min_samples_split': [2]})

In [23]:
predict_boost = boost_clf_grid.predict(X_test_vec)

In [24]:
print(confusion_matrix(y_test,predict_boost))
print(classification_report(y_test,predict_boost))
print(accuracy_score(y_test, predict_boost))

[[ 362  266    1    3    7   13   45    3    0   60    5    0    0    0]
 [  37 2258    3   13    7   29   34    1    2   10    1    1    2    2]
 [   1   59  183    7    0    3   12    0    0    1    1    0    1    2]
 [   1   72    8   85    6    0    6    2    0    3    0    0    0    0]
 [  27  142    0    6  178    3    5    0    0    2    0    0    0    0]
 [   5  104    6    3    1  409   24   17    0   15    0    1    0    0]
 [  27  235    5    1    0   25  455   49    2   34   18    1    0    0]
 [   0   14    0    0    0   23    2  167    0   16    0    0    1    0]
 [   1   24    0    0    0    2   15    0   75    3    0    0    0    0]
 [   3   25    1    0    0    5    7    0    3  487    0    2    0    0]
 [   3    8    0    0    0    0    4    1    0    2   42    0    0    0]
 [   0    7    1    0    0    0    2    0    0   10    0   36    0    0]
 [   0   25    4    0    0    4    1    0    0    3    0    0   59    1]
 [   0    5   13    0    0    0    1    0    0    0

In [None]:
boost_clf_grid.best_params_

In [30]:
param_rf = {'n_estimators': [1000],
    'min_samples_split': [2]}

In [31]:
classifier = RandomForestClassifier()

In [32]:
grid_clf_rf = GridSearchCV(classifier, param_grid=param_rf, cv=5, n_jobs=4)

In [33]:
grid_clf_rf.fit(X_train_vec, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=4,
             param_grid={'min_samples_split': [2], 'n_estimators': [1000]})

In [34]:
grid_clf_rf.best_params_

{'min_samples_split': 2, 'n_estimators': 1000}

In [35]:
best_grid_clf_rf = grid_clf_rf.best_estimator_

In [36]:
predict_grid = best_grid_clf_rf.predict(X_test_vec)

In [37]:
print(confusion_matrix(y_test,predict_grid))
print(classification_report(y_test,predict_grid))
print(accuracy_score(y_test, predict_grid))

[[ 434  180    2    3   13   13   29    3    1   42    3    0    0    0]
 [  46 2207    6    9   29   23   31    2    2    7    0    6    4    1]
 [   1   32  220    6    0    2    6    0    0    4    0    0    4    1]
 [   3   34    8  107    9    7    3    0    2    1    0    0    0    0]
 [  21  117    1    5  230    1    5    0    0    0    0    0    0    0]
 [   5   51    5    5    0  508   17   12    4    6    2    0    4    0]
 [  24  104    6    3    3   22  610   12    5   13   11    1    1    0]
 [   1    8    0    0    0   19   18  180    1    4    1    0    1    0]
 [   3   11    0    0    0    5   17    0   87    4    0    0    1    0]
 [   8   13    0    1    0    5    9    2    1  502    0   15    0    0]
 [   4    5    1    1    0    2   10    2    0    2   52    0    1    0]
 [   0    9    0    0    0    2    2    0    0    0    0   49    0    0]
 [   2    8    6    0    0    0    5    0    1    7    0    1   61    0]
 [   0    4   12    0    0    1    1    0    1    0