In [20]:
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from warnings import filterwarnings
filterwarnings('ignore')

In [7]:
# Load the comments dataset; its columns are described in the markdown cell below.
# NOTE(review): read_pickle unpickles the file, and unpickling can execute arbitrary
# code — only load this file when it comes from a trusted source.
df = pd.read_pickle('data/prp_df.pkl')

### Columns:

* **text_id** - ID of initial text
* **text** - initial raw text
* **comment** - initial raw comment to text
* **prp_text** - preprocessed cleaned text
* **prp_com** - preprocessed cleaned comment
* **score** - initial labels
* **text_words_qty** - number of words in preprocessed text
* **comment_words_qty** - number of words in preprocessed comment
* **repeat_words** - number of repeated words in text and in comment
* **repeat_rate_words** - ratio of repeated words to the number of words in comment
* **link** - if there is a link in comment
* **quoted** - if there is a quotation in comment
* **wr_sum** - sum of word weights in comment (weights are based on popularity)
* **wr_len** - number of popular words in comment
* **wr_rate** - ratio of *wr_sum* to *wr_len*
* **wr_rate_tot** - ratio of *wr_sum* to *comment_words_qty*

### Making labels for binary classification

In [8]:
# Binary targets derived from the raw score:
#   lbl_best  -> 1 only where score == 0 (best comments), else 0
#   lbl_worst -> 1 only where score == 4 (worst comments), else 0
df['lbl_best'] = df['score'].eq(0).astype(int)
df['lbl_worst'] = df['score'].eq(4).astype(int)

In [9]:
# Preview the first six rows, including the two label columns.
df.head(n=6)

Unnamed: 0,text_id,text,comment,prp_text,prp_com,score,text_words_qty,comment_words_qty,repeat_words,repeat_rate_words,link,quoted,wr_sum,wr_len,wr_rate,wr_rate_tot,lbl_best,lbl_worst
0,0,0^0,- &quot;0^0. Why? Because mathematicians said ...,0 0,quot 0 0 mathematician said really true quot d...,3,2,35,1,0.028571,0,1,292342,17,17196.587891,8352.628906,0,0
1,0,0^0,It&#x27;s very important to note here that 0^0...,0 0,x27 important note 0 0 1 shorthand truth mathe...,0,2,174,1,0.005747,0,1,1005446,64,15710.09375,5778.425293,1,0
2,0,0^0,A word from Knuth on the matter (warning: PDF)...,0 0,word knuth matter warning pdf http x2f x2f arx...,4,2,19,0,0.0,1,0,94678,7,13525.428711,4983.052734,0,1
3,0,0^0,Students: Let&#x27;s come up with some crazy p...,0 0,student let x27 come crazy proof based individ...,1,2,28,0,0.0,0,0,270892,19,14257.473633,9674.713867,0,0
4,0,0^0,The real problem here is that x^y is a single ...,0 0,real problem x single shorthand refers fundame...,2,2,83,1,0.012048,0,0,345465,27,12795.0,4162.229004,0,0
5,1,The $0.001 DIY iPhone 4 Antenna Fix,Is this a joke that I'm not getting? Scotch ta...,0 001 diy iphone 4 antenna fix,joke getting scotch tape really problem real f...,0,7,23,1,0.043478,0,0,247597,14,17685.5,10765.086914,1,0


### Splitting the dataset

To speed up training we will use 30% of dataset.  
20% we will keep for test.

In [10]:
# Number of texts, taken as the largest text_id plus one.
# NOTE(review): this assumes text_id values start at 0 and have no gaps — confirm.
total_texts = df['text_id'].max() + 1
print(f'Total number of texts: {total_texts}')
print(f'30% of dataset, text_index <= {total_texts*0.3:.0f}')
# The test slice starts strictly above the training cut-off and ends at the 50% mark,
# so the lower bound is printed with "<" (the message previously showed ">").
print(f'20% for test, {total_texts*0.3:.0f} < text_index <= {total_texts*0.5:.0f}')

Total number of texts: 85987
30% of dataset, text_index <= 25796
20% for test, 25796 > text_index <= 42994


In [11]:
# Split by text_id so that all comments of one text land in the same subset:
#   train: text_id <= 25796
#   test : 25796 < text_id <= 42994
text_ids = df['text_id']
df_train = df.loc[text_ids.le(25796)]
df_test = df.loc[text_ids.gt(25796) & text_ids.le(42994)]

### Choosing models

We will try 2 approaches to data that we have:
1. **Feature based approach:**  
  
- We saw with EDA that best and worst comments may be separated by a strong model, e.g. boosting.  
- Let's use 2 models which will be trained for binary classification to identify best and worst comments based on a set of features. 
- With *predict_proba* we will get sorted ranks of comments made by each model.  

2. **Context based approach:**  
  
- Our first approach takes into account only technical parameters of texts and comments.    
- To overview our data completely we should process texts and comments from contextual point of view.    
- Let's use pretrained NLP models from the BERT family to build contextual embeddings.  
- After that, to save time, we will train 2 SLP models (single layer perceptron) for binary classification to identify best and worst comments.
  
**Finally with 4 models we will get votes for the rank of comment.**

### Metric
For measuring the quality we will use the NDCG metric.  
We will compare the NDCG of our approach with the NDCG of a random ranking.

### 1. Boosting

a. Prediction of best comments:

In [12]:
# Columns passed on to the models; text_id is carried along as an identifier
# and is dropped again right before fitting.
feature_cols = [
    'text_id',
    'text_words_qty',
    'comment_words_qty',
    'repeat_words',
    'repeat_rate_words',
    'link',
    'quoted',
    'wr_sum',
    'wr_len',
    'wr_rate',
    'wr_rate_tot',
]

# Target for the "best comment" classifier and the matching feature frame.
y = df_train['lbl_best']
X = df_train[feature_cols]

In [13]:
# 5-fold stratified CV; one CatBoost model is trained per fold and kept in `ctbs`.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=111)

# Columns CatBoost should treat as categorical.
cats = ['link', 'quoted']

# text_id is only an identifier, so it is removed from the model inputs.
# Dropping it once here avoids repeating the same drop for every fold.
X_no_id = X.drop(['text_id'], axis=1)

ctbs = []

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    ctbs.append(CatBoostClassifier(iterations=300, verbose=50, early_stopping_rounds=20))
    # skf.split yields positional row numbers, so rows are selected with .iloc.
    # Label-based .loc only works while the frame's index happens to be 0..n-1.
    ctbs[i].fit(X_no_id.iloc[train_index], y.iloc[train_index],
                # The held-out fold is the eval set used for early stopping.
                eval_set=(X_no_id.iloc[test_index], y.iloc[test_index]),
                cat_features=cats)


Learning rate set to 0.168162
0:	learn: 0.6149203	test: 0.6146569	best: 0.6146569 (0)	total: 171ms	remaining: 51.2s
50:	learn: 0.4551979	test: 0.4579585	best: 0.4579152 (41)	total: 906ms	remaining: 4.42s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.4579152188
bestIteration = 41

Shrink model to first 42 iterations.
Learning rate set to 0.168162
0:	learn: 0.6147086	test: 0.6152830	best: 0.6152830 (0)	total: 13.7ms	remaining: 4.09s
50:	learn: 0.4549483	test: 0.4599954	best: 0.4597638 (31)	total: 755ms	remaining: 3.69s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.4597638211
bestIteration = 31

Shrink model to first 32 iterations.
Learning rate set to 0.168162
0:	learn: 0.6148458	test: 0.6149988	best: 0.6149988 (0)	total: 35.9ms	remaining: 10.7s
50:	learn: 0.4552672	test: 0.4573549	best: 0.4572746 (44)	total: 828ms	remaining: 4.04s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.4572314959
bestIteration = 61

Shrink model to

In [14]:
# Feature importances of the model trained on the first fold (best-comment task).
first_best_model = ctbs[0]
first_best_model.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,comment_words_qty,48.544777
1,wr_sum,12.128198
2,wr_len,11.638581
3,wr_rate_tot,6.277957
4,link,6.069504
5,repeat_rate_words,3.925666
6,repeat_words,3.904905
7,wr_rate,3.896895
8,text_words_qty,2.439242
9,quoted,1.174273


b. Prediction of worst comments:

In [15]:
# Target for the "worst comment" classifier; the feature frame X is reused as-is.
y_w = df_train.loc[:, 'lbl_worst']

In [16]:
# One CatBoost model per CV fold for the worst-comment target, kept in `ctbs_w`.
ctbs_w = []

# text_id is only an identifier, so it is removed from the model inputs.
# Dropping it once here avoids repeating the same drop for every fold.
X_no_id = X.drop(['text_id'], axis=1)

for i, (train_index, test_index) in enumerate(skf.split(X, y_w)):
    ctbs_w.append(CatBoostClassifier(iterations=300, verbose=50, early_stopping_rounds=20))
    # skf.split yields positional row numbers, so rows are selected with .iloc.
    # Label-based .loc only works while the frame's index happens to be 0..n-1.
    ctbs_w[i].fit(X_no_id.iloc[train_index], y_w.iloc[train_index],
                  # The held-out fold is the eval set used for early stopping.
                  eval_set=(X_no_id.iloc[test_index], y_w.iloc[test_index]),
                  cat_features=cats)

Learning rate set to 0.168162
0:	learn: 0.6214494	test: 0.6221212	best: 0.6221212 (0)	total: 17.1ms	remaining: 5.11s
50:	learn: 0.4710223	test: 0.4758597	best: 0.4757286 (33)	total: 795ms	remaining: 3.88s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.4757286064
bestIteration = 33

Shrink model to first 34 iterations.
Learning rate set to 0.168162
0:	learn: 0.6207162	test: 0.6205951	best: 0.6205951 (0)	total: 14.7ms	remaining: 4.38s
50:	learn: 0.4716765	test: 0.4738076	best: 0.4737475 (40)	total: 815ms	remaining: 3.98s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.4737474859
bestIteration = 40

Shrink model to first 41 iterations.
Learning rate set to 0.168162
0:	learn: 0.6207212	test: 0.6205020	best: 0.6205020 (0)	total: 13.7ms	remaining: 4.09s
50:	learn: 0.4715649	test: 0.4743990	best: 0.4741823 (31)	total: 818ms	remaining: 4s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.4741822845
bestIteration = 31

Shrink model to f

In [17]:
# Feature importances of the model trained on the first fold (worst-comment task).
first_worst_model = ctbs_w[0]
first_worst_model.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,comment_words_qty,50.892024
1,wr_len,12.651188
2,wr_sum,9.999493
3,link,6.607928
4,wr_rate_tot,4.705328
5,repeat_words,3.71155
6,wr_rate,3.637839
7,repeat_rate_words,3.408673
8,text_words_qty,2.682051
9,quoted,1.703927


In [19]:
# Persist both per-fold model lists (best-comment models first, then worst-comment).
for out_path, fold_models in (
    ('data/ML_model_best.pickle', ctbs),
    ('data/ML_model_worst.pickle', ctbs_w),
):
    with open(out_path, 'wb') as pkl:
        pickle.dump(fold_models, pkl)