In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
# Display settings: show up to 128 characters per cell and use wide figures by default
pd.options.display.max_colwidth = 128
sns.set(rc={'figure.figsize': (16, 6)})

In [3]:
# Load initial Dataset: one row per (article text, comment) pair

# Web link for data https://www.kaggle.com/datasets/talymeal/vk-ds-cup
df_json = pd.read_json('data/ranking_train.jsonl', lines=True)

# Each article carries a list of comment records; give every record its own row
df_raw = df_json.explode('comments').reset_index(drop=True)

# Expand the comment records into columns; their 'text' field would clash with the article's 'text'
df_comments = pd.json_normalize(df_raw['comments']).rename(columns={'text': 'comment'})

# Both frames share the same 0..n-1 index, so a column-wise concat lines the rows up
df_train = pd.concat([df_raw.drop(columns='comments'), df_comments], axis=1)


In [4]:
# The loaded file has one row per article, so count articles there instead of
# assuming that every article contributes exactly 5 comment rows.
print(f'Total number of articles: {df_json.shape[0]}')
print(f'Total number of comments: {df_train.shape[0]}')

Total number of articles: 88107
Total number of comments: 440535


In [5]:
# Check missing data: number of NaN values in every column

df_train.isna().sum()

text       0
comment    0
score      0
dtype: int64

In [6]:
# Check duplicates in data: number of rows that repeat an earlier row in every column

df_train.duplicated().sum()

0

In [7]:
# Preview the flattened frame: the article text is repeated on each of its comment rows
df_train.head(11)

Unnamed: 0,text,comment,score
0,How many summer Y Combinator fundees decided not to continue with their startup and go back to school? and what were the rea...,Going back to school is not identical with giving up. Some founders go back to school and keep working on the startup while ...,0
1,How many summer Y Combinator fundees decided not to continue with their startup and go back to school? and what were the rea...,"There will invariably be those who don't see the success they set out for, and they fall back to their original path. That's...",1
2,How many summer Y Combinator fundees decided not to continue with their startup and go back to school? and what were the rea...,"For me school is a way to be connected to what is going on in the ""real world"". I entered school thinking it is EITHER schoo...",2
3,How many summer Y Combinator fundees decided not to continue with their startup and go back to school? and what were the rea...,I guess it really depends on how hungry you are and how much you believe in your product. I'm only 24 and still in school as...,3
4,How many summer Y Combinator fundees decided not to continue with their startup and go back to school? and what were the rea...,I know pollground decided to go back to school after getting y combinator funding,4
5,CBS acquires last.fm for $280m,"It will be curious to see where this heads in the long run. CBS is on a tear but will it fit their image, will they try and...",0
6,CBS acquires last.fm for $280m,Does this mean that there's now a big-name company who will fight for the repeal of the recent streaming-music royalty hike?,1
7,CBS acquires last.fm for $280m,Also on BBC News: http://news.bbc.co.uk/1/low/technology/6701863.stm .Nice to see a London-based co. hit the headlines.,2
8,CBS acquires last.fm for $280m,I don't understand what they do that is worth $70M a year.,3
9,CBS acquires last.fm for $280m,"sold out too cheaply. given their leadership position, they should have ask for at least $500m",4


In [8]:
def process_text(text, stop_words, lemmatizer):
    """
    Normalise a raw text for the word-level features.

    Steps, in order:
        1. remove anything that looks like an HTML tag (``<...>``);
        2. replace every run of characters other than ASCII letters and digits
           with a single space;
        3. lower-case the result;
        4. tokenize with ``nltk.word_tokenize``;
        5. drop the tokens found in ``stop_words``;
        6. lemmatize the remaining tokens with ``lemmatizer.lemmatize``.

    Args:
        text: raw string to clean.
        stop_words: collection of lower-case words to remove.
        lemmatizer: object exposing ``lemmatize(word)``.

    Returns:
        The cleaned tokens joined by single spaces ('' when nothing is left).

    Note:
        HTML entities are not decoded, so ``&quot;`` and ``&#x27;`` survive as the
        tokens ``quot`` and ``x27``. Later cells look for ``quot`` and ``http``
        in the cleaned text, so this behaviour is kept as is.
    """
    # A set gives constant-time membership tests; the caller passes a list, which
    # would otherwise be scanned in full for every single token.
    if isinstance(stop_words, (set, frozenset)):
        stop_set = stop_words
    else:
        stop_set = set(stop_words)

    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text).lower()

    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(text)
        if token not in stop_set
    )

In [9]:
# Lemmatization preprocessing of text: fetch the NLTK resources, then clean both text columns

for resource in ('wordnet', 'omw-1.4', 'stopwords', 'punkt'):
    nltk.download(resource)

stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

# Raw column -> cleaned column; the extra arguments are forwarded to process_text
for raw_col, clean_col in (('text', 'prp_text'), ('comment', 'prp_com')):
    df_train[clean_col] = df_train[raw_col].apply(process_text, args=(stop_words, lemmatizer))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alex_\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\alex_\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alex_\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alex_\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
# Give every distinct preprocessed text an id and order the rows by that id
df_train = (
    df_train
    .assign(text_id=df_train.groupby('prp_text').ngroup())
    .sort_values('text_id')
)

# clean empty texts from dataset
has_text = df_train['prp_text'] != ''
column_order = ['text_id', 'text', 'comment', 'prp_text', 'prp_com', 'score']

# Renumber the rows from 0 and put text_id first
df_train = df_train[has_text].reset_index(drop=True)[column_order]


In [11]:
# Number of comment rows per text_id - we see more than 5 comments to one text.
# text_id comes from grouping on `prp_text`, so articles whose preprocessed text is
# identical share one id; presumably that is where the larger counts come from.
df_train['text_id'].value_counts()

35970    85
31279    85
36297    55
5618     55
9913     45
         ..
29147     5
29146     5
29145     5
29144     5
86926     5
Name: text_id, Length: 86926, dtype: int64

In [12]:
# Delete texts with more than 5 comments:
# attach each row's group size to it and keep only rows from groups of at most 5
comments_per_text = df_train.groupby('text_id')['text_id'].transform('size')
df_train = df_train[comments_per_text <= 5]

In [19]:
# Rebuilding indexes: renumber text_id after the removals, re-sort and reset the row index
df_train = (
    df_train
    .assign(text_id=df_train.groupby('prp_text').ngroup())
    .sort_values('text_id')
    .reset_index(drop=True)
)

In [20]:
# Preview the frame after text_id has been rebuilt.
# NOTE(review): the saved output of this cell already lists text_words_qty,
# comment_words_qty, repeat_words and repeat_rate_words, which are only created in
# later cells, and the execution counter jumps from 12 to 19. The cell was run out
# of order; refresh it with Restart Kernel -> Run All.
df_train.head(11)

Unnamed: 0,text_id,text,comment,prp_text,prp_com,score,text_words_qty,comment_words_qty,repeat_words,repeat_rate_words
0,0,0^0,"- &quot;0^0. Why? Because mathematicians said so. No really, it’s true.&quot;- [Detailed explanation of the tradeoffs involv...",0 0,quot 0 0 mathematician said really true quot detailed explanation tradeoff involved choosing different definition exponentia...,3,2,35,1,0.028571
1,0,0^0,It&#x27;s very important to note here that 0^0=1 is a shorthand and not a truth.Mathematicians are absolutely not stating th...,0 0,x27 important note 0 0 1 shorthand truth mathematician absolutely stating proven true 0 0 1 definition claim equality x27 sa...,0,2,174,1,0.005747
2,0,0^0,A word from Knuth on the matter (warning: PDF):http:&#x2F;&#x2F;arxiv.org&#x2F;pdf&#x2F;math&#x2F;9205211v1.pdfSee page 6.,0 0,word knuth matter warning pdf http x2f x2f arxiv org x2f pdf x2f math x2f 9205211v1 pdfsee page 6,4,2,19,0,0.0
3,0,0^0,Students: Let&#x27;s come up with some crazy proofs based on our individual levels of understanding.Teachers: Let&#x27;s do ...,0 0,student let x27 come crazy proof based individual level understanding teacher let x27 book come somehow conflicting answer m...,1,2,28,0,0.0
4,0,0^0,The real problem here is that x^y is a single shorthand which refers to a few fundamentally different mathematical concepts ...,0 0,real problem x single shorthand refers fundamentally different mathematical concept happen significant overlap first refers ...,2,2,83,1,0.012048
5,1,The $0.001 DIY iPhone 4 Antenna Fix,"Is this a joke that I'm not getting? Scotch tape? Really? ""This problem isn't real, but here's how you fix it.""He also compl...",0 001 diy iphone 4 antenna fix,joke getting scotch tape really problem real fix also completely dismisses public perception everyone forget toyota mess qui...,0,7,23,1,0.043478
6,1,The $0.001 DIY iPhone 4 Antenna Fix,"Could you imagine buying a brand new Mercedes CL550, finding out that if you grip the steering wheel with your left hand it ...",0 001 diy iphone 4 antenna fix,could imagine buying brand new mercedes cl550 finding grip steering wheel left hand shut car mercedes insist problem buy ste...,1,7,84,1,0.011905
7,1,The $0.001 DIY iPhone 4 Antenna Fix,"Am I the only one confused by the combination of ""this happens to every phone"" with fixes, including both this and the offic...",0 001 diy iphone 4 antenna fix,one confused combination happens every phone fix including official one workarounds prevent shorting exposed antenna phone,2,7,16,2,0.125
8,1,The $0.001 DIY iPhone 4 Antenna Fix,"Like my idol, Léon Theremin, I will use this ""defect"" to develop a special app that allows the user to play music by strokin...",0 001 diy iphone 4 antenna fix,like idol l theremin use defect develop special app allows user play music stroking caressing iphone various sensitive radio...,3,7,26,1,0.038462
9,1,The $0.001 DIY iPhone 4 Antenna Fix,"I've always held my 2G at the top of the phone, primarily to avoid attenuating the wi-fi noticeably, but also to reduce the ...",0 001 diy iphone 4 antenna fix,always held 2g top phone primarily avoid attenuating wi fi noticeably also reduce energy absorbed hand carrying iphone 4 0 a...,4,7,40,3,0.075


### Features

In [21]:
# Word counts of the cleaned strings, taken as "number of spaces + 1".
# Note that an empty string therefore counts as 1 word, not 0.
for clean_col, qty_col in (('prp_text', 'text_words_qty'), ('prp_com', 'comment_words_qty')):
    df_train[qty_col] = df_train[clean_col].str.count(' ') + 1

In [22]:
def common_words_qty(text, comment):
    """Count the distinct words that occur in both ``text`` and ``comment``.

    Both arguments are whitespace-separated strings. A word is counted once no
    matter how often it is repeated in either string.

    Args:
        text: preprocessed article text.
        comment: preprocessed comment.

    Returns:
        Number of distinct shared words; 0 when either string has no words.
    """
    # split() without an argument never yields '' tokens, so empty strings or
    # doubled spaces cannot show up as a bogus "shared word" (split(' ') did that).
    return len(set(text.split()) & set(comment.split()))

In [23]:
# Number of distinct words shared by the text and its comment
shared_counts = [
    common_words_qty(text, comment)
    for text, comment in zip(df_train['prp_text'], df_train['prp_com'])
]
df_train['repeat_words'] = pd.Series(shared_counts, index=df_train.index).astype('uint8')

# Rate of number of repeated words to the length of comment
df_train['repeat_rate_words'] = (
    df_train['repeat_words'] / df_train['comment_words_qty']
).astype('float32')

In [24]:
# Make a list of all words and count a number of usage

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold

def preprocess_text(text):
    """Return ``text`` with every run of digits removed (e.g. 'x2f' becomes 'xf')."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)

def get_vocab(df, n_splits=10, min_count=100):
    """Build a word -> usage-count table from the ``prp_com`` column of ``df``.

    The rows are processed in ``n_splits`` consecutive chunks (the test parts of
    an unshuffled KFold). Within a chunk only words used more than ``min_count``
    times are kept, and the kept per-chunk counts are then added up. A word's
    total therefore leaves out the chunks in which it stayed at or below the
    threshold.

    Args:
        df: frame with a ``prp_com`` column of preprocessed comment strings.
        n_splits: number of chunks the rows are divided into.
        min_count: a word must occur more than this many times in a chunk for
            that chunk's count to be included.

    Returns:
        Integer Series indexed by word, sorted by total count in descending order.
    """
    count_vectorizer = CountVectorizer(
        ngram_range=(1, 1),
        stop_words='english',
        preprocessor=preprocess_text,
    )

    kf = KFold(n_splits=n_splits, shuffle=False)

    fold_counts = []
    for fold, (_, test_index) in enumerate(kf.split(df)):
        # KFold yields positional indices, so rows are picked with .iloc;
        # .loc would only work while the frame has a plain 0..n-1 index.
        count_data = count_vectorizer.fit_transform(df.iloc[test_index]['prp_com'])

        # Sum the columns of the sparse matrix directly instead of converting the
        # whole documents x vocabulary matrix to a dense array first.
        totals = np.asarray(count_data.sum(axis=0)).ravel()
        counts = pd.Series(totals, index=count_vectorizer.get_feature_names_out(), name=fold)

        fold_counts.append(counts[counts > min_count])

    # One column per chunk; a word missing from a chunk contributes 0 there.
    # Sorting the index keeps the row order (and so the order of tied counts in
    # the final sort) the same as the index-sorted outer merges used previously.
    words_rate = pd.concat(fold_counts, axis=1).fillna(0).sort_index()

    return words_rate.sum(axis=1).sort_values(ascending=False).astype('int')

In [25]:
# Build the comment vocabulary: a Series mapping each kept word to its usage count
words_rate = get_vocab(df_train)

In [26]:
# Adding features of link inside comment and if there was quotation in comment.
# Each flag is 1 when the cleaned comment contains the marker, otherwise 0.
# NOTE(review): this is a substring test, so 'quot' also matches words such as
# 'quote' or 'quota' - confirm that is intended.
for flag_col, marker in (('link', 'http'), ('quoted', 'quot')):
    df_train[flag_col] = np.where(df_train['prp_com'].str.contains(marker), 1, 0)

In [27]:
# Delete garbage words in our list (leftovers of URLs and HTML entities).
# errors='ignore' keeps this cell re-runnable: without it a second run, or a
# vocabulary that lacks one of these words, raises KeyError.
words_rate = words_rate.drop(['xf', 'http', 'quot', 'com', 'www'], errors='ignore')

In [28]:
# Vocabulary-frequency features of every comment:
#   wr_sum      - summed usage counts of the distinct vocabulary words in the comment
#   wr_len      - how many distinct vocabulary words the comment contains
#   wr_rate     - average usage count per matched word (NaN when nothing matched)
#   wr_rate_tot - wr_sum relative to the comment's word count
# A plain dict lookup per token replaces scanning the whole vocabulary index twice
# for every row; the resulting values are the same.
word_freq = {word: int(count) for word, count in words_rate.items()}

wr_sum, wr_len = [], []
for comment in df_train['prp_com']:
    # set() so that a repeated word is counted once, as with index.isin()
    hits = [word_freq[word] for word in set(comment.split(' ')) if word in word_freq]
    wr_sum.append(sum(hits))
    wr_len.append(len(hits))

df_train['wr_sum'] = wr_sum
df_train['wr_len'] = wr_len

df_train['wr_rate'] = df_train['wr_sum'] / df_train['wr_len']
df_train['wr_rate_tot'] = df_train['wr_sum'] / df_train['comment_words_qty']

In [29]:
# Preview the frame with the link/quoted flags and the wr_* vocabulary features added
df_train.head(11)

Unnamed: 0,text_id,text,comment,prp_text,prp_com,score,text_words_qty,comment_words_qty,repeat_words,repeat_rate_words,link,quoted,wr_sum,wr_len,wr_rate,wr_rate_tot
0,0,0^0,"- &quot;0^0. Why? Because mathematicians said so. No really, it’s true.&quot;- [Detailed explanation of the tradeoffs involv...",0 0,quot 0 0 mathematician said really true quot detailed explanation tradeoff involved choosing different definition exponentia...,3,2,35,1,0.028571,0,1,292342,17,17196.588235,8352.628571
1,0,0^0,It&#x27;s very important to note here that 0^0=1 is a shorthand and not a truth.Mathematicians are absolutely not stating th...,0 0,x27 important note 0 0 1 shorthand truth mathematician absolutely stating proven true 0 0 1 definition claim equality x27 sa...,0,2,174,1,0.005747,0,1,1005446,64,15710.09375,5778.425287
2,0,0^0,A word from Knuth on the matter (warning: PDF):http:&#x2F;&#x2F;arxiv.org&#x2F;pdf&#x2F;math&#x2F;9205211v1.pdfSee page 6.,0 0,word knuth matter warning pdf http x2f x2f arxiv org x2f pdf x2f math x2f 9205211v1 pdfsee page 6,4,2,19,0,0.0,1,0,94678,7,13525.428571,4983.052632
3,0,0^0,Students: Let&#x27;s come up with some crazy proofs based on our individual levels of understanding.Teachers: Let&#x27;s do ...,0 0,student let x27 come crazy proof based individual level understanding teacher let x27 book come somehow conflicting answer m...,1,2,28,0,0.0,0,0,270892,19,14257.473684,9674.714286
4,0,0^0,The real problem here is that x^y is a single shorthand which refers to a few fundamentally different mathematical concepts ...,0 0,real problem x single shorthand refers fundamentally different mathematical concept happen significant overlap first refers ...,2,2,83,1,0.012048,0,0,345465,27,12795.0,4162.228916
5,1,The $0.001 DIY iPhone 4 Antenna Fix,"Is this a joke that I'm not getting? Scotch tape? Really? ""This problem isn't real, but here's how you fix it.""He also compl...",0 001 diy iphone 4 antenna fix,joke getting scotch tape really problem real fix also completely dismisses public perception everyone forget toyota mess qui...,0,7,23,1,0.043478,0,0,247597,14,17685.5,10765.086957
6,1,The $0.001 DIY iPhone 4 Antenna Fix,"Could you imagine buying a brand new Mercedes CL550, finding out that if you grip the steering wheel with your left hand it ...",0 001 diy iphone 4 antenna fix,could imagine buying brand new mercedes cl550 finding grip steering wheel left hand shut car mercedes insist problem buy ste...,1,7,84,1,0.011905,0,0,716136,43,16654.325581,8525.428571
7,1,The $0.001 DIY iPhone 4 Antenna Fix,"Am I the only one confused by the combination of ""this happens to every phone"" with fixes, including both this and the offic...",0 001 diy iphone 4 antenna fix,one confused combination happens every phone fix including official one workarounds prevent shorting exposed antenna phone,2,7,16,2,0.125,0,0,44388,9,4932.0,2774.25
8,1,The $0.001 DIY iPhone 4 Antenna Fix,"Like my idol, Léon Theremin, I will use this ""defect"" to develop a special app that allows the user to play music by strokin...",0 001 diy iphone 4 antenna fix,like idol l theremin use defect develop special app allows user play music stroking caressing iphone various sensitive radio...,3,7,26,1,0.038462,0,0,351711,15,23447.4,13527.346154
9,1,The $0.001 DIY iPhone 4 Antenna Fix,"I've always held my 2G at the top of the phone, primarily to avoid attenuating the wi-fi noticeably, but also to reduce the ...",0 001 diy iphone 4 antenna fix,always held 2g top phone primarily avoid attenuating wi fi noticeably also reduce energy absorbed hand carrying iphone 4 0 a...,4,7,40,3,0.075,0,0,361734,25,14469.36,9043.35


In [30]:
# Column dtypes and non-null counts before the dtype conversion.
# wr_rate has missing values: it is wr_sum / wr_len, which is NaN when wr_len is 0.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 429935 entries, 0 to 429934
Data columns (total 16 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   text_id            429935 non-null  int64  
 1   text               429935 non-null  object 
 2   comment            429935 non-null  object 
 3   prp_text           429935 non-null  object 
 4   prp_com            429935 non-null  object 
 5   score              429935 non-null  int64  
 6   text_words_qty     429935 non-null  int64  
 7   comment_words_qty  429935 non-null  int64  
 8   repeat_words       429935 non-null  uint8  
 9   repeat_rate_words  429935 non-null  float32
 10  link               429935 non-null  int32  
 11  quoted             429935 non-null  int32  
 12  wr_sum             429935 non-null  int64  
 13  wr_len             429935 non-null  int64  
 14  wr_rate            428416 non-null  float64
 15  wr_rate_tot        429935 non-null  float64
dtypes:

In [32]:
# Let klib pick more compact dtypes for the columns before the frame is saved
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
import klib
df_train = klib.convert_datatypes(df_train)

In [33]:
# Check the column dtypes again after the conversion
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 429935 entries, 0 to 429934
Data columns (total 16 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   text_id            429935 non-null  int32  
 1   text               429935 non-null  string 
 2   comment            429935 non-null  string 
 3   prp_text           429935 non-null  string 
 4   prp_com            429935 non-null  string 
 5   score              429935 non-null  int8   
 6   text_words_qty     429935 non-null  int8   
 7   comment_words_qty  429935 non-null  int16  
 8   repeat_words       429935 non-null  uint8  
 9   repeat_rate_words  429935 non-null  float32
 10  link               429935 non-null  int32  
 11  quoted             429935 non-null  int32  
 12  wr_sum             429935 non-null  int32  
 13  wr_len             429935 non-null  int16  
 14  wr_rate            428416 non-null  float32
 15  wr_rate_tot        429935 non-null  float32
dtypes:

In [34]:
# Save the preprocessed frame with all engineered features.
# NOTE(review): pickle files should only ever be loaded from trusted sources.
df_train.to_pickle("data/prp_df.pkl")