In [2]:
!pip install pyspellchecker



In [18]:
import pandas as pd
import numpy as np
import string 
import json
import re
from spellchecker import SpellChecker
from nltk.corpus import stopwords as nltk_stopwords

In [19]:
# Load the training set: one JSON document per line.
# The file is opened through a context manager so the handle is always
# closed, and decoded explicitly as UTF-8 instead of the platform default
# (the data contains non-ASCII characters such as typographic quotes).
with open("ranking_train.jsonl", "r", encoding="utf-8") as train_file:
    posts = [json.loads(line) for line in train_file]

# Each post is turned into its own small frame (from_dict, orient='columns').
all_comments = [pd.DataFrame.from_dict(post, orient='columns') for post in posts]

# Stack the per-post frames under a fresh 0..n-1 index and preview a random sample.
df = pd.concat(all_comments, ignore_index=True)
df.sample(10)

Unnamed: 0,text,comments
406967,‘Rice Theory’ Explains North-South China Cultu...,{'text': 'about the same type of difference is...
72053,"Piet is a programming language, whose programs...",{'text': 'This is really cool. Lots of possib...
121893,I solved the embedded Chrome OS ad equation an...,{'text': 'From the article:&#62; most people I...
244275,JQuery 1.8 Released,"{'text': 'Hey dmethvin,You guys are doing awes..."
64601,"UCSF Endocrinologist debunks ""Calories In / Ca...",{'text': 'I saw this literally after just fini...
163592,CSS3 Isometric text,{'text': 'I like it how the blue text selectio...
377059,TrueCrypt's Plausible Deniability is Theoretic...,{'text': 'I think the author made a mistake. ...
77898,"Ask HN: Review my project, The Bus Ride",{'text': 'I like the idea and the design is gr...
310175,The future of creative work online - Welcome t...,{'text': 'The founders may want to drink their...
366512,Was Cantor Wrong? Are the real numbers countable?,{'text': 'In your set of positive even numbers...


In [20]:
# Keep a handle on the raw per-comment dicts before the column is overwritten
comment_records = df['comments']
# Comment score (the target variable)
df['score'] = comment_records.map(lambda record: record['score'])
# Replace each dict with just the comment text
df['comments'] = comment_records.map(lambda record: record['text'])

In [21]:
# Number of whitespace-separated words in the comment
df['words_num'] = df['comments'].map(lambda comment: len(comment.split()))

In [22]:
def clear_cods(s):
    """Replace HTML numeric character references and non-breaking spaces.

    Every numeric character reference, decimal (``&#39;``) or hexadecimal
    (``&#x27;``), is replaced by a single apostrophe, and every non-breaking
    space (U+00A0) by a regular space.

    Args:
        s: Raw comment text.

    Returns:
        The text with the substitutions applied.
    """
    # The pattern spans exactly one "&#...;" token. A greedy "any non-space
    # run up to the last semicolon" pattern would also swallow the text
    # between two references that sit in the same whitespace-free token.
    s = re.sub(r"&#(?:[xX][0-9A-Fa-f]+|[0-9]+);", "'", s)
    s = s.replace("\xa0", " ")
    return s

In [23]:
# Spelling-mistake feature: tokenizer that prepares a comment for the spell checker
def clear_text(s):
    """Reduce a comment to bare word tokens for spell checking.

    Steps, in order:
      1. drop URLs starting with ``http``;
      2. drop bare ``something.com`` addresses together with a trailing path;
      3. replace HTML numeric character references with an apostrophe;
      4. replace non-breaking spaces with regular spaces;
      5. blank out everything except ASCII letters, spaces and apostrophes;
      6. split on whitespace and trim apostrophes from the token edges.

    Args:
        s: Raw comment text.

    Returns:
        list[str]: Word tokens. Tokens that consist only of apostrophes are
        dropped.
    """
    s = re.sub(r"http\S+", "", s)
    # The dot is escaped on purpose: an unescaped "." also matches a letter or
    # a space, which mangles ordinary text ("welcome" -> "e",
    # "a computer" -> "puter").
    s = re.sub(r"\S+\.com\b(?:/\S*)?", "", s)
    # Match exactly one "&#...;" reference so that text between two
    # references in the same token is preserved.
    s = re.sub(r"&#(?:[xX][0-9A-Fa-f]+|[0-9]+);", "'", s)
    s = s.replace("\xa0", " ")
    s = re.sub(r"[^A-Za-z ']", " ", s)
    # Quotes decoded from references would otherwise stay glued to the word
    # ("'quoted'") and make a correctly spelled word look unknown.
    trimmed = (word.strip("'") for word in s.split())
    return [word for word in trimmed if word]

# A single SpellChecker instance is shared by all comments
spell = SpellChecker()


def _unknown_word_count(comment):
    """Return the size of the collection that spell.unknown() reports for the comment's tokens.

    NOTE(review): spell.unknown() presumably returns a set, in which case a
    repeated misspelling is counted once — confirm against pyspellchecker.
    """
    return len(spell.unknown(clear_text(comment)))


df['mistakes_num'] = df['comments'].apply(_unknown_word_count)

In [24]:
# Number of characters in the comment after clear_cods has processed it
df['letters_num'] = df['comments'].map(lambda comment: len(clear_cods(comment)))

In [25]:
# Stop-word count and its share (in percent) of all words in the comment
stopwords = set(nltk_stopwords.words('english'))


def _stopword_hits(comment):
    """Count lower-cased, whitespace-separated tokens that are in the stop-word set."""
    return sum(1 for token in str(comment).lower().split() if token in stopwords)


df['stopwords_num'] = df['comments'].apply(_stopword_hits)
df['stopwords_percent_num'] = df['stopwords_num'].mul(100).div(df['words_num'])

In [26]:
# Number of whitespace-separated tokens for which str.isupper() is true (all-caps words)
df["upper_words_num"] = df['comments'].map(
    lambda comment: sum(1 for token in str(comment).split() if token.isupper())
)

In [27]:
# Number of distinct words and their share (in percent) of all words
df['unique_num'] = df['comments'].map(lambda comment: len({*str(comment).split()}))
df['unique_percent_num'] = df['unique_num'].mul(100).div(df['words_num'])

In [28]:
# Mean length of the whitespace-separated words in the comment
df["words_mean_len"] = df['comments'].map(
    lambda comment: np.mean([len(token) for token in str(comment).split()])
)

In [29]:
# Number of characters that belong to string.punctuation
df["punct_num"] = df['comments'].map(
    lambda comment: sum(1 for ch in str(comment) if ch in string.punctuation)
)

In [30]:
# Number of whitespace-separated tokens for which str.istitle() is true (capitalised words)
df["first_capital_num"] = df['comments'].map(
    lambda comment: sum(1 for token in str(comment).split() if token.istitle())
)

In [31]:
# Number of links in the comment: every run of non-space characters starting with "http"
url_pattern = re.compile(r'http\S+')
df['url_num'] = df['comments'].map(lambda comment: len(url_pattern.findall(comment)))

# Добавление оценки позитивности комментария

In [32]:
# Load the precomputed comment-positivity features; the first CSV column is used as the index.
positive = pd.read_csv('pos_neg_features', index_col=0)

In [33]:
# Attach the 'positive' column. Series assignment aligns on index labels, so
# this presumably relies on the CSV index matching df's 0..n-1 index — TODO confirm.
df['positive'] = positive['positive']

# Добавление косинусного расстояния между постом и комментарием

In [34]:
# Read the precomputed post/comment cosine values: one float per line of the file
with open('cos_sim.txt', 'r') as sim_file:
    cos_sim = np.array([float(row.replace('\n', '')) for row in sim_file])

In [35]:
# Attach the cosine values positionally (cos_sim is a plain NumPy array, so no index alignment takes place).
df['cos_sim'] = cos_sim

In [38]:
# Overview of the assembled feature table: columns, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440535 entries, 0 to 440534
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   text                   440535 non-null  object 
 1   comments               440535 non-null  object 
 2   score                  440535 non-null  int64  
 3   words_num              440535 non-null  int64  
 4   mistakes_num           440535 non-null  int64  
 5   letters_num            440535 non-null  int64  
 6   stopwords_num          440535 non-null  int64  
 7   stopwords_percent_num  440535 non-null  float64
 8   upper_words_num        440535 non-null  int64  
 9   unique_num             440535 non-null  int64  
 10  unique_percent_num     440535 non-null  float64
 11  words_mean_len         440535 non-null  float64
 12  punct_num              440535 non-null  int64  
 13  first_capital_num      440535 non-null  int64  
 14  url_num                440535 non-nu

In [39]:
# Persist the feature table as CSV; index=False leaves the row index out of the file.
df.to_csv('dataset_train.csv', index=False)