In [None]:
import tqdm
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from ast import literal_eval
import itertools

import stanza
import spacy_stanza
# Download the Russian Stanza models (requires network access on the first run).
stanza.download("ru")

# Build a spaCy pipeline backed by the Russian Stanza models.
# NOTE(review): `nlp` is not referenced in the visible part of this notebook — confirm it is still needed.
nlp = spacy_stanza.load_pipeline("ru")

from dostoevsky.tokenization import RegexTokenizer
from dostoevsky.models import FastTextSocialNetworkModel


In [None]:
from nltk.corpus import stopwords

# Fetch the corpora BEFORE reading them: on a fresh environment
# `stopwords.words(...)` raises LookupError until the data is downloaded.
nltk.download('wordnet')
nltk.download('stopwords')

# Rebinds `stopwords` from the NLTK corpus reader to a plain set of Russian
# stop words (this is the object `lemmatize` checks membership against).
stopwords = set(stopwords.words('russian'))


In [None]:
import pymorphy2
# Morphological analyzer; `lemmatize` uses it to obtain the normal form of each word.
morph = pymorphy2.MorphAnalyzer()

# Создание стартовых признаков

In [None]:
# Load both splits; the first CSV column becomes the index of each frame.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("full_train.csv", "full_test.csv")
)

In [None]:
# Stack the raw categorical/text columns of train and test so they are encoded together.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` gives the same result.
base_cols = ["title", "category", "authors", "tags", "publish_date"]
cat_cols = pd.concat([train_df[base_cols], test_df[base_cols]])

In [None]:
# Replace each categorical column with its integer category code
# (pandas assigns -1 to missing values).
for col in ("category", "authors", "tags"):
    cat_cols[col] = cat_cols[col].astype('category').cat.codes.astype('int')

# Parse the timestamp once instead of once per derived column.
publish_ts = pd.to_datetime(cat_cols['publish_date'])
cat_cols['day'] = publish_ts.dt.day.astype(int)
# NOTE: the column name 'mounth' (sic) is kept as-is because it ends up in the saved CSVs.
cat_cols['mounth'] = publish_ts.dt.month.astype(int)
cat_cols['hour'] = publish_ts.dt.hour.astype(int)
cat_cols['minute'] = publish_ts.dt.minute.astype(int)


## Начало и конец месяца

In [None]:
# 1.0 for publications in the first four days of the month, else 0.0.
start = [1.0 if day < 5 else 0.0 for day in cat_cols["day"]]
# 1.0 for publications on the 28th or later, else 0.0.
end = [1.0 if day > 27 else 0.0 for day in cat_cols["day"]]

In [None]:
# Attach the month-boundary flags; both lists were built by iterating cat_cols["day"],
# so their order matches the row order of cat_cols.
cat_cols["month_end"] = end
cat_cols["month_start"] = start

## Время публикации новости: прайм-тайм (после 19:00) и начало дня (до 11:00)

In [None]:
# Prime time: published at 19:00 or later. The section header defines prime time as
# "after 19:00"; the previous `hour > 19` test silently excluded the whole 19:xx hour.
prime = [1.0 if hour >= 19 else 0.0 for hour in cat_cols["hour"]]
# Morning: published before 11:00.
morning = [1.0 if hour < 11 else 0.0 for hour in cat_cols["hour"]]

In [None]:
# Attach the time-of-day flags; both lists were built by iterating cat_cols["hour"],
# so their order matches the row order of cat_cols.
cat_cols["prime_time"] = prime
cat_cols["morning"] = morning

## Лемматизация

In [None]:
# Titles of all train+test rows as a plain Python list (input for text_cleaner).
list_corpus = cat_cols["title"].tolist()

In [None]:
import string,re
from tqdm import tqdm
def text_cleaner(list_corpus):
    """Strip trailing blocks and ASCII punctuation from each title.

    For every string, everything from the first run of five whitespace
    characters onward (up to the end of that line) is removed, then every
    character in ``string.punctuation`` is replaced by a space.

    The original code called ``str.translate(string.punctuation)``, which uses
    the punctuation string as a lookup table indexed by code point: no
    punctuation was removed and control characters were remapped
    (e.g. a newline became '+'). A proper translation table is built here.

    Parameters
    ----------
    list_corpus : iterable of str
        Raw titles.

    Returns
    -------
    list of str
        Cleaned titles, in the same order as the input.
    """
    # Compiled once; raw string avoids the invalid-escape warning for '\s'.
    trailing_block = re.compile(r'\s{5}.+')
    # Map punctuation to spaces (not deletion) so hyphenated words split into
    # separate tokens, matching the regex cleaning used elsewhere in this notebook.
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    results = []
    for sentence in tqdm(list_corpus):
        results.append(trailing_block.sub('', sentence).translate(punct_to_space))
    return results
clean_list_corpus = text_cleaner(list_corpus)

In [None]:
def lemmatize(text):
    """Return `text` with stop words removed and every remaining word lemmatized.

    The text is split on whitespace; one-character tokens and stop words are
    dropped, and each remaining word is replaced by the normal form of its
    first pymorphy2 parse. The lemmas are joined back with single spaces.

    Parameters
    ----------
    text : str
        Whitespace-separated text.

    Returns
    -------
    str
        Space-joined lemmas (empty string if nothing survives the filter).
    """
    lemmas = []
    for word in text.split():
        # The NLTK stop-word entries are lower-case, so compare case-insensitively;
        # otherwise capitalised stop words at the start of a title slip through.
        if len(word) > 1 and word.lower() not in stopwords:
            lemmas.append(morph.parse(word)[0].normal_form)
    return " ".join(lemmas)

In [None]:
# Lemmatize every cleaned title, preserving order.
lemmatized_corpus = list(map(lemmatize, clean_list_corpus))

In [None]:
# Single-column frame of lemmatized titles, labelled with cat_cols' index.
lem_df = pd.DataFrame(lemmatized_corpus, columns=["lem_title"], index=cat_cols.index)

In [None]:
# lem_df was built row-for-row from cat_cols, so attach it positionally.
# An index join gives the same result when index labels are unique, but would
# multiply rows if train and test share labels, and would create
# lem_title_x / lem_title_y columns if this cell were re-run.
cat_cols = cat_cols.assign(lem_title=lem_df["lem_title"].to_numpy())

# Сохранение данных

In [None]:
# Remove the raw columns that were copied into cat_cols; they are re-attached
# (in encoded form) when cat_cols is merged back.
raw_feature_cols = ["title", "authors", "category", "tags", "publish_date"]
train_df, test_df = (
    frame.drop(raw_feature_cols, axis=1) for frame in (train_df, test_df)
)

In [None]:
# Inner-join each split with the engineered columns on the index.
# NOTE(review): this relies on index labels being unique across train and test — confirm.
new_train_df = train_df.merge(cat_cols, left_index=True, right_index=True)
new_test_df = test_df.merge(cat_cols, left_index=True, right_index=True)

In [None]:
# Persist the frames with the starting features; later cells read these files back.
new_test_df.to_csv("new_test.csv")
new_train_df.to_csv("new_train.csv")

### Кодирование текстовых признаков

In [None]:
# Reload the frames saved above.
# NOTE(review): train_df/test_df are rebound to the raw dataset files two cells below
# before being read, so this load appears to be redundant — confirm and remove.
train_df = pd.read_csv("new_train.csv", index_col=0)
test_df = pd.read_csv("new_test.csv", index_col=0)

In [None]:
# Accumulator frames: every encoded feature block below is merged into these.
train_df_to_append, test_df_to_append = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("new_train.csv", "new_test.csv")
)

In [None]:
# Raw datasets; the cells below read the "article", "session", "keyfeatures"
# and "new_authors" columns from these frames.
train_df = pd.read_csv("train_dataset_train.csv", index_col=0)
test_df = pd.read_csv("test_dataset_test.csv", index_col=0)

# Векторизация текста статьи с помощью doc2vec

In [None]:
# Stack train and test articles (train rows first).
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` gives the same result.
corpus_lem = pd.concat([train_df[["article", "session"]], test_df[["article", "session"]]])

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
# Article texts as a plain Python list, in corpus_lem row order.
corpus = corpus_lem["article"].tolist()

In [None]:
import re
# Substitutions applied to every article, in this order:
# HTML-like tags, URLs, digit runs, listed punctuation -> space, repeated spaces -> one.
article_subs = (
    (re.compile('<.*?>'), ''),
    (re.compile(r'http\S+'), ''),
    (re.compile('[0-9]+'), ''),
    (re.compile(r"[-—()\"#/@;:<>{}=~|?€«,.»$]"), " "),
    (re.compile(' +'), ' '),
)
cleaned_corpus = []
for raw_text in corpus:
    for pattern, replacement in article_subs:
        raw_text = pattern.sub(replacement, raw_text)
    cleaned_corpus.append(raw_text)


In [None]:
# Sanity check: number of cleaned documents (train + test).
len(cleaned_corpus)

In [None]:
from tqdm import tqdm
# Tokenize every cleaned article with NLTK.
# Iterating the list directly (instead of `tqdm(enumerate(...))`) lets tqdm see the
# length, so the progress bar shows a total and an ETA; the index was unused.
train_set = [nltk.word_tokenize(sent) for sent in tqdm(cleaned_corpus)]

In [None]:
# One TaggedDocument per article, tagged with its position in train_set.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(train_set)]
# Create the model WITHOUT a corpus: passing `documents` here would build the
# vocabulary and run a full training pass immediately, which the explicit
# build_vocab()/train() cells below would then discard and repeat.
model = Doc2Vec(vector_size=300, window=4, min_count=1, workers=4)

In [None]:
# Build the model vocabulary from the tagged articles.
model.build_vocab(documents)

In [None]:
# Train using the corpus size and epoch count stored on the model.
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
# NOTE(review): `init_sims(replace=True)` is believed to be deprecated in gensim 4.x, and only
# `infer_vector` is called on the model afterwards — confirm whether this call is still needed.
model.init_sims(replace=True)

In [None]:
# Infer one embedding per tokenized article.
# Iterating the list directly lets tqdm display the total; the index was unused.
transformed_corpus = [model.infer_vector(sents) for sents in tqdm(train_set)]

In [None]:
# Column names doc2vec_article_0 ... doc2vec_article_299, one per embedding dimension.
embed_cols = [str(dim) for dim in range(300)]
emb_names = ["doc2vec_article_" + suffix for suffix in embed_cols]

# Embedding frame labelled with the same index as the stacked article corpus.
doc2vec_encoded = pd.DataFrame(
    transformed_corpus, columns=emb_names, index=corpus_lem.index
)

In [None]:
# Inner-join the article embeddings onto each split by index.
train_df_to_append = train_df_to_append.merge(doc2vec_encoded, left_index=True, right_index=True)
test_df_to_append = test_df_to_append.merge(doc2vec_encoded, left_index=True, right_index=True)

## Анализ тональности текста статьи

In [None]:
# Stack train and test articles (train rows first).
# `Series.append` was removed in pandas 2.0; `pd.concat` gives the same result.
corpus = pd.concat([train_df["article"], test_df["article"]])
list_corpus = corpus.values.tolist()

In [None]:
# Substitutions applied to every article, in this order:
# HTML-like tags, URLs, digit runs, listed punctuation -> space, repeated spaces -> one.
sentiment_subs = (
    (re.compile('<.*?>'), ''),
    (re.compile(r'http\S+'), ''),
    (re.compile('[0-9]+'), ''),
    (re.compile(r"[-—()\"#/@;:<>{}=~|?€«,.»$]"), " "),
    (re.compile(' +'), ' '),
)
cleaned_corpus = []
for raw_text in list_corpus:
    for pattern, replacement in sentiment_subs:
        raw_text = pattern.sub(replacement, raw_text)
    cleaned_corpus.append(raw_text)

In [None]:
def sensitive_analysis(cleaned_corpus):
    """Run the dostoevsky FastText social-network model over `cleaned_corpus`.

    Returns whatever `FastTextSocialNetworkModel.predict(..., k=5)` yields for
    the given texts (one entry per input text — presumably a label->score
    mapping; verify against the dostoevsky documentation).
    """
    sentiment_model = FastTextSocialNetworkModel(tokenizer=RegexTokenizer())
    return sentiment_model.predict(cleaned_corpus, k=5)
s_a = sensitive_analysis(cleaned_corpus)

In [None]:
# Sentiment predictions as a frame labelled with the index of the stacked article Series.
after_s_an = pd.DataFrame(s_a, index=corpus.index)
# Display the frame for inspection.
after_s_an

In [None]:
# Inner-join the sentiment scores onto each split by index.
train_df_to_append = train_df_to_append.merge(after_s_an, left_index=True, right_index=True)
test_df_to_append = test_df_to_append.merge(after_s_an, left_index=True, right_index=True)

# Кодирование признака keyfeatures

In [None]:
# Stack train and test keyfeatures (train rows first).
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` gives the same result.
corpus_lem = pd.concat([train_df[["keyfeatures", "session"]], test_df[["keyfeatures", "session"]]])

In [None]:
# Missing keyfeatures become empty strings so the regex cleaning below can run on every row.
keyfeatures_filled = corpus_lem["keyfeatures"].fillna("")
corpus = keyfeatures_filled.tolist()

In [None]:
import re
# Substitutions applied to every keyfeatures string, in this order:
# HTML-like tags, URLs, digit runs, listed punctuation -> space, repeated spaces -> one.
keyfeature_subs = (
    (re.compile('<.*?>'), ''),
    (re.compile(r'http\S+'), ''),
    (re.compile('[0-9]+'), ''),
    (re.compile(r"[-—()\"#/@;:<>{}=~|?€«,.»$]"), " "),
    (re.compile(' +'), ' '),
)
cleaned_corpus = []
for raw_text in corpus:
    for pattern, replacement in keyfeature_subs:
        raw_text = pattern.sub(replacement, raw_text)
    # Lower-case last, after all substitutions.
    cleaned_corpus.append(raw_text.lower())

In [None]:
# Peek at a few cleaned entries instead of dumping the whole corpus into the cell output.
cleaned_corpus[:5]

In [None]:
# Bag-of-words over keyfeatures, limited to the 500 most frequent terms.
kf_vect = CountVectorizer(max_features=500)
# `.toarray()` yields a regular ndarray; `.todense()` returns the discouraged np.matrix type.
kf_transformed = kf_vect.fit_transform(cleaned_corpus).toarray()

In [None]:
# `get_feature_names` was removed in scikit-learn 1.2; prefer `get_feature_names_out`
# and fall back to the old method only on releases that predate it.
if hasattr(kf_vect, "get_feature_names_out"):
    names = list(kf_vect.get_feature_names_out())
else:
    names = kf_vect.get_feature_names()

In [None]:
# `get_feature_names` was removed in scikit-learn 1.2; prefer `get_feature_names_out`
# and fall back to the old method only on releases that predate it.
if hasattr(kf_vect, "get_feature_names_out"):
    kf_columns = list(kf_vect.get_feature_names_out())
else:
    kf_columns = kf_vect.get_feature_names()

# Term-count frame: one column per vocabulary term, labelled with the corpus index.
df = pd.DataFrame(kf_transformed, columns=kf_columns, index=corpus_lem.index)

In [None]:
# Inner-join the keyfeatures term counts onto each split by index.
# NOTE(review): term columns are unprefixed; a term equal to an existing column name
# would be suffixed _x/_y by merge — confirm this cannot happen for this vocabulary.
train_df_to_append = train_df_to_append.merge(df, left_index=True, right_index=True)
test_df_to_append = test_df_to_append.merge(df, left_index=True, right_index=True)

# Кодирование авторов с помощью метода CountVectorizer

In [None]:
# Stack train and test author strings (train rows first).
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` gives the same result.
corpus_lem = pd.concat([train_df[["new_authors", "session"]], test_df[["new_authors", "session"]]])

In [None]:
# Missing author strings become empty strings so the regex cleaning below can run on every row.
authors_filled = corpus_lem["new_authors"].fillna("")
corpus = authors_filled.tolist()

In [None]:
import re
# Substitutions applied to every author string, in this order:
# HTML-like tags, URLs, digit runs, listed punctuation -> space, repeated spaces -> one.
author_subs = (
    (re.compile('<.*?>'), ''),
    (re.compile(r'http\S+'), ''),
    (re.compile('[0-9]+'), ''),
    (re.compile(r"[-—()\"#/@;:<>{}=~|?€«,.»$]"), " "),
    (re.compile(' +'), ' '),
)
cleaned_corpus = []
for raw_text in corpus:
    for pattern, replacement in author_subs:
        raw_text = pattern.sub(replacement, raw_text)
    # Lower-case last, after all substitutions.
    cleaned_corpus.append(raw_text.lower())

In [None]:
# Peek at a few cleaned entries instead of dumping the whole corpus into the cell output.
cleaned_corpus[:5]

In [None]:
# Turn each cleaned author string into "name_surname" tokens so that
# CountVectorizer treats every author as a single vocabulary term.
new_corpus = []
from tqdm import tqdm
for elem in tqdm(cleaned_corpus):
    if len(elem) == 3:
        # Three-character strings are passed through unchanged
        # (presumably an agency byline such as "rbk" — verify against the data).
        new_corpus.append(elem)
        continue
    elem_tokens = nltk.word_tokenize(elem)
    if len(elem_tokens) > 2:
        # Several authors: glue tokens pairwise (0,1), (2,3), ...;
        # a trailing unpaired token is dropped, each pair is followed by a space.
        paired = zip(elem_tokens[0::2], elem_tokens[1::2])
        new_corpus.append("".join(name + "_" + surname + " " for name, surname in paired))
    else:
        # Two tokens -> "name_surname". One token is kept as-is and an empty string
        # (e.g. a missing author filled with "") stays empty; the previous code
        # indexed elem_tokens[1] unconditionally and raised IndexError for both.
        new_corpus.append("_".join(elem_tokens))

In [None]:
# Peek at a few encoded author strings instead of dumping the whole list into the cell output.
new_corpus[:5]

In [None]:
# Binary presence/absence encoding of author tokens.
af_vect = CountVectorizer(binary=True,analyzer='word')
# `.toarray()` yields a regular ndarray; `.todense()` returns the discouraged np.matrix type.
af_transformed = af_vect.fit_transform(new_corpus).toarray()

In [None]:
# `get_feature_names` was removed in scikit-learn 1.2; prefer `get_feature_names_out`
# and fall back to the old method only on releases that predate it.
if hasattr(af_vect, "get_feature_names_out"):
    af_columns = list(af_vect.get_feature_names_out())
else:
    af_columns = af_vect.get_feature_names()

# Author-indicator frame: one column per author token, labelled with the corpus index.
df = pd.DataFrame(af_transformed, columns=af_columns, index=corpus_lem.index)

In [None]:
# `get_feature_names` was removed in scikit-learn 1.2; prefer `get_feature_names_out`
# and fall back to the old method only on releases that predate it.
if hasattr(af_vect, "get_feature_names_out"):
    names = list(af_vect.get_feature_names_out())
else:
    names = af_vect.get_feature_names()

In [None]:
# Inner-join the author indicators onto each split by index.
# NOTE(review): author columns are unprefixed; a token equal to an existing column name
# would be suffixed _x/_y by merge — confirm this cannot happen for this vocabulary.
train_df_to_append = train_df_to_append.merge(df, left_index=True, right_index=True)
test_df_to_append = test_df_to_append.merge(df, left_index=True, right_index=True)

## Дополнительные признаки

### Train_df

In [None]:
from tqdm import tqdm

import re

# Length and word-count features for the TRAIN frame.
#
# Replaces six row-by-row `iterrows()` + `.loc[index, ...]` loops and three
# copy-pasted cleaning blocks with column-wise computation. Values are assigned
# positionally, so repeated index labels cannot overwrite each other, and the
# shared `corpus` / `cleaned_corpus` / `train_set` variables are no longer clobbered.

# Not used in the visible notebook; retained in case another cell references it.
lem_len = []

# Same substitutions, in the same order, as the cleaning cells earlier in the notebook.
_count_subs = (
    (re.compile('<.*?>'), ''),
    (re.compile(r'http\S+'), ''),
    (re.compile('[0-9]+'), ''),
    (re.compile(r"[-—()\"#/@;:<>{}=~|?€«,.»$]"), " "),
    (re.compile(' +'), ' '),
)


def _count_words(value):
    """Return the number of NLTK tokens left in `value` after the regex clean-up."""
    text = str(value)
    for pattern, replacement in _count_subs:
        text = pattern.sub(replacement, text)
    return len(nltk.word_tokenize(text))


_train_texts = {
    # A missing article or lemmatized title counts as empty (length 0, 0 words);
    # previously a NaN here raised TypeError. An empty lem_title is read back
    # from CSV as NaN, so this case can occur.
    'article': train_df_to_append['article'].fillna(''),
    'title': train_df_to_append['lem_title'].fillna(''),
    # keyfeatures keeps its original semantics: values go through str(),
    # so a missing value is measured as the literal text "nan".
    'keyfeatures': train_df_to_append['keyfeatures'].map(str),
}

# Character lengths: article_len, title_len, keyfeatures_len.
# Stored as float to match the dtype the original row-wise assignment produced.
for _name, _texts in _train_texts.items():
    train_df_to_append[_name + '_len'] = _texts.str.len().astype(float).to_numpy()

# Token counts: article_word_count, title_word_count, keyfeatures_word_count.
for _name, _texts in _train_texts.items():
    train_df_to_append[_name + '_word_count'] = [
        float(_count_words(text)) for text in tqdm(_texts, desc=_name + '_word_count')
    ]

### Test_df

In [None]:
from tqdm import tqdm

import re

# Length and word-count features for the TEST frame.
#
# Replaces six row-by-row `iterrows()` + `.loc[index, ...]` loops and three
# copy-pasted cleaning blocks with column-wise computation. Values are assigned
# positionally, so repeated index labels cannot overwrite each other, and the
# shared `corpus` / `cleaned_corpus` / `train_set` variables are no longer clobbered.

# Not used in the visible notebook; retained in case another cell references it.
lem_len = []

# Same substitutions, in the same order, as the cleaning cells earlier in the notebook.
_count_subs = (
    (re.compile('<.*?>'), ''),
    (re.compile(r'http\S+'), ''),
    (re.compile('[0-9]+'), ''),
    (re.compile(r"[-—()\"#/@;:<>{}=~|?€«,.»$]"), " "),
    (re.compile(' +'), ' '),
)


def _count_words(value):
    """Return the number of NLTK tokens left in `value` after the regex clean-up."""
    text = str(value)
    for pattern, replacement in _count_subs:
        text = pattern.sub(replacement, text)
    return len(nltk.word_tokenize(text))


_test_texts = {
    # A missing article or lemmatized title counts as empty (length 0, 0 words);
    # previously a NaN here raised TypeError. An empty lem_title is read back
    # from CSV as NaN, so this case can occur.
    'article': test_df_to_append['article'].fillna(''),
    'title': test_df_to_append['lem_title'].fillna(''),
    # keyfeatures keeps its original semantics: values go through str(),
    # so a missing value is measured as the literal text "nan".
    'keyfeatures': test_df_to_append['keyfeatures'].map(str),
}

# Character lengths: article_len, title_len, keyfeatures_len.
# Stored as float to match the dtype the original row-wise assignment produced.
for _name, _texts in _test_texts.items():
    test_df_to_append[_name + '_len'] = _texts.str.len().astype(float).to_numpy()

# Token counts: article_word_count, title_word_count, keyfeatures_word_count.
for _name, _texts in _test_texts.items():
    test_df_to_append[_name + '_word_count'] = [
        float(_count_words(text)) for text in tqdm(_texts, desc=_name + '_word_count')
    ]

In [None]:
# Persist the final feature frames (index is written as the first CSV column).
train_df_to_append.to_csv("cleaned_train.csv")
test_df_to_append.to_csv("cleaned_test.csv")