In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix, coo_matrix, hstack
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import Perceptron

import nltk
#nltk.download()

train_df = pd.read_csv('train.csv')
#train_df.head()
print(train_df)

            id                        url  \
0            0                    m.kp.md   
1            1                  www.kp.by   
2            2              fanserials.tv   
3            3            colorbox.spb.ru   
4            4              tula-sport.ru   
...        ...                        ...   
135304  135304                    mail.ru   
135305  135305                 www.ntv.ru   
135306  135306  topclassiccarsforsale.com   
135307  135307                wowcream.ru   
135308  135308                 www.ubu.ru   

                                                    title  target  
0       Экс-министр экономики Молдовы - главе МИДЭИ, ц...   False  
1       Эта песня стала известна многим телезрителям б...   False  
2       Банши 4 сезон 2 серия Бремя красоты смотреть о...   False  
3                                   Не Беси Меня Картинки   False  
4       В Новомосковске сыграют следж-хоккеисты алекси...   False  
...                                                

In [2]:
validate_df = pd.read_csv("test.csv")
validate_df

Unnamed: 0,id,url,title
0,135309,www.kommersant.ru,Шестой кассационный суд в Самаре начнет работу...
1,135310,urexpert.online,"Что такое индексация алиментов, кем и в каких ..."
2,135311,imperimeha.ru,Женщинам | Империя Меха - Part 12
3,135312,national-porn.com,"Небритые, волосатые киски: Порно всех стран и ..."
4,135313,2gis.ru,67
...,...,...,...
165373,300682,etp.armtek.ru,Armtek - запчасти для грузовых и легковых авто...
165374,300683,mail.ru,"Лилия Якупова - Караганда, Карагандинская обла..."
165375,300684,xn----8sbnqchpeeeth.xn--p1ai,Администрация Лесного района Тверской области ...
165376,300685,www-sunhome-ru.cdn.ampproject.org,Сонник Изменение сознания. К чему снится Измен...


In [3]:
train_df['target'].value_counts()

False    118594
True      16715
Name: target, dtype: int64

In [4]:
# Поглядим, какие есть уникальные url
unique_urls = train_df['url'].value_counts()
print(unique_urls)

mail.ru                  7900
baza.drom.ru             1042
www.olx.ua                945
irecommend.ru             767
daftsex.com               741
                         ... 
kurgan7.ru                  1
www.kirov.spb.ru            1
fixico.com.ua               1
stirka.net                  1
ahmedabad.wedding.net       1
Name: url, Length: 40391, dtype: int64


In [5]:
# Для каждого рассчитываем процент положительных и отрицательных

# freq_urls = unique_urls[unique_urls > 10]
# freq_urls = unique_urls
# urls = []
# pos_ratios = []
# neg_ratios = []
# nums = []
# for url_i in freq_urls.index:
#     idxs = train_df['url'] == url_i
#     targets = train_df[idxs]['target']
#     n_true = targets[targets == 1].size
#     urls.append(url_i)
#     pos_ratios.append(n_true/targets.size)
#     neg_ratios.append(1 - n_true/targets.size)
#     nums.append(targets.size)

# url_info_df = pd.DataFrame({'url' : urls, 'pos_ratio' : pos_ratios, 'neg_ratio' : neg_ratios, 'num' : nums})
url_info_df = pd.read_csv('url_info.csv')

# А теперь вводим метрики, учитывающие количество появлений url
# Для этого будем умножать на сигмоиды
#pos_sigm = url_info_df['pos_ratio'] * 1/(1 + 12*np.exp(-(url_info_df['num'] - 2.5)))
#neg_sigm = url_info_df['neg_ratio'] * 1/(1 + 12*np.exp(-(url_info_df['num'] - 2.5)))

pos_sigm = url_info_df['pos_ratio'] * (0.5 + np.arctan(0.4 * (url_info_df['num'] - 10.)) / np.pi)
neg_sigm = url_info_df['neg_ratio'] * (0.5 + np.arctan(0.4 * (url_info_df['num'] - 10.)) / np.pi)

url_info_df['pos_sigm'] = pos_sigm
url_info_df['neg_sigm'] = neg_sigm
url_info_df.to_csv('url_info.csv', index=False)


In [6]:
# Добавляем эти сигмоидные метрики в датафреймы
train_df_m = pd.merge(train_df, url_info_df, how='left',on=['url'])
validate_df_m = pd.merge(validate_df, url_info_df, how='left',on=['url'])
# Если такого url в test не было в train -- просто обнуляем
validate_df_m = validate_df_m.fillna(value=0.)
validate_df_m.head()


Unnamed: 0,id,url,title,pos_ratio,neg_ratio,num,pos_sigm,neg_sigm
0,135309,www.kommersant.ru,Шестой кассационный суд в Самаре начнет работу...,0.0,1.0,10.0,0.0,0.5
1,135310,urexpert.online,"Что такое индексация алиментов, кем и в каких ...",0.0,1.0,6.0,0.0,0.177808
2,135311,imperimeha.ru,Женщинам | Империя Меха - Part 12,0.0,0.0,0.0,0.0,0.0
3,135312,national-porn.com,"Небритые, волосатые киски: Порно всех стран и ...",1.0,0.0,1.0,0.086245,0.0
4,135313,2gis.ru,67,0.0,1.0,75.0,0.0,0.987763


In [7]:
# Теперь непосредственно обрабатываем тайтлы
# Всё в нижни регистр
train_df_m['title'] = train_df_m['title'].str.lower()
validate_df_m['title'] = validate_df_m['title'].str.lower()
validate_df_m.head()

Unnamed: 0,id,url,title,pos_ratio,neg_ratio,num,pos_sigm,neg_sigm
0,135309,www.kommersant.ru,шестой кассационный суд в самаре начнет работу...,0.0,1.0,10.0,0.0,0.5
1,135310,urexpert.online,"что такое индексация алиментов, кем и в каких ...",0.0,1.0,6.0,0.0,0.177808
2,135311,imperimeha.ru,женщинам | империя меха - part 12,0.0,0.0,0.0,0.0,0.0
3,135312,national-porn.com,"небритые, волосатые киски: порно всех стран и ...",1.0,0.0,1.0,0.086245,0.0
4,135313,2gis.ru,67,0.0,1.0,75.0,0.0,0.987763


In [8]:
# Удаляем всё, что не буквы
train_df_m['title'] = train_df_m['title'].str.replace('[^a-zA-Zа-яА-Я ]', ' ')
validate_df_m['title'] = validate_df_m['title'].str.replace('[^a-zA-Zа-яА-Я ]', ' ')
validate_df_m.head()

Unnamed: 0,id,url,title,pos_ratio,neg_ratio,num,pos_sigm,neg_sigm
0,135309,www.kommersant.ru,шестой кассационный суд в самаре начнет работу...,0.0,1.0,10.0,0.0,0.5
1,135310,urexpert.online,что такое индексация алиментов кем и в каких ...,0.0,1.0,6.0,0.0,0.177808
2,135311,imperimeha.ru,женщинам империя меха part,0.0,0.0,0.0,0.0,0.0
3,135312,national-porn.com,небритые волосатые киски порно всех стран и ...,1.0,0.0,1.0,0.086245,0.0
4,135313,2gis.ru,,0.0,1.0,75.0,0.0,0.987763


In [9]:
import pymorphy2

# Лемматизируем
lemmatizer = pymorphy2.MorphAnalyzer()

lemmatizer_cache = {}
def lem_text(text):
    return ' '.join([lem_token(token) for token in text.split()])

def lem_token(token):
    if lemmatizer.word_is_known(token):
        if token not in lemmatizer_cache:
            lemmatizer_cache[token] = lemmatizer.parse(token)[0].normal_form
        return lemmatizer_cache[token]
    return token

train_df_m['title'] = train_df_m['title'].apply(lem_text)
validate_df_m['title'] = validate_df_m['title'].apply(lem_text)
validate_df_m.head()

Unnamed: 0,id,url,title,pos_ratio,neg_ratio,num,pos_sigm,neg_sigm
0,135309,www.kommersant.ru,шестой кассационный суд в самара начать работа...,0.0,1.0,10.0,0.0,0.5
1,135310,urexpert.online,что такой индексация алименты кто и в какой сл...,0.0,1.0,6.0,0.0,0.177808
2,135311,imperimeha.ru,женщина империя мех part,0.0,0.0,0.0,0.0,0.0
3,135312,national-porn.com,небритый волосатый киска порно весь страна и н...,1.0,0.0,1.0,0.086245,0.0
4,135313,2gis.ru,,0.0,1.0,75.0,0.0,0.987763


In [10]:
# Убираем стоп-слова
from nltk.corpus import stopwords

mystopwords = stopwords.words('russian') + stopwords.words('english')

def remove_stopwords(text, mystopwords = mystopwords):
    try:
        return u" ".join([token for token in text.split() if not token in mystopwords])
    except:
        return u""

train_df_m['title'] = train_df_m['title'].apply(remove_stopwords)
validate_df_m['title'] = validate_df_m['title'].apply(remove_stopwords)
#train_df_m.head()
validate_df_m.head()

Unnamed: 0,id,url,title,pos_ratio,neg_ratio,num,pos_sigm,neg_sigm
0,135309,www.kommersant.ru,шестой кассационный суд самара начать работа р...,0.0,1.0,10.0,0.0,0.5
1,135310,urexpert.online,индексация алименты случай производиться каков...,0.0,1.0,6.0,0.0,0.177808
2,135311,imperimeha.ru,женщина империя мех part,0.0,0.0,0.0,0.0,0.0
3,135312,national-porn.com,небритый волосатый киска порно весь страна нац...,1.0,0.0,1.0,0.086245,0.0
4,135313,2gis.ru,,0.0,1.0,75.0,0.0,0.987763


In [44]:
# Удаляем строки с малым количеством слов
trsh = 1
train_df_m = train_df_m.loc[train_df_m['title'].str.split().str.len() > trsh]
#validate_df_m = validate_df_m.loc[validate_df_m['title'].str.split().str.len() > 2]
train_df_m

Unnamed: 0,id,url,title,target,pos_ratio,neg_ratio,num,pos_sigm,neg_sigm
0,0,m kp md,экс министр экономика молдова глава мидэи цель...,False,0.000000,1.000000,3,0.000000,0.109188
1,1,www kp by,песня стать известный многий телезритель благо...,False,0.000000,1.000000,7,0.000000,0.221142
2,2,fanserials tv,банши сезон серия бремя красота смотреть онлайн,False,0.000000,1.000000,43,0.000000,0.975932
3,3,colorbox spb ru,бесить картинка,False,0.000000,1.000000,1,0.000000,0.086245
4,4,tula sport ru,новомосковск сыграть следж хоккеист алексински...,False,0.000000,1.000000,2,0.000000,0.096411
...,...,...,...,...,...,...,...,...,...
135304,135304,mail ru,пора тюльпан турецкий сериал русский язык резу...,False,0.016203,0.983797,7900,0.016201,0.983698
135305,135305,www ntv ru,остросюжетный сериал шеф игра повышение серия,False,0.000000,1.000000,44,0.000000,0.976637
135306,135306,topclassiccarsforsale com,plymouth special deluxe hot rod automatic smal...,False,0.000000,1.000000,9,0.000000,0.378881
135307,135307,wowcream ru,купить skin сыворотка питательный power formul...,False,0.000000,1.000000,1,0.000000,0.086245


In [45]:
# Количество строк с короткими тайтлами в валидационной
validate_df_m.loc[validate_df_m['title'].str.split().str.len() < trsh+1].shape

(4183, 8)

In [46]:
# Сделаем tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
# Обучаемся и по валидационной выборке
tfidf_vectorizer.fit(pd.concat([train_df_m['title'], validate_df_m['title']]))
train_titles_tfidf = tfidf_vectorizer.transform(train_df_m['title'])
validate_titles_tfidf = tfidf_vectorizer.transform(validate_df_m['title'])
print(train_titles_tfidf.shape)
print(validate_titles_tfidf.shape)

(131876, 193620)
(165378, 193620)


In [69]:
# Сделаем vectorizer
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(binary=True)
# Обучаемся и по валидационной выборке
count_vectorizer.fit(pd.concat([train_df_m['title'], validate_df_m['title']]))
train_titles_count = count_vectorizer.transform(train_df_m['title'])
validate_titles_count = count_vectorizer.transform(validate_df_m['title'])
print(train_titles_count.shape)
print(validate_titles_count.shape)

(131876, 193620)
(165378, 193620)


In [127]:
# Просто посмотрим самые часто встречающиеся слова
from scipy.sparse import csr_matrix

count_vectorizer_test = CountVectorizer()
count_vectorizer_test.fit(train_df_m['title'])
word_freq_all = np.squeeze(np.asarray(csr_matrix.sum(count_vectorizer_test.transform(train_df_m['title']), axis=0)))

words_df = pd.DataFrame({'word' : count_vectorizer_test.get_feature_names(),
                         'all frequency' : word_freq_all})

# Посчитаем самые частые слова при положительных и отрицательных
train_df_m_true = train_df_m[train_df_m['target'] == True]
test_sum_true = np.squeeze(np.asarray(csr_matrix.sum(count_vectorizer_test.transform(train_df_m_true['title']), axis=0)))
words_df['true frequency'] = test_sum_true

train_df_m_false = train_df_m[train_df_m['target'] == False]
test_sum_false = np.squeeze(np.asarray(csr_matrix.sum(count_vectorizer_test.transform(train_df_m_false['title']), axis=0)))
words_df['false frequency'] = test_sum_false

# И замутим для них метрики
words_df['pos ratio'] = words_df['true frequency'] / words_df['all frequency']
words_df['neg ratio'] = 1 - words_df['true frequency'] / words_df['all frequency']

#words_df['pos sigm'] = words_df['pos_ratio'] * (0.5 + np.arctan(0.4 * (words_df['all frequency'] - 10.)) / np.pi)
# words_df['neg sigm'] = words_df['neg_ratio'] * (0.5 + np.arctan(0.4 * (words_df['all frequency'] - 10.)) / np.pi)
words_df['pos sigm'] = words_df['pos ratio'] * (np.log(words_df['all frequency']) + 0.5)/5
words_df['neg sigm'] = words_df['neg ratio'] * (np.log(words_df['all frequency']) + 0.5)/5


#words_df = words_df.sort_values(by='all frequency', ascending=False)
words_df = words_df.sort_values(by='pos sigm', ascending=False)

words_df

Unnamed: 0,word,all frequency,true frequency,false frequency,pos ratio,neg ratio,pos sigm,neg sigm
94715,порно,5211,5207,4,0.999232,0.000768,1.810315,0.001391
34645,porn,3781,3780,1,0.999736,0.000264,1.747087,0.000462
40050,sex,2042,2033,9,0.995593,0.004407,1.617178,0.007159
47901,videos,1900,1894,6,0.996842,0.003158,1.604838,0.005084
50006,xxx,1367,1367,0,1.000000,0.000000,1.544075,0.000000
...,...,...,...,...,...,...,...,...
47547,vell,1,0,1,0.000000,1.000000,0.000000,0.100000
47546,velile,1,0,1,0.000000,1.000000,0.000000,0.100000
47545,velikspb,1,0,1,0.000000,1.000000,0.000000,0.100000
47544,velikono,1,0,1,0.000000,1.000000,0.000000,0.100000


In [177]:
# Сделаем эдакий словарь триггеров
bad_words_df = words_df[words_df['pos ratio'] > 0.9].sort_values(by='pos sigm', ascending=False)[:1500]

#bad_words_dict = {bad_words_df['word'] : bad_words_df['pos sigm']}

bad_words_dict = {}
for str_i in bad_words_df[['word', 'pos sigm']].itertuples(index=False):
    bad_words_dict[str_i[0]] = str_i[1]

bad_words_df
#bad_words_dict

Unnamed: 0,word,all frequency,true frequency,false frequency,pos ratio,neg ratio,pos sigm,neg sigm
94715,порно,5211,5207,4,0.999232,0.000768,1.810315,0.001391
34645,porn,3781,3780,1,0.999736,0.000264,1.747087,0.000462
40050,sex,2042,2033,9,0.995593,0.004407,1.617178,0.007159
47901,videos,1900,1894,6,0.996842,0.003158,1.604838,0.005084
50006,xxx,1367,1367,0,1.000000,0.000000,1.544075,0.000000
...,...,...,...,...,...,...,...,...
104716,сосущий,3,3,0,1.000000,0.000000,0.319722,0.000000
30682,nickey,3,3,0,1.000000,0.000000,0.319722,0.000000
23116,kinkyhub,3,3,0,1.000000,0.000000,0.319722,0.000000
116645,шлёпание,3,3,0,1.000000,0.000000,0.319722,0.000000


In [180]:
# Теперь добавим метрику, которая считает в title сумму для словарных слов

def sum_word_sigm(text, bad_words=bad_words_dict):
    ret_sum = 0.
    for token in text.split():
        if token in bad_words:
            ret_sum += bad_words[token]
    return ret_sum

bad_words_dict;

train_df_m['sigm_sum'] = train_df_m['title'].apply(sum_word_sigm)
validate_df_m['sigm_sum'] = validate_df_m['title'].apply(sum_word_sigm)
#train_df_m.head()
validate_df_m.head()

Unnamed: 0,id,url,title,pos_ratio,neg_ratio,num,pos_sigm,neg_sigm,sigm_sum
0,135309,www kommersant ru,шестой кассационный суд самара начать работа р...,0.0,1.0,10.0,0.0,0.5,0.0
1,135310,urexpert online,индексация алименты случай производиться каков...,0.0,1.0,6.0,0.0,0.177808,0.0
2,135311,imperimeha ru,женщина империя мех part,0.0,0.0,0.0,0.0,0.0,0.0
3,135312,national porn com,небритый волосатый киска порно весь страна нац...,1.0,0.0,1.0,0.086245,0.0,4.369169
4,135313,gis ru,,0.0,1.0,75.0,0.0,0.987763,0.0


In [181]:
# Попробуем спилить из url всё, что не буквы
train_df_m['url'] = train_df_m['url'].str.replace('[^a-zA-Zа-яА-Я ]', ' ')
validate_df_m['url'] = validate_df_m['url'].str.replace('[^a-zA-Zа-яА-Я ]', ' ')
validate_df_m.head()

Unnamed: 0,id,url,title,pos_ratio,neg_ratio,num,pos_sigm,neg_sigm,sigm_sum
0,135309,www kommersant ru,шестой кассационный суд самара начать работа р...,0.0,1.0,10.0,0.0,0.5,0.0
1,135310,urexpert online,индексация алименты случай производиться каков...,0.0,1.0,6.0,0.0,0.177808,0.0
2,135311,imperimeha ru,женщина империя мех part,0.0,0.0,0.0,0.0,0.0,0.0
3,135312,national porn com,небритый волосатый киска порно весь страна нац...,1.0,0.0,1.0,0.086245,0.0,4.369169
4,135313,gis ru,,0.0,1.0,75.0,0.0,0.987763,0.0


In [182]:
# tf-idf на url

#tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer = CountVectorizer(binary=True)

# Обучаемся и по валидационной выборке
tfidf_vectorizer.fit(pd.concat([train_df_m['url'], validate_df_m['url']]))

train_urls_tfidf = tfidf_vectorizer.transform(train_df_m['url'])
validate_urls_tfidf = tfidf_vectorizer.transform(validate_df_m['url'])

print(train_urls_tfidf.shape)
print(validate_urls_tfidf.shape)

(131876, 53607)
(165378, 53607)


In [204]:
# Лепим данные

TEST_SIZE = 0.5

y_full = train_df_m['target']

# Всё вместе
X_full0 = hstack([train_df_m.iloc[:,7:], train_titles_tfidf, train_titles_count,  train_urls_tfidf]).tocsr()
X_validate0 = hstack([validate_df_m.iloc[:,6:], validate_titles_tfidf, validate_titles_count, validate_urls_tfidf]).tocsr()
X_train0, X_test0, y_train0, y_test0 = train_test_split(X_full0, y_full, test_size=TEST_SIZE, random_state=42)

# С tf-idf
#X_full1 = hstack([train_df_m.iloc[:,9:], train_titles_tfidf, train_urls_tfidf]).tocsr()
#X_validate1 = hstack([validate_df_m.iloc[:,8:], validate_titles_tfidf, validate_urls_tfidf]).tocsr()
X_full1 = hstack([train_df_m.iloc[:,10:], train_titles_tfidf]).tocsr()
X_validate1 = hstack([validate_df_m.iloc[:,9:], validate_titles_tfidf]).tocsr()

X_train1, X_test1, y_train1, y_test1 = train_test_split(X_full1, y_full, test_size=TEST_SIZE, random_state=42)

# С count-vectorizer
#X_full2 = hstack([train_df_m.iloc[:,9:], train_titles_count, train_urls_tfidf]).tocsr()
#X_validate2 = hstack([validate_df_m.iloc[:,8:], validate_titles_count, validate_urls_tfidf]).tocsr()
X_full2 = hstack([train_df_m.iloc[:,9:], train_titles_count]).tocsr()
X_validate2 = hstack([validate_df_m.iloc[:,8:], validate_titles_count]).tocsr()

X_train2, X_test2, y_train2, y_test2 = train_test_split(X_full2, y_full, test_size=TEST_SIZE, random_state=42)

# url 
X_full3 = hstack([train_df_m.iloc[:,7:9], train_urls_tfidf]).tocsr()
X_validate3 = hstack([validate_df_m.iloc[:,6:8], validate_urls_tfidf]).tocsr()

X_train3, X_test3, y_train3, y_test3 = train_test_split(X_full3, y_full, test_size=TEST_SIZE, random_state=42)

print(X_full2.shape)
print(X_validate2.shape)

(131876, 193621)
(165378, 193621)


# Логистическая регрессия на всём

In [198]:
log_clf0 = LogisticRegression(random_state=42, max_iter=150).fit(X_train0, y_train0)

y_train_log_clf0_pred = log_clf0.predict(X_train0)
y_test_log_clf0_pred = log_clf0.predict(X_test0)
y_full_log_clf0_pred = log_clf0.predict(X_full0)
y_validate_log_clf0_pred = log_clf0.predict(X_validate0)

y_train_log_clf0_pred_prob = log_clf0.predict_proba(X_train0)
y_test_log_clf0_pred_prob = log_clf0.predict_proba(X_test0)
y_full_log_clf0_pred_prob = log_clf0.predict_proba(X_full0)
y_validate_log_clf0_pred_prob = log_clf0.predict_proba(X_validate0)

print('log_clf0 train', f1_score(y_train0, y_train_log_clf0_pred))
print('log_clf0 test ', f1_score(y_test0, y_test_log_clf0_pred))
print('log_clf0 full: ', f1_score(y_full, y_full_log_clf0_pred))

log_clf0 train 0.9941913787832466
log_clf0 test  0.9794431755046609
log_clf0 full:  0.9868526141180807


# Логистическая регрессия на tfidf

In [199]:
log_clf1 = LogisticRegression(random_state=42, max_iter=150).fit(X_train1, y_train1)

y_train_log_clf1_pred = log_clf1.predict(X_train1)
y_test_log_clf1_pred = log_clf1.predict(X_test1)
y_full_log_clf1_pred = log_clf1.predict(X_full1)
y_validate_log_clf1_pred = log_clf1.predict(X_validate1)

y_train_log_clf1_pred_prob = log_clf1.predict_proba(X_train1)
y_test_log_clf1_pred_prob = log_clf1.predict_proba(X_test1)
y_full_log_clf1_pred_prob = log_clf1.predict_proba(X_full1)
y_validate_log_clf1_pred_prob = log_clf1.predict_proba(X_validate1)

print('log_clf1 train', f1_score(y_train1, y_train_log_clf1_pred))
print('log_clf1 test ', f1_score(y_test1, y_test_log_clf1_pred))
print('log_clf1 full: ', f1_score(y_full, y_full_log_clf1_pred))

log_clf1 train 0.9303382457962736
log_clf1 test  0.9080063626723223
log_clf1 full:  0.9192876586533731


# Логистическая регрессия на count_vectorizer

In [205]:
log_clf2 = LogisticRegression(random_state=42, max_iter=150).fit(X_train2, y_train2)

y_train_log_clf2_pred = log_clf2.predict(X_train2)
y_test_log_clf2_pred = log_clf2.predict(X_test2)
y_full_log_clf2_pred = log_clf2.predict(X_full2)
y_validate_log_clf2_pred = log_clf2.predict(X_validate2)

y_train_log_clf2_pred_prob = log_clf2.predict_proba(X_train2)
y_test_log_clf2_pred_prob = log_clf2.predict_proba(X_test2)
y_full_log_clf2_pred_prob = log_clf2.predict_proba(X_full2)
y_validate_log_clf2_pred_prob = log_clf2.predict_proba(X_validate2)

print('log_clf2 train', f1_score(y_train2, y_train_log_clf2_pred))
print('log_clf2 test ', f1_score(y_test2, y_test_log_clf2_pred))
print('log_clf2 full: ', f1_score(y_full, y_full_log_clf2_pred))

log_clf2 train 0.9812415154880909
log_clf2 test  0.9630552613544326
log_clf2 full:  0.9721920823387171


# Логистическая регрессия на url

In [206]:
log_clf3 = LogisticRegression(random_state=42, max_iter=300).fit(X_train3, y_train3)

y_train_log_clf3_pred = log_clf3.predict(X_train3)
y_test_log_clf3_pred = log_clf3.predict(X_test3)
y_full_log_clf3_pred = log_clf3.predict(X_full3)
y_validate_log_clf3_pred = log_clf3.predict(X_validate3)

y_train_log_clf3_pred_prob = log_clf3.predict_proba(X_train3)
y_test_log_clf3_pred_prob = log_clf3.predict_proba(X_test3)
y_full_log_clf3_pred_prob = log_clf3.predict_proba(X_full3)
y_validate_log_clf3_pred_prob = log_clf3.predict_proba(X_validate3)

print('log_clf3 train', f1_score(y_train3, y_train_log_clf3_pred))
print('log_clf3 test ', f1_score(y_test3, y_test_log_clf3_pred))
print('log_clf3 full: ', f1_score(y_full, y_full_log_clf3_pred))

log_clf3 train 0.9671215880893301
log_clf3 test  0.9589143893440059
log_clf3 full:  0.9630344741677307


# Перцептрон на всём

In [207]:
prc_clf0 = Perceptron(tol=1e-3, random_state=42).fit(X_train0, y_train0)

y_train_prc_clf0_pred = prc_clf0.predict(X_train0)
y_test_prc_clf0_pred = prc_clf0.predict(X_test0)
y_full_prc_clf0_pred = prc_clf0.predict(X_full0)
y_validate_prc_clf0_pred = prc_clf0.predict(X_validate0)

# y_train_prc_clf0_pred_prob = prc_clf0.predict_proba(X_train0)
# y_test_prc_clf0_pred_prob = prc_clf0.predict_proba(X_test0)
# y_full_prc_clf0_pred_prob = prc_clf0.predict_proba(X_full0)
# y_validate_prc_clf0_pred_prob = prc_clf0.predict_proba(X_validate0)

print('prc_clf0 train', f1_score(y_train0, y_train_prc_clf0_pred))
print('prc_clf0 test ', f1_score(y_test0, y_test_prc_clf0_pred))
print('prc_clf0 full: ', f1_score(y_full, y_full_prc_clf0_pred))

prc_clf0 train 0.9992691839220462
prc_clf0 test  0.9795793592565419
prc_clf0 full:  0.9894434952404199


# Перцептрон на tf-idf

In [208]:
prc_clf1 = Perceptron(tol=1e-3, random_state=42).fit(X_train1, y_train1)

y_train_prc_clf1_pred = prc_clf1.predict(X_train1)
y_test_prc_clf1_pred = prc_clf1.predict(X_test1)
y_full_prc_clf1_pred = prc_clf1.predict(X_full1)
y_validate_prc_clf1_pred = prc_clf1.predict(X_validate1)

# y_train_prc_clf1_pred_prob = prc_clf1.predict_proba(X_train1)
# y_test_prc_clf1_pred_prob = prc_clf1.predict_proba(X_test1)
# y_full_prc_clf1_pred_prob = prc_clf1.predict_proba(X_full1)
# y_validate_prc_clf1_pred_prob = prc_clf1.predict_proba(X_validate1)

print('prc_clf1 train', f1_score(y_train1, y_train_prc_clf1_pred))
print('prc_clf1 test ', f1_score(y_test1, y_test_prc_clf1_pred))
print('prc_clf1 full: ', f1_score(y_full, y_full_prc_clf1_pred))

prc_clf1 train 0.998478115297985
prc_clf1 test  0.9508014109784022
prc_clf1 full:  0.9748358190634016


# Перцептрон на count

In [209]:
prc_clf2 = Perceptron(tol=1e-3, random_state=42).fit(X_train2, y_train2)

y_train_prc_clf2_pred = prc_clf2.predict(X_train2)
y_test_prc_clf2_pred = prc_clf2.predict(X_test2)
y_full_prc_clf2_pred = prc_clf2.predict(X_full2)
y_validate_prc_clf2_pred = prc_clf2.predict(X_validate2)

# y_train_prc_clf2_pred_prob = prc_clf2.predict_proba(X_train2)
# y_test_prc_clf2_pred_prob = prc_clf2.predict_proba(X_test2)
# y_full_prc_clf2_pred_prob = prc_clf2.predict_proba(X_full2)
# y_validate_prc_clf2_pred_prob = prc_clf2.predict_proba(X_validate2)

print('prc_clf2 train', f1_score(y_train2, y_train_prc_clf2_pred))
print('prc_clf2 test ', f1_score(y_test2, y_test_prc_clf2_pred))
print('prc_clf2 full: ', f1_score(y_full, y_full_prc_clf2_pred))

prc_clf2 train 0.9980513944708317
prc_clf2 test  0.961564232198331
prc_clf2 full:  0.9798105910655013


# Данные с регрессиями

In [216]:
# Данные
X_full_log = hstack([
                    #train_titles_tfidf,
                    train_titles_count,  
                    train_urls_tfidf,
                    train_df_m.iloc[:,7:],
                    y_full_log_clf1_pred_prob,
                    y_full_log_clf2_pred_prob,
                    y_full_log_clf3_pred_prob
                    ]).tocsr()
X_validate_log = hstack([
                    #validate_titles_tfidf,
                    validate_titles_count, 
                    validate_urls_tfidf, 
                    validate_df_m.iloc[:,6:],
                    y_validate_log_clf1_pred_prob,
                    y_validate_log_clf2_pred_prob,
                    y_validate_log_clf3_pred_prob
                    ]).tocsr()

X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(X_full_log, y_full, test_size=0.2, random_state=42)

# Перцептрон на регрессиях

In [217]:
prc_log_clf = Perceptron(tol=1e-3, random_state=42).fit(X_train_log, y_train_log)

y_train_prc_log_clf_pred = prc_log_clf.predict(X_train_log)
y_test_prc_log_clf_pred = prc_log_clf.predict(X_test_log)
y_full_prc_log_clf_pred = prc_log_clf.predict(X_full_log)
y_validate_prc_log_clf_pred = prc_log_clf.predict(X_validate_log)

print('rf_log_clf train', f1_score(y_train_log, y_train_prc_log_clf_pred))
print('rf_log_clf test ', f1_score(y_test_log, y_test_prc_log_clf_pred))
print('rf_log_clf full: ', f1_score(y_full, y_full_prc_log_clf_pred))

y_full_prc_log_clf_error = np.arange(y_full.size)[y_full_prc_log_clf_pred != y_full]

rf_log_clf train 0.9990467111534795
rf_log_clf test  0.9851432072292847
rf_log_clf full:  0.9962752640898822


# Лес на регрессиях

In [218]:
rf_log_clf = RandomForestClassifier(random_state=42, n_estimators=20, n_jobs = 2).fit(X_train_log, y_train_log)
#rf_log_clf = RandomForestClassifier(random_state=42, n_estimators=40, n_jobs = 2).fit(X_full_log, y_full)

y_train_rf_log_clf_pred = rf_log_clf.predict(X_train_log)
y_test_rf_log_clf_pred = rf_log_clf.predict(X_test_log)
y_full_rf_log_clf_pred = rf_log_clf.predict(X_full_log)
y_validate_rf_log_clf_pred = rf_log_clf.predict(X_validate_log)

print('rf_log_clf train', f1_score(y_train_log, y_train_rf_log_clf_pred))
print('rf_log_clf test ', f1_score(y_test_log, y_test_rf_log_clf_pred))
print('rf_log_clf full: ', f1_score(y_full, y_full_rf_log_clf_pred))

rf_log_clf train 0.9997330384043326
rf_log_clf test  0.9915863545969099
rf_log_clf full:  0.9981073325599853


# Регрессия на регрессиях

In [220]:
#log_log_clf = LogisticRegression(random_state=42, max_iter=300).fit(X_train_log, y_train_log)
log_log_clf = LogisticRegression(random_state=42, max_iter=300).fit(X_full_log, y_full)

y_train_log_log_clf_pred = log_log_clf.predict(X_train_log)
y_test_log_log_clf_pred = log_log_clf.predict(X_test_log)
y_full_log_log_clf_pred = log_log_clf.predict(X_full_log)
y_validate_log_log_clf_pred = log_log_clf.predict(X_validate_log)

print('log_log_clf train', f1_score(y_train_log, y_train_log_log_clf_pred))
print('log_log_clf test ', f1_score(y_test_log, y_test_log_log_clf_pred))
print('log_log_clf full: ', f1_score(y_full, y_full_log_log_clf_pred))

log_log_clf train 0.9961404715503076
log_log_clf test  0.9951025405570859
log_log_clf full:  0.9959330948231049


# Итого

In [222]:
# А теперь слепим Ту-Самую-Классификацию

y_validate = y_validate_rf_log_clf_pred
#y_validate = y_validate_log_log_clf_pred

final_df = pd.DataFrame({'id' : validate_df_m['id'], 'target' : y_validate})
print(final_df)
final_df.to_csv('final.csv', index=False)

            id  target
0       135309   False
1       135310   False
2       135311   False
3       135312    True
4       135313   False
...        ...     ...
165373  300682   False
165374  300683   False
165375  300684   False
165376  300685   False
165377  300686   False

[165378 rows x 2 columns]
