## pymorphy2 + tokenize_normalize + Pipeline(CountVectorizer, TfidfTransformer, SGDClassifier) + grid_search

In [1]:
MODE='+'

In [2]:
import numpy as np
import pandas as pd

In [35]:
train_df = pd.read_csv("train.csv")
train_df['title'] = train_df['title'].str.lower()
train_df['url'] = train_df['url'].str.lower()
train_df['url'].fillna(train_df['title'], inplace=True)
train_df['n'] = train_df['url'] + ' ' +  train_df['title']
train_df.head()

Unnamed: 0,id,url,title,target,n
0,0,m.kp.md,"экс-министр экономики молдовы - главе мидэи, ц...",False,m.kp.md экс-министр экономики молдовы - главе ...
1,1,www.kp.by,эта песня стала известна многим телезрителям б...,False,www.kp.by эта песня стала известна многим теле...
2,2,fanserials.tv,банши 4 сезон 2 серия бремя красоты смотреть о...,False,fanserials.tv банши 4 сезон 2 серия бремя крас...
3,3,colorbox.spb.ru,не беси меня картинки,False,colorbox.spb.ru не беси меня картинки
4,4,tula-sport.ru,в новомосковске сыграют следж-хоккеисты алекси...,False,tula-sport.ru в новомосковске сыграют следж-хо...


In [4]:
test_df = pd.read_csv("test.csv")
test_df['title'] = test_df['title'].str.lower()
test_df['url'] = test_df['url'].str.lower()

test_df.head()

Unnamed: 0,id,url,title
0,135309,www.kommersant.ru,шестой кассационный суд в самаре начнет работу...
1,135310,urexpert.online,"что такое индексация алиментов, кем и в каких ..."
2,135311,imperimeha.ru,женщинам | империя меха - part 12
3,135312,national-porn.com,"небритые, волосатые киски: порно всех стран и ..."
4,135313,2gis.ru,67


In [24]:
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer


In [25]:
import pymorphy2
import re

morph = pymorphy2.MorphAnalyzer()
retoken = re.compile(r'[\'\w\-]+')

In [26]:
def tokenize_normalize(text):
    text = retoken.findall(text.lower())
    text = [morph.parse(x)[0].normal_form for x in text]
    return ' '.join(text)

In [8]:
train_df['title'] = train_df['title'].astype(str)
train_df['url'] = train_df['url'].astype(str)
train_df['n'] = train_df['n'].astype(str)
train_df.head()

Unnamed: 0,id,url,title,target
0,0,m.kp.md,"экс-министр экономики молдовы - главе мидэи, ц...",False
1,1,www.kp.by,эта песня стала известна многим телезрителям б...,False
2,2,fanserials.tv,банши 4 сезон 2 серия бремя красоты смотреть о...,False
3,3,colorbox.spb.ru,не беси меня картинки,False
4,4,tula-sport.ru,в новомосковске сыграют следж-хоккеисты алекси...,False


In [27]:
%%time
train_df['n_norm'] = train_df['n'].apply(tokenize_normalize)

CPU times: user 4min 46s, sys: 613 ms, total: 4min 47s
Wall time: 4min 48s


In [20]:
%%time
train_df['url_norm'] = train_df['url'].apply(tokenize_normalize)

CPU times: user 8.54 s, sys: 35.5 ms, total: 8.58 s
Wall time: 8.69 s


In [21]:
%%time
test_df['title_norm'] = test_df['title'].apply(tokenize_normalize)

CPU times: user 5min 44s, sys: 390 ms, total: 5min 44s
Wall time: 5min 45s


In [22]:
%%time
test_df['url_norm'] = test_df['url'].apply(tokenize_normalize)

CPU times: user 11.5 s, sys: 12.1 ms, total: 11.5 s
Wall time: 11.6 s


In [23]:
train_df.head()

Unnamed: 0,id,url,title,target,title_norm,url_norm
0,0,m.kp.md,"экс-министр экономики молдовы - главе мидэи, ц...",False,экс-министр экономика молдова - глава мидэя це...,m kp md
1,1,www.kp.by,эта песня стала известна многим телезрителям б...,False,этот песня стать известный многий телезритель ...,www kp by
2,2,fanserials.tv,банши 4 сезон 2 серия бремя красоты смотреть о...,False,банша 4 сезон 2 серия бремя красота смотреть о...,fanserials tv
3,3,colorbox.spb.ru,не беси меня картинки,False,не бесить я картинка,colorbox spb ru
4,4,tula-sport.ru,в новомосковске сыграют следж-хоккеисты алекси...,False,в новомосковск сыграть следж-хоккеист алексинс...,tula-sport ru


In [24]:
test_df.head()

Unnamed: 0,id,url,title,title_norm,url_norm
0,135309,www.kommersant.ru,шестой кассационный суд в самаре начнет работу...,шесть кассационный суд в самар начать работа в...,www kommersant ru
1,135310,urexpert.online,"что такое индексация алиментов, кем и в каких ...",что такой индексация алименты кто и в какой сл...,urexpert online
2,135311,imperimeha.ru,женщинам | империя меха - part 12,женщина империя мех - part 12,imperimeha ru
3,135312,national-porn.com,"небритые, волосатые киски: порно всех стран и ...",небритый волосатый киска порно весь страна и н...,national-porn com
4,135313,2gis.ru,67,67,2gis ru


In [37]:
if MODE == '+':
    X_train, X_test, y_train, y_test = train_test_split(
    train_df[["n"]], train_df["target"].astype(int).values, test_size=0.2, random_state=42)
else:
    X_train = train_df[["url_norm", "title_norm"]]
    y_train = train_df["target"].astype(int).values
    X_test = test_df[["url_norm", "title_norm"]]

In [38]:
X_train.shape



(108247, 1)

In [39]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier( penalty= None),)
                    ])

In [41]:
a = X_train["n"].values

b = X_test["n"].values

aaaaaa = text_clf.fit(a, y_train)
predicted = aaaaaa.predict(b)

In [42]:
print(f1_score(predicted, y_test))
print(metrics.classification_report(predicted, y_test))

0.9645347074871173


NameError: name 'metrics' is not defined

In [32]:
aa = X_train["url_norm"].values
bb = X_test["url_norm"].values


predicted_url = text_clf.fit(aa, y_train).predict(bb)

In [33]:
print(f1_score(predicted_url, y_test))
print(metrics.classification_report(predicted_url, y_test))


0.8747130206625122
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     24304
           1       0.80      0.97      0.87      2758

    accuracy                           0.97     27062
   macro avg       0.90      0.97      0.93     27062
weighted avg       0.98      0.97      0.97     27062



In [34]:
y_pred = [int("webcam" in url or "porno" in url or "xxx" in url or 'sex' in url ) for url in bb]
y_pred1 = [int("мастурбировать" in text or "пенис" in text or "проститутка" in text or "елдак" in text or "ебёт" in text or "член в" in text or "выебал" in text or "xxx" in text and not "xxxtentac" in text or "porn" in text or "sex" in text) for text in b]
print(sum(y_pred))
print(sum(y_pred1))

1110
1475


In [35]:
print(f1_score(y_pred, y_test))
print(metrics.classification_report(y_pred, y_test))

0.49617977528089885
              precision    recall  f1-score   support

           0       1.00      0.91      0.95     25952
           1       0.33      0.99      0.50      1110

    accuracy                           0.92     27062
   macro avg       0.67      0.95      0.73     27062
weighted avg       0.97      0.92      0.94     27062



In [36]:
print(f1_score(y_pred1, y_test)) 
print(metrics.classification_report(y_pred1, y_test))

0.6076843198338526
              precision    recall  f1-score   support

           0       1.00      0.93      0.96     25587
           1       0.44      0.99      0.61      1475

    accuracy                           0.93     27062
   macro avg       0.72      0.96      0.78     27062
weighted avg       0.97      0.93      0.94     27062



In [37]:
print(f1_score(predicted_url | predicted | y_pred | y_pred1, y_test))
print(metrics.classification_report(predicted_url | predicted | y_pred | y_pred1, y_test))


0.9640140433001754
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     23566
           1       0.99      0.94      0.96      3496

    accuracy                           0.99     27062
   macro avg       0.99      0.97      0.98     27062
weighted avg       0.99      0.99      0.99     27062



In [38]:
print(f1_score(predicted | ((np.array(predicted_url)+np.array(y_pred))//2) | ((np.array(y_pred)+np.array(y_pred1))//2) | ((np.array(predicted_url)+np.array(y_pred1))//2), y_test))
print(metrics.classification_report(predicted | ((np.array(predicted_url)+np.array(y_pred))//2) | ((np.array(y_pred)+np.array(y_pred1))//2) | ((np.array(predicted_url)+np.array(y_pred1))//2), y_test))



0.9598315029336543
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     23755
           1       0.96      0.96      0.96      3307

    accuracy                           0.99     27062
   macro avg       0.98      0.98      0.98     27062
weighted avg       0.99      0.99      0.99     27062



In [2]:
pd.options.display.max_rows = 181

X_test[(predicted_url | predicted | y_pred | y_pred1 != y_test)]

In [40]:
print(metrics.classification_report(predicted_url | predicted | y_pred | y_pred1, y_test))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     23566
           1       0.99      0.94      0.96      3496

    accuracy                           0.99     27062
   macro avg       0.99      0.97      0.98     27062
weighted avg       0.99      0.99      0.99     27062



In [None]:
test_df["target"] = (predicted_url | predicted | y_pred | y_pred1).astype(bool)

test_df[["id", "target"]].to_csv("Normalize_SGDC_penalty_None.csv", index=False)

In [None]:
answer = pd.read_csv("Normalize_SGDC_penalty_None.csv")
answer.shape

In [30]:
from sklearn import metrics

In [None]:
%%time
for penalty_p in [ 'l2', 'l1', 'elasticnet', 'None']:
    for loss_p in ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']:
                text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 3))),
                                 ('tfidf', TfidfTransformer()),
                                 ('MNB',  SGDClassifier( n_jobs=-1, penalty=penalty_p, loss = loss_p) ),
                ])
                a = X_train["title_norm"].values
                b = X_test["title_norm"].values
                aaaaaa = text_clf.fit(a, y_train)
                predicted = aaaaaa.predict(b)
                print('penalty =',penalty_p,' loss =' ,loss_p, ' - ', f1_score(predicted, y_test))
                print(metrics.classification_report(predicted, y_test))

# MNB

In [41]:
from sklearn.naive_bayes import MultinomialNB

In [42]:
text_MNB = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('MNB',  MultinomialNB(alpha = 0.115),) #0.9532083523853071
                    ])

In [43]:
a_MNB = X_train["title_norm"].values

b_MNB = X_test["title_norm"].values

aaaaaa_MNB = text_MNB.fit(a_MNB, y_train)
predicted_MNB = aaaaaa_MNB.predict(b_MNB)
aaaaaa_MNB

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('MNB',
                 MultinomialNB(alpha=0.115, class_prior=None, fit_prior=True))],
         verbose=False)

In [44]:
print(f1_score(predicted_MNB, y_test))

0.9202417809228953


## Обучение на двух колонках


In [239]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer



In [240]:
def first_column(X):
    return X.iloc[:, 0]

def second_column(X):
    return X.iloc[:, 1]



In [241]:
# pipeline to get all tfidf and word count for first column
pipeline_one = Pipeline([
    ('column_selection', FunctionTransformer(first_column, validate=False)),
    ('feature-extractors', FeatureUnion([('tfidf', TfidfVectorizer()),
                                        ('counts', CountVectorizer())

    ]))
])

# Then a second pipeline to do the same for the second column
pipeline_two = Pipeline([
    ('column_selection', FunctionTransformer(second_column, validate=False)),
    ('feature-extractors', FeatureUnion([('tfidf', TfidfVectorizer()),
                                        ('counts', CountVectorizer())

    ]))
])


In [1]:
# Then you would again feature union these pipelines 
# to get different feature selection for each column
final_transformer = FeatureUnion([('first-column-features', pipeline_one),
                                  ('second-column-feature', pipeline_two),
                                  ('clf', SGDClassifier()),
                                 ])

In [244]:
final_transformer.fit_transform(X_train)

<108247x309430 sparse matrix of type '<class 'numpy.float64'>'
	with 2451606 stored elements in Compressed Sparse Row format>

In [3]:
predicted_S = final_transformer.predict(X_test)