In [1]:
import numpy as np
import pandas as pd
import re
import pymorphy2
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
import nltk
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
import scipy as sp

nltk.download('stopwords')
stopWords = stopwords.words('russian')

train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
train_df.drop_duplicates(inplace=True)
test_df.drop_duplicates(inplace=True)

[nltk_data] Downloading package stopwords to /home/sait/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
morph = pymorphy2.MorphAnalyzer()

def tokenize_url(url:str):   
    url = re.sub("(\W|_)+"," ",url)
    return url

def pymorphy_text(text:str):
    lst = text.split()
    lst_new = [morph.parse(str(word))[0].normal_form for word in lst]
    new_text = ' '.join(lst_new)
    return new_text

train_df['tokenized_url']=train_df['url'].apply(lambda x: tokenize_url(x))
train_df['text'] = train_df['tokenized_url'] + ' ' + train_df['title']
train_df['text']=train_df['text'].apply(lambda x:pymorphy_text(x))

test_df['tokenized_url']=test_df['url'].apply(lambda x: tokenize_url(x))
test_df['text'] = test_df['tokenized_url'] + ' ' + test_df['title']
test_df['text']=test_df['text'].apply(lambda x: pymorphy_text(x))

stopWords = [morph.parse(str(word))[0].normal_form for word in stopWords]

In [3]:
# Тестовое разделение
# x_train, x_test, y_train, y_test = train_test_split(train_df["text"].values, train_df["target"].values, test_size=0.35, random_state=42)
# url_train, url_test, _, _ = train_test_split(train_df["url"].values, train_df["target"].values, test_size=0.35, random_state=42)

In [3]:
# Жизненное разделение 
x_train, y_train, x_test = train_df["text"].values, train_df["target"].values, test_df["text"].values
url_train, url_test = train_df["url"].values, test_df['url'].values

In [4]:
vectorizer = CountVectorizer(min_df = 3, analyzer='word', strip_accents='unicode',
                                                 encoding='utf-16', stop_words = stopWords)
tfidf_transformer = TfidfTransformer(sublinear_tf=True)
x_train_v = vectorizer.fit_transform(x_train)
x_train_t = tfidf_transformer.fit_transform(x_train_v)
all_in_SelectKBest = x_train_t.shape[1]

  'stop_words.' % sorted(inconsistent))


In [5]:
from sklearn import preprocessing

def get_additional_features(data):
    
    
    a5 = np.asarray([[i.count('https') for i in data]])
    a6 = np.asarray([[i.count('xxx') for i in data]])
    a7 = np.asarray([[i.count('devk') for i in data]])
    a8 = np.asarray([[i.count('hd') for i in data]])
    a9 = np.asarray([[i.count('club') for i in data]])
    a10 = np.asarray([[i.count('hub') for i in data]])
    a11 = np.asarray([[i.count('sex') for i in data]])
    a12 = np.asarray([[i.count('girl') for i in data]])
    a13 = np.asarray([[i.count('seks') for i in data]])
    a14 = np.asarray([[i.count('ebl') for i in data]])
    a15 = np.asarray([[i.count('video') for i in data]])
    a16 = np.asarray([[i.count('24') for i in data]])

    mas = np.concatenate([a5, a6, a7, a8, a9, a10, a11, a12,
                           a13, a14, a15, a16]).T
    return preprocessing.normalize(mas)

In [6]:
# LogisticRegression

class MyTrasform:
    train = 0
    def transform(self, X, **fit_params):
        MyTrasform.train += 1
        xx=get_additional_features(url_train if MyTrasform.train == 1 else url_test)
        new_X = X
        new_X = sp.sparse.hstack((new_X, xx))
        return new_X

    def fit_transform(self, X, y=None,  **fit_params):
        self.fit(X, y,  **fit_params)
        return self.transform(X)

    def fit(self, X, y, **fit_params):
        return self 
    
    
text_clf_LR = Pipeline([('vect', CountVectorizer(min_df = 3, analyzer='word', strip_accents='unicode',
                                                 encoding='utf-16', stop_words = stopWords)),
                      ('tfidf', TfidfTransformer(sublinear_tf=True)),
                      ('add_url', MyTrasform()),
                      ('feat_select', SelectKBest(chi2, k=int(all_in_SelectKBest * 0.97))),
                      ('clf', LogisticRegression(dual = True, verbose=3, solver='liblinear',
                                                 random_state=0, C=83, penalty='l2', max_iter=10000)),
])


text_clf_LR.fit(x_train, y_train)
predicted_LR = text_clf_LR.predict(x_test)
# print(f1_score(y_test, predicted_LR))

  'stop_words.' % sorted(inconsistent))


[LibLinear]1


In [7]:
# Выводим результаты:
predicted_LR = text_clf_LR.predict(x_test)
test_df["target"] = predicted_LR.astype(bool)
test_df[["id", "target"]].to_csv("ml_baseline.csv", index=False)
!cat ml_baseline.csv | head

id,target
135309,False
135310,False
135311,False
135312,True
135313,False
135314,False
135315,False
135316,False
135317,False
cat: ошибка записи: Обрыв канала
