In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer # Работает только с английскими словами
from pymystem3 import Mystem
import pymorphy2
import re

In [2]:
import nltk

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/timofey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/timofey/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Загрузка данных
train_data = pd.read_csv('train.csv')

In [339]:
train_data

Unnamed: 0,id,url,title,target
0,0,m.kp.md,"Экс-министр экономики Молдовы - главе МИДЭИ, ц...",False
1,1,www.kp.by,Эта песня стала известна многим телезрителям б...,False
2,2,fanserials.tv,Банши 4 сезон 2 серия Бремя красоты смотреть о...,False
3,3,colorbox.spb.ru,Не Беси Меня Картинки,False
4,4,tula-sport.ru,В Новомосковске сыграют следж-хоккеисты алекси...,False
...,...,...,...,...
135304,135304,mail.ru,пора тюльпанов турецкий сериал на русском язык...,False
135305,135305,www.ntv.ru,Остросюжетный сериал «Шеф. Игра на повышение»....,False
135306,135306,topclassiccarsforsale.com,"1941 Plymouth Special Deluxe Hot Rod, Automati...",False
135307,135307,wowcream.ru,Купить It's Skin Сыворотка питательная Power 1...,False


In [178]:
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize("бегущие")

'бегущие'

In [179]:
lemmatizer = pymorphy2.MorphAnalyzer()
lemmatizer.parse("бегущие")[0].normal_form

'бежать'

In [180]:
m = Mystem()
''.join(m.lemmatize("бегущие")).strip()

'бежать'

In [340]:
stop_words = set(stopwords.words('russian'))
# stop_words = set(stopwords.words('russian')) | set(stopwords.words('english'))

def clean_data(data):
    data.drop_duplicates(inplace=True)
    data.dropna(inplace=True)
    data['title'] = data['title'].apply(lambda x: x.lower())
    # Удаляем все, кроме слов и цифр с пробелами
    data['title'] = data['title'].apply(lambda x: re.sub(r'[^\w\s]', '', x)) 
    data['title'] = data['title'].apply(lambda x: re.sub(r'\d+', '', x)) 

    return data

### Разные лемматизации

In [341]:
def WordNetLemmatize(train_data):
    lemmatizer = WordNetLemmatizer()
    return train_data.apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split() if word not in stop_words]))

In [342]:
def MystemLemmatize(train_data):
    # Инициализация объекта для лемматизации
    mystem = Mystem()

    def preprocess_text(text):
        # Лемматизация
        lemmas = mystem.lemmatize(text)
        # Удаление стоп-слов
        lemmas = [lemma.strip() for lemma in lemmas if lemma not in stop_words]
        # Склеивание лемм в одну строку
        text = ' '.join(lemmas)
        return text

    return train_data.apply(lambda text: preprocess_text(text))

In [343]:
def pymorphy2Lemmatize(train_data):
    lemmatizer = pymorphy2.MorphAnalyzer()

    return train_data.apply(
        lambda x: ' '.join([lemmatizer.parse(word)[0].normal_form for word in x.split() if word not in stop_words])
    )

In [344]:
train_data = clean_data(train_data)

In [345]:
train_data['title'] = pymorphy2Lemmatize(train_data['title'])

### Преобразование текста в числа

In [308]:
# train_data['text'] = train_data['title'].apply(lambda x: x + ' ') + train_data['url']
# Преобразование текста в числа
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(train_data['title'])
y = train_data['target']

# Разбиение данных на обучающую и валидационную выборки
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [188]:
feature_names = np.array(list(vectorizer.vocabulary_.keys()))
print(feature_names)

['эксминистр' 'экономика' 'молдова' ... 'blockpower' 'effector' 'wowcream']


In [143]:
id_ = 42

print(X_train[id_])

x_vector = X.getrow(id_).toarray()[0]

[feature for feature in feature_names[x_vector > 0]]

Презентация на тему "Приближенное значение. Абсолютная и относительная погрешнос


['гидромассажный',
 'olgafisakovamailru',
 'юнитекс',
 'размешать',
 'alvares',
 'grou',
 'кренк']

In [347]:
# Обучение модели
model = LogisticRegression()
model.fit(X_train, y_train)

# Оценка качества модели на валидационной выборке

In [310]:
y_pred = model.predict(X_train)
score = f1_score(y_train, y_pred)
print("F1-score на тестовой выборке: {:.3f}".format(score))

y_pred_val = model.predict(X_val)
score = f1_score(y_val, y_pred_val)
print("F1-score на валидационной выборке: {:.3f}".format(score))

F1-score на тестовой выборке: 0.989
F1-score на валидационной выборке: 0.969


## Соединяем обработки текста в отдельную функцию

In [245]:
def train_model(train_data, with_report = False,
                model=LogisticRegression(),
                Lemmatizer = None, 
                stop_words = set(stopwords.words('russian')),
               ):
    # Очищаем
    train_data = clean_data(train_data)
    # Лемматизируем
    if (Lemmatizer is not None):
        train_data['title'] = Lemmatizer(train_data['title'])
    # Приводим к числам
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(train_data['title'])
    y = train_data['target']
    
    # Разбиение данных на обучающую и валидационную выборки
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Обучение модели
    model.fit(X_train, y_train)
    
    if (with_report):
        y_pred = model.predict(X_train)
        score = f1_score(y_train, y_pred)
        print("F1-score на тестовой выборке: {:.3f}".format(score))

        y_pred_val = model.predict(X_val)
        score = f1_score(y_val, y_pred_val)
        print("F1-score на валидационной выборке: {:.3f}".format(score))
    
    return model

In [256]:
model_pymorphy2Lemmatize = train_model(train_data, with_report=True, Lemmatizer=pymorphy2Lemmatize, stop_words=stop_words_merged)

F1-score на тестовой выборке: 0.971
F1-score на валидационной выборке: 0.945


In [257]:
model_pymorphy2Lemmatize = train_model(train_data, with_report=True, Lemmatizer=pymorphy2Lemmatize)

F1-score на тестовой выборке: 0.971
F1-score на валидационной выборке: 0.945


In [258]:
model_MystemLemmatize = train_model(train_data, with_report=True, Lemmatizer=MystemLemmatize)

F1-score на тестовой выборке: 0.971
F1-score на валидационной выборке: 0.945


In [238]:
from nltk.stem import WordNetLemmatizer
model_WordNetLemmatize = train_model(train_data, with_report=True, Lemmatizer=WordNetLemmatize)

F1-score на тестовой выборке: 0.971
F1-score на валидационной выборке: 0.939


In [246]:
model_without_lemmatize = train_model(train_data, with_report=True)

F1-score на тестовой выборке: 0.971
F1-score на валидационной выборке: 0.939


## Попробуем трансформеры

In [18]:
from transformers import DistilBertTokenizerFast, DistilBertModel
import torch

# Инициализация токенизатора и модели
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Обработка текста и получение эмбеддингов
text = "This is a sample text to be embedded."
inputs = tokenizer(text, return_tensors='pt', truncation=True)
outputs = model(**inputs)
embeddings = outputs.last_hidden_state


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
from transformers import DistilBertTokenizerFast, DistilBertModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import pandas as pd

# Загрузка данных
train_data = pd.read_csv('train.csv')

# Инициализация токенизатора и модели
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [25]:
# Обработка текста и получение эмбеддингов
i = 0
X = []
for text in train_data['title']:
    inputs = tokenizer(text, return_tensors='pt', truncation=True)
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
    X.append(embeddings.detach().numpy())
    if (i%1000 == 0):
        print(f"Iteration {i} /",train_data['title'].shape[0], f"{i/train_data['title'].shape[0]}%")
    i+=1
X = pd.DataFrame(X)

Iteration 0 / 135309 0.0%


In [17]:
X.to_csv('embeddings.csv')

In [83]:
import os
import pandas as pd

# Чтение из файла, если файл существует
if os.path.isfile('embeddings.csv'):
    X = pd.read_csv('embeddings.csv', index_col=0)
else:
    X = pd.DataFrame()

In [104]:
start_idx = X.shape[0]
# Обработка текста и получение эмбеддингов
for i, text in enumerate(train_data['title'][start_idx:start_idx+20]):
    inputs = tokenizer(text, return_tensors='pt', truncation=True)
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
#     X = pd.DataFrame(embeddings.detach().numpy()).T
    X.loc[i] = embeddings.detach().numpy()
    if (i % 10 == 0):
        print(f"Iteration {i+start_idx} / {train_data['title'].shape[0]} ({(i+start_idx)/train_data['title'].shape[0]*100:.2f}%)")
        # Запись в файл на каждой 1000-й итерации
        X.to_csv('embeddings.csv')        
# Запись оставшихся данных в файл
X.to_csv('embeddings.csv')

Iteration 20 / 135309 (0.01%)
Iteration 30 / 135309 (0.02%)


In [105]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.130314,0.005657,0.108823,-0.04022,0.579633,0.14707,-0.058492,0.342769,-0.002201,-0.022413,...,0.040137,-0.315638,0.343903,-0.47185,0.218975,-0.796071,-0.057456,-0.18864,0.610182,0.159787
1,-0.077997,-0.103543,0.090305,0.03143,0.561107,0.140605,-0.112423,0.485146,-0.077454,-0.183468,...,0.094906,-0.461445,0.184997,-0.397397,0.094669,-0.684273,0.087932,-0.117875,0.593566,0.211949
2,-0.11348,-0.040033,0.061344,0.031506,0.671913,0.149452,-0.187717,0.498494,-0.027705,-0.027285,...,-0.041938,-0.346197,0.43911,-0.33425,0.187062,-0.893234,-0.104641,-0.212868,0.586935,0.105043
3,-0.056768,0.088331,-0.115944,-0.165607,0.203022,-0.086711,0.010639,0.257392,-0.011821,-0.039709,...,-0.034242,-0.12922,-0.052545,-0.281699,0.073281,-0.232753,-0.129449,-0.238482,0.194678,0.286459
4,-0.117689,-0.098321,0.058717,-0.034988,0.52678,0.208661,-0.046965,0.394215,-0.02243,-0.163613,...,0.053588,-0.463544,0.278821,-0.391382,0.173528,-0.838697,0.01465,-0.14631,0.666737,0.055644
5,-0.069972,0.002838,0.060972,-0.085899,0.574031,0.206851,0.043749,0.470226,0.124416,-0.155512,...,-0.092336,-0.307095,0.250008,-0.240241,0.08865,-0.902267,-0.14519,-0.374592,0.426418,-0.00596
6,-0.152178,-0.152408,-0.098346,-0.037491,0.598458,0.094544,-0.11698,0.492469,-0.048189,-0.228956,...,0.027289,-0.398486,0.319981,-0.416669,0.174291,-0.646413,0.021024,-0.254155,0.449767,-0.030907
7,0.07297,-0.154533,-0.03714,-0.067041,0.580667,0.242237,-0.083023,0.591747,-0.061652,-0.092554,...,-0.084779,-0.512956,0.344828,-0.356376,0.157623,-1.023206,-0.009973,-0.232336,0.558031,0.019852
8,-0.091318,-0.079259,0.030702,-0.021458,0.611106,0.209741,-0.001717,0.541909,-0.068774,-0.126115,...,-0.020999,-0.515429,0.286397,-0.42351,0.096842,-0.87514,0.036671,-0.235448,0.606358,0.057077
9,-0.170166,0.054747,0.195478,-0.050119,0.495358,0.064851,-0.025543,0.257664,0.093114,-0.049816,...,-0.131567,-0.283757,0.450232,-0.444589,0.209589,-1.037264,-0.196315,-0.21121,0.611195,0.081018


In [7]:
# Обучение модели
y = train_data['target'][:10000]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)

# Оценка качества модели
y_pred_train = model.predict(X_train)
y_pred_val = model.predict(X_val)
f1_train = f1_score(y_train, y_pred_train)
f1_val = f1_score(y_val, y_pred_val)
print('F1-score на тренировочной выборке:', f1_train)
print('F1-score на валидационной выборке:', f1_val)

F1-score на тренировочной выборке: 0.7992678462477121
F1-score на валидационной выборке: 0.6945812807881773


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [306]:
test_df = pd.read_csv("test.csv")

False    148569
True      16809
Удаление повторных слов:
False    148499
True      16879
Добавление url в список слов: при этом keggle стал хуже
False    148878
True      16500

### Submit

In [349]:
# X_test_vectorized = vectorizer.transform(test_df["title"].values)
X_test_vectorized = vectorizer.transform(test_df["title"])

test_df["target"] = model.predict(X_test_vectorized).astype(bool)

counts = test_df["target"].value_counts()
print(counts)

test_df[["id", "target"]].to_csv("ml_baseline.csv", index=False)

!cat ml_baseline.csv | head

False    148878
True      16500
Name: target, dtype: int64
id,target
135309,False
135310,False
135311,False
135312,True
135313,False
135314,False
135315,False
135316,False
135317,False
cat: write error: Broken pipe
