https://www.kaggle.com/competitions/nlp-getting-started

# Проект: "предсказание типа комментариев: о реальных катастрофах и нереальных катастрофах."

# Импортирование библиотек

In [134]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

import re

import nltk
from collections import defaultdict

from nltk.corpus import stopwords
nltk.download('stopwords')
stop = set(stopwords.words('english'))

import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Maxim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Предварительный анализ и подготовка данных

In [135]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train = train.drop(columns = ['keyword', 'location'])
test = test.drop(columns = ['keyword', 'location'])

df = pd.concat([train, test])

In [136]:
df.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1.0
1,4,Forest fire near La Ronge Sask. Canada,1.0
2,5,All residents asked to 'shelter in place' are ...,1.0
3,6,"13,000 people receive #wildfires evacuation or...",1.0
4,7,Just got sent this photo from Ruby #Alaska as ...,1.0


In [137]:
print('Форма train: {}'.format(train.shape))
print('Форма test: {}'.format(test.shape))
print('Форма df=train+test: {}'.format(df.shape))

Форма train: (7613, 3)
Форма test: (3263, 2)
Форма df=train+test: (10876, 3)


In [138]:
print('{}'.format(train.info()), end = '\n\n')
print('{}'.format(test.info()))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      7613 non-null   int64 
 1   text    7613 non-null   object
 2   target  7613 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 178.6+ KB
None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      3263 non-null   int64 
 1   text    3263 non-null   object
dtypes: int64(1), object(1)
memory usage: 51.1+ KB
None


Тестовая выборка меток не имеет, поэтому когда будем анализировать все данные без учета меток будем анализировать df = train + test, когда будет происходить анализ с учетом меток(например, их распределение), то будем использовать только train подвыборку. Пропусков в данных нету.

In [139]:
df.drop_duplicates()
print(f"Размер данных после удаления дубликатов:", df.shape)

Размер данных после удаления дубликатов: (10876, 3)


In [140]:
print(f'Всего твитов в данных:', df.shape[0])
print('Всего уникальных пользователей в данных: {}'.format(df['id'].nunique()))

Всего твитов в данных: 10876
Всего уникальных пользователей в данных: 10876


In [141]:
print('Всего меток твитов в данных: {}'.format(len(train['target'].unique())))
print('Уникальные метки твитов: {}'.format(train['target'].unique()), end = '\n\n')
print('Количество меток каждого типа твитов:\n{}'.format(train['target'].value_counts()))

Всего меток твитов в данных: 2
Уникальные метки твитов: [1 0]

Количество меток каждого типа твитов:
0    4342
1    3271
Name: target, dtype: int64


Количество меток каждого типа твитов резко не отличаются, баланс классов примерно соблюдается.

# Предобработка данных

Итак, имеется исходный датафрейм df, в котором признак text соответствует твиту определенного пользователя, то есть это тип данных object, прежде чем приступить к работе с этими твитами, нужно нормализовать текст и каждый твит представить в виде набора нормализованных слов, а также убрать слишком часто встречающиеся слова и слишком редкие слова. 

## Понижение регистра слов

In [150]:
train['text'] = train['text'].str.lower()
test['text'] = test['text'].str.lower()

## Исключение бесполезных слов и других бесполезных конструкций

### Исключение чисел и цифр

In [151]:
def number_remove(text):
    line = re.sub('\d+', '', text)
    return line

train['new_text'] = train['text'].apply(lambda x: number_remove(x))
test['new_text'] = test['text'].apply(lambda x: number_remove(x))

### Исключение ссылок на сайты и html ссылок

In [152]:
def remove_urls(text):
    url_remove = re.compile(r'https?://\S+|www\.\S+')
    return url_remove.sub(r'', text)
train['new_text'] = train['new_text'].apply(lambda x:remove_urls(x))
test['new_text'] = test['new_text'].apply(lambda x:remove_urls(x))

def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

train['new_text']=train['new_text'].apply(lambda x:remove_html(x))
test['new_text']=test['new_text'].apply(lambda x:remove_html(x))

### Исключение пунктуационных знаков

In [153]:
def punct_remove(text):
    punct = re.sub(r"[^\w\s\d]","", text)
    return punct

train['new_text']=train['new_text'].apply(lambda x:punct_remove(x))
test['new_text']=test['new_text'].apply(lambda x:punct_remove(x))

### Исключение стоп-слов(слов, которые почти не характеризуют специфику конкретного текста)

In [154]:
from nltk.corpus import stopwords
nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

train['new_text']=train['new_text'].apply(lambda x:remove_stopwords(x))
test['new_text']=test['new_text'].apply(lambda x:remove_stopwords(x))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Maxim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Исключение хэштегов(#) и отсылок(@) в твитах

In [155]:
def remove_hash(x):
    text=re.sub(r'#\w+','',x)
    return text
train['new_text']=train['new_text'].apply(lambda x:remove_hash(x))
test['new_text']=test['new_text'].apply(lambda x:remove_hash(x))

def remove_mention(x):
    text=re.sub(r'@\w+','',x)
    return text

train['new_text']=train['new_text'].apply(lambda x:remove_mention(x))
test['new_text']=test['new_text'].apply(lambda x:remove_mention(x))

### Удаление "длинных пробелов"(в случае, если при обработке таковые возникли)

In [156]:
def remove_space(text):
    space_remove = re.sub(r"\s+"," ", text).strip()
    return space_remove

train['new_text']=train['new_text'].apply(lambda x:remove_space(x))
test['new_text']=test['new_text'].apply(lambda x:remove_space(x))

train = train.drop(columns=['text']) 
test = test.drop(columns=['text']) 

## Построение словаря

In [157]:
def build_dictionary(texts):
    
    dictionary = {}
    idx = 0
    
    for text in texts:
        for word in text.split():
            if word not in dictionary.keys():
                dictionary[word] = idx
                idx += 1
                
    return dictionary

In [158]:
def max_sequence_len(texts):
    max_len = 0
    
    for text in texts:
        if len(text.split()) > max_len:
            max_len = len(text.split())
            
    return max_len

In [159]:
dictionary = build_dictionary(train['new_text'])
max_seq_len = max_sequence_len(train['new_text'])

print(f'Размер словаря: {len(dictionary)}')
print(f'Максимальное количество слов в предложении: {max_seq_len}')

Размер словаря: 17110
Максимальное количество слов в предложении: 23


# Построение предсказательной модели

In [160]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [161]:
class CustomDataset(Dataset):
    
    @staticmethod
    def tokenizer(text, max_seq_len, dictionary):
        
        sequences = []
        for x in text:
            sequence = [0] * max_seq_len
            for idx, word in enumerate(x.split()):
                sequence[idx] = dictionary[word]
            sequences.append(sequence)
            
        return np.array(sequences)
    
    def __init__(self, text, target, max_seq_len, dictionary):
        self.x = self.tokenizer(text, max_seq_len, dictionary)
        self.y = target.to_numpy()
        
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

In [162]:
X_train, X_test, y_train, y_test = train_test_split(train["new_text"], train["target"], test_size=0.3, stratify=train["target"])
X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.5)

In [163]:
BATCH_SIZE = 32

train = CustomDataset(X_train, y_train, max_seq_len, dictionary)
valid = CustomDataset(X_valid, y_valid, max_seq_len, dictionary)
test = CustomDataset(X_test, y_test, max_seq_len, dictionary)

train_loader = DataLoader(train, batch_size=BATCH_SIZE, shuffle=False)
valid_loader = DataLoader(valid, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test, batch_size=BATCH_SIZE, shuffle=False)

In [164]:
class TextClassifier(nn.ModuleList):
    
    """
    LSTM Network definition
    """
    
    def __init__(self, BATCH_SIZE, INPUT_SIZE, HIDDEN_DIM, LSTM_LAYERS, DROPOUT):
        super(TextClassifier, self).__init__()

        self.batch_size = BATCH_SIZE
        self.hidden_dim = HIDDEN_DIM
        self.LSTM_layers = LSTM_LAYERS
        self.input_size = INPUT_SIZE

        self.d = nn.Dropout(DROPOUT)
        
        self.out_act = torch.nn.Sigmoid()
        
        self.embedding = nn.Embedding(num_embeddings=self.input_size, embedding_dim=self.hidden_dim, padding_idx=0)
        self.lstm = nn.LSTM(input_size=self.hidden_dim, hidden_size=self.hidden_dim, num_layers=self.LSTM_layers, batch_first=True)
        self.fc1 = nn.Linear(in_features=self.hidden_dim, out_features=64)
        self.fc2 = nn.Linear(64, 1)
    

    def forward(self, x):

        h = torch.zeros((self.LSTM_layers, x.size(0), self.hidden_dim)).to(device)
        c = torch.zeros((self.LSTM_layers, x.size(0), self.hidden_dim)).to(device)
        
        torch.nn.init.xavier_normal_(h)
        torch.nn.init.xavier_normal_(c)
        
        out = self.embedding(x.long())
        out, (hidden, cell) = self.lstm(out, (h, c))
        out = self.d(out)
        out = F.relu(self.fc1(out[:,-1,:]))
        out = self.d(out)
        out = self.out_act(self.fc2(out))

        return out.squeeze()

In [166]:
%%time
net = TextClassifier(32, len(dictionary)+1, 128, 4, 0.2)
net.to(device)
# train_writer.add_graph(net, torch.tensor(X_test), verbose=True)

learning_rate = 0.01 # Гиперпараметр коэффициент обучения
# train_writer.add_text('LERNING_RATE', str(learning_rate))

# Оптимизаторы:
# optimizer = torch.optim.SGD(net.parameters(), lr = learning_rate, momentum = 0.9, dampening=0, weight_decay=0, nesterov = True)
optimizer = torch.optim.Adam(net.parameters(), lr = learning_rate, betas=(0.9, 0.999), weight_decay=0, eps=1e-10)
# optimizer = torch.optim.AdaGrad(net.parameters(), lr = learning_rate, lr_decay=0, initial_accumulator_value=0, weight_decay=0, eps=1e-06)
# optimizer = torch.optim.AdaDelta(net.parameters(), lr = learning_rate, rho=0.9, eps=1e-06, weight_decay=0)
# optimizer = torch.optim.RMSprop(net.parameters(), lr = learning_rate, alpha=0.99, weight_decay=0, momentum=0, eps=1e-08, centered=False)
# optimizer = torch.optim.Rprop(net.parameters(), lr = learning_rate, etas=(0.5, 1.2), step_sizes=(1e-06, 50))

# Уменьшает learning_rate оптимизатора, если изменение средней оценки на валидационных батчах не более 0.0001
scheduler = ReduceLROnPlateau(optimizer, min_lr = 0.00001, mode = 'min', factor = 0.1, patience = 4, verbose = True)

criterion = nn.BCELoss() # Для задачи бинарной классификации
# criterion = nn.CrossEntropyLoss() # Для задачи многоклассовой классификации
# criterion = nn.MSELoss() # Для задачи регрессии

EPOCHS = 30
step = 0
min_val_score = 0
patience = 0

for epoch in range(EPOCHS):
    epoch_loss = 0
    train_metric_score = 0
    net.train()
    
    for features, label in train_loader:
        optimizer.zero_grad()
        
        features = features.type(torch.LongTensor)
        label = label.type(torch.FloatTensor)
        
        output = net(features.to(device))
        loss = criterion(output, label.to(device).float())
        loss.backward()
        
        
        gini = 2*roc_auc_score(label, output.detach().cpu().numpy()) - 1
        
        # train_writer.add_scalar('CrossEntropyLoss', loss, step)
        # train_writer.add_scalar('Gini', gini, step)
        
        step += 1
        
        # if step % 50 == 0:
        # print('EPOCH %d STEP %d : train_loss: %f train_gini: %f' % (epoch, step, loss, gini))
        
        optimizer.step()
    
    # train_writer.add_histogram('hidden_layer1', net.linear1.weight.data, step) # Логирование распределения весов первого скрытого слоя
    # train_writer.add_histogram('hidden_layer2', net.linear2.weight.data, step) # Логирование распределения весов второго скрытого слоя
    # train_writer.add_histogram('hidden_layer3', net.linear3.weight.data, step) # Логирование распределения весов третьего скрытого слоя
    # train_writer.add_histogram('hidden_layer4', net.linear4.weight.data, step) # Логирование распределения весов четвертого скрытого слоя
    
    net.eval()
    
    train_losses = []
    train_scores = []
    train_metric_score = 0
    with torch.no_grad():
        for features, label in train_loader:
            output = net(features.to(device).float())
            # Calculate error and backpropagate
            loss = criterion(output, label.to(device).float()).detach().cpu().numpy()
            train_losses.append(loss)
            
            train_gini = 2*roc_auc_score(label, output.detach().cpu().numpy()) - 1
            train_scores.append(train_gini)
    
    val_losses = []
    val_scores = []
    val_metric_score = 0
    with torch.no_grad():
        for features, label in valid_loader:
            output = net(features.to(device).float())
            # Calculate error and backpropagate
            loss = criterion(output, label.to(device).float()).detach().cpu().numpy()
            val_losses.append(loss)
            
            valid_gini = 2*roc_auc_score(label, output.detach().cpu().numpy()) - 1
            val_scores.append(valid_gini)
            
    scheduler.step(np.mean(val_losses))
    
    if round(np.mean(val_scores), 4) <= min_val_score:
        print("Early Stopping..")
        break
    
    if min_val_score < round(np.mean(val_scores), 4):
        min_val_score = round(np.mean(val_scores), 4)
        patience = 0
    else:
        patience += 1
        
    if patience > 4:
        print("Early Stopping..")
        break
    
    print('EPOCH %d : train_loss: %f train_gini: %f | valid_loss: %f valid_gini: %f' % (epoch, np.mean(train_losses), np.mean(train_scores), np.mean(val_losses), np.mean(val_scores)))

EPOCH 0 : train_loss: 0.669883 train_gini: 0.236306 | valid_loss: 0.680236 valid_gini: 0.123665
EPOCH 1 : train_loss: 0.644990 train_gini: 0.399235 | valid_loss: 0.677615 valid_gini: 0.179301
EPOCH 2 : train_loss: 0.674712 train_gini: 0.481758 | valid_loss: 0.684514 valid_gini: 0.255199
EPOCH 3 : train_loss: 0.422069 train_gini: 0.800992 | valid_loss: 0.604795 valid_gini: 0.508942
Early Stopping..
CPU times: total: 3min 7s
Wall time: 31.3 s


In [177]:
def get_prediction(model, data_loader):
    Y_true, Y_preds = [], []
    
    for X, Y in data_loader:
        preds = model(X)
        Y_preds.append(preds)
        Y_true.append(Y)
        
    Y_preds, Y_true = torch.cat(Y_preds), torch.cat(Y_true)
    
    return Y_true.detach().numpy(), Y_preds.detach().numpy()


Y_true, Y_preds = get_prediction(net, test_loader)

In [222]:
from sklearn.metrics import accuracy_score, f1_score, log_loss, precision_score, recall_score, classification_report, roc_auc_score

def get_predictions_quality_metrics(Y_true, Y_preds):
    Y_preds_binary = [1 if x > 0.5 else 0 for x in Y_preds]
    
    metrics_binary_tag = ['accuracy_score', 'f1_score', 'log_loss', 'precision_score', 'recall_score']
    metrics_binary_func = [accuracy_score, f1_score, log_loss, precision_score, recall_score]
    metrics_binary_result = {}
    
    for i in range(len(metrics_binary_tag)):
        metrics_binary_result[metrics_binary_tag[i] + "{train}"] = metrics_binary_func[i](Y_preds_binary, Y_true)
        print(metrics_binary_tag[i] + " = " + f"{metrics_binary_func[i](Y_preds_binary, Y_true)}")
        
    print()
    print(classification_report(Y_preds_binary, y_test))
    print()
    
    roc_auc = roc_auc_score(Y_true, Y_preds)
    
    print("roc_auc_score = " + f"{roc_auc}")
    print("gini_score = " + f"{2*roc_auc - 1}")

    return metrics_binary_result, roc_auc

In [223]:
metrics_binary_result, roc_auc_test = get_predictions_quality_metrics(Y_true, Y_preds)

accuracy_score = 0.6882661996497373
f1_score = 0.6596558317399617
log_loss = 11.236025049497115
precision_score = 0.7055214723926381
recall_score = 0.6193895870736086

              precision    recall  f1-score   support

           0       0.68      0.75      0.71       585
           1       0.71      0.62      0.66       557

    accuracy                           0.69      1142
   macro avg       0.69      0.69      0.69      1142
weighted avg       0.69      0.69      0.69      1142


roc_auc_score = 0.7646508015545681
gini_score = 0.5293016031091362
