# Лабораторная работа №2

## Импорты

In [None]:
import re

import gensim.downloader
import nltk
import numpy as np
import pandas as pd
import torch
from gensim.models import Word2Vec
from gensim.test.utils import common_texts
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

# Fetch the NLTK resources behind the helpers imported above: the Punkt
# tokenizer models (word_tokenize), WordNet (WordNetLemmatizer) and the
# stop-word lists (stopwords).
# NOTE(review): recent NLTK releases may look up 'punkt_tab' instead of
# 'punkt' for word_tokenize -- confirm against the installed version.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Path to the emotion-labels CSV that is read into the DataFrame below.
DATASET_PATH = "/content/drive/MyDrive/Университет/NLP/lab2-rnn_many2one/emotion-labels.csv"

## Загрузка и просмотр датасета

In [None]:
# Load the dataset and show its first 7 rows together with its shape.
df = pd.read_csv(DATASET_PATH)
display(df.head(7), df.shape)

Unnamed: 0,text,label
0,Just got back from seeing @GaryDelaney in Burs...,joy
1,Oh dear an evening of absolute hilarity I don'...,joy
2,Been waiting all week for this game ❤️❤️❤️ #ch...,joy
3,"@gardiner_love : Thank you so much, Gloria! Yo...",joy
4,I feel so blessed to work with the family that...,joy
5,"Today I reached 1000 subscribers on YT!! , #go...",joy
6,"@Singaholic121 Good morning, love! Happy first...",joy


(3613, 2)

In [None]:
# Share of each label in the dataset (normalize=True yields fractions, not counts).
df['label'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
label,Unnamed: 1_level_1
fear,0.317465
anger,0.237199
joy,0.227789
sadness,0.217548


## Предобработка текстов датасета

In [None]:
# Keep only the first occurrence of every text and renumber the rows from 0.
df = df.drop_duplicates(subset='text').reset_index(drop=True)
df.head(7)

Unnamed: 0,text,label
0,Just got back from seeing @GaryDelaney in Burs...,joy
1,Oh dear an evening of absolute hilarity I don'...,joy
2,Been waiting all week for this game ❤️❤️❤️ #ch...,joy
3,"@gardiner_love : Thank you so much, Gloria! Yo...",joy
4,I feel so blessed to work with the family that...,joy
5,"Today I reached 1000 subscribers on YT!! , #go...",joy
6,"@Singaholic121 Good morning, love! Happy first...",joy


In [None]:
def preprocess(doc: str) -> str:
    """Normalize a raw text before tokenization.

    Steps, in order: lowercase; turn escaped line breaks (a backslash
    followed by the letter n) into spaces; drop ``@mentions`` that follow
    whitespace or start the text; drop digits; drop every character that
    is neither a word character nor whitespace.

    Args:
        doc: Raw text.

    Returns:
        The cleaned text. Whitespace is neither collapsed nor stripped, so
        the result may contain leading or repeated spaces.
    """
    doc = doc.lower()

    # An escaped "\n" must become a separator rather than vanish: deleting it
    # glues the neighbouring words together ("good\nmorning" -> "goodmorning")
    # and hides a directly following @mention from the pattern below.
    doc = doc.replace(r'\n', ' ')

    # The prepended space lets a mention at the very start of the text match.
    doc = re.sub(r'\s@\w+', '', ' ' + doc)

    doc = re.sub(r'\d+', '', doc)

    # Strip punctuation, emoji and other symbols.
    doc = re.sub(r'[^\w\s]', '', doc)

    return doc

In [None]:
# Clean every text with preprocess() and preview the result.
df['text'] = df['text'].apply(preprocess)
df.head(7)

Unnamed: 0,text,label
0,just got back from seeing in burslem amazing ...,joy
1,oh dear an evening of absolute hilarity i don...,joy
2,been waiting all week for this game cheer fr...,joy
3,thank you so much gloria youre so sweet and ...,joy
4,i feel so blessed to work with the family tha...,joy
5,today i reached subscribers on yt goodday t...,joy
6,good morning love happy first day of fall let...,joy


## Токенизация

In [None]:
def tokenize(doc: str) -> list[str]:
    """Split a text into tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(doc)
    return tokens

In [None]:
# Replace each cleaned string with its list of tokens and preview the result.
df['text'] = df['text'].apply(tokenize)
df.head(7)

Unnamed: 0,text,label
0,"[just, got, back, from, seeing, in, burslem, a...",joy
1,"[oh, dear, an, evening, of, absolute, hilarity...",joy
2,"[been, waiting, all, week, for, this, game, ch...",joy
3,"[thank, you, so, much, gloria, youre, so, swee...",joy
4,"[i, feel, so, blessed, to, work, with, the, fa...",joy
5,"[today, i, reached, subscribers, on, yt, goodd...",joy
6,"[good, morning, love, happy, first, day, of, f...",joy


## Лемматизация

In [None]:
# One lemmatizer instance, created once and shared by every call below.
lemmatizer = WordNetLemmatizer()

def lemmatize(doc: list[str]) -> list[str]:
    """Return the tokens of ``doc``, each passed through the WordNet lemmatizer.

    The lemmatizer is called with the token only, so its default
    part-of-speech setting applies to every word.
    """
    return list(map(lemmatizer.lemmatize, doc))

In [None]:
# Lemmatize every token list and preview the result.
df['text'] = df['text'].apply(lemmatize)
df.head(7)

Unnamed: 0,text,label
0,"[just, got, back, from, seeing, in, burslem, a...",joy
1,"[oh, dear, an, evening, of, absolute, hilarity...",joy
2,"[been, waiting, all, week, for, this, game, ch...",joy
3,"[thank, you, so, much, gloria, youre, so, swee...",joy
4,"[i, feel, so, blessed, to, work, with, the, fa...",joy
5,"[today, i, reached, subscriber, on, yt, goodda...",joy
6,"[good, morning, love, happy, first, day, of, f...",joy


## Удаление стоп-слов

In [None]:
stop_words = stopwords.words('english')
# Membership is tested once per token, so use a set (O(1) lookup) instead of
# scanning the list each time. The list itself is kept under its original name.
_stop_word_set = frozenset(stop_words)

def stop_words_drop(doc: list[str]) -> list[str]:
    """Return the tokens of ``doc`` that are not English stop words.

    Args:
        doc: Tokens of one document.

    Returns:
        A new list with the stop words removed; token order is preserved.
    """
    # NOTE(review): tokens reach this step with punctuation already stripped
    # and after lemmatization, so stop-list entries containing an apostrophe
    # (e.g. "don't") presumably never match -- confirm whether that is intended.
    return [word for word in doc if word not in _stop_word_set]

In [None]:
# Remove stop words from every token list and preview the result.
df['text'] = df['text'].apply(stop_words_drop)
df.head(7)

Unnamed: 0,text,label
0,"[got, back, seeing, burslem, amazing, face, st...",joy
1,"[oh, dear, evening, absolute, hilarity, dont, ...",joy
2,"[waiting, week, game, cheer, friday]",joy
3,"[thank, much, gloria, youre, sweet, thoughtful...",joy
4,"[feel, blessed, work, family, nanny, nothing, ...",joy
5,"[today, reached, subscriber, yt, goodday, than...",joy
6,"[good, morning, love, happy, first, day, fall,...",joy


## Векторизация корпуса

Перемешивание датасета

In [None]:
# Shuffle the rows, then renumber them 0..n-1. Without the reset the shuffled
# frame keeps its old index labels, so a label-based lookup such as ``x[i]``
# returns rows in their pre-shuffle order and falls out of step with labels
# that are read positionally.
df = df.sample(frac=1).reset_index(drop=True)

Разделяем данные на признаки и целевую переменную

In [None]:
# x: the token lists (features); y: the emotion labels (target).
x, y = df['text'], df['label']

In [None]:
# Pretrained fastText word vectors loaded through gensim's downloader
# (300-dimensional, going by the model name and the feature width used below).
model = gensim.downloader.load('fasttext-wiki-news-subwords-300')



In [None]:
# Train a Word2Vec model on the corpus itself; it serves as the fallback for
# words missing from the pretrained vectors. min_count=1 keeps every token in
# the vocabulary, and vector_size=300 matches the pretrained vector width.
trained_model = Word2Vec(sentences=x.to_list(), vector_size=300, window=5, min_count=1, workers=4)

In [None]:
# Document embedding = mean of its word vectors. The pretrained vectors are
# tried first; a word they do not contain falls back to the corpus-trained
# Word2Vec model.
features = np.zeros((x.shape[0], 300))
# Iterate positionally. ``x[i]`` is a label-based lookup, which on a shuffled
# Series walks the rows in their pre-shuffle order and so pairs each feature
# row with the wrong label in ``y``.
for i, doc in enumerate(x):
    if not doc:
        # No tokens survived preprocessing: keep the zero vector instead of
        # dividing by zero and filling the row with NaN.
        continue
    for word in doc:
        try:
            features[i] += model[word]
        except KeyError:
            features[i] += trained_model.wv[word]
    features[i] /= len(doc)


In [None]:
# Wrap the NumPy feature matrix as a tensor (from_numpy keeps the array's dtype).
# NOTE(review): np.zeros defaults to float64, so this is a double tensor --
# confirm whether the downstream model expects float32.
x_torch = torch.from_numpy(features)
x_torch.shape

torch.Size([3565, 300])

In [None]:
# One-hot encode the labels as float32, one column per distinct label.
# NOTE(review): the column-to-label mapping (onehot_y.columns) is not stored
# with the tensors saved below -- confirm it is recorded elsewhere.
onehot_y = pd.get_dummies(y, dtype=np.float32)
y_torch = torch.from_numpy(onehot_y.values)
display(y_torch[:5], y_torch.shape)

tensor([[0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 0., 1.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.]])

torch.Size([3565, 4])

## Разделение на тренировочную и тестовую выборки

In [None]:
# Hold out 15% of the samples for testing. No random_state is passed, so the
# split differs from run to run.
xtrain, xtest, ytrain, ytest = train_test_split(x_torch, y_torch, test_size=0.15)
xtrain.shape, xtest.shape, ytrain.shape, ytest.shape

(torch.Size([3030, 300]),
 torch.Size([535, 300]),
 torch.Size([3030, 4]),
 torch.Size([535, 4]))

## Сохранение тензоров

In [None]:
# Destination file for the train/test tensors written by torch.save below.
SAVE_PATH = '/content/drive/MyDrive/Университет/NLP/lab2-rnn_many2one/tensors1.pt'

In [None]:
# Bundle the four splits under their names and write them to a single file.
tensors = dict(xtrain=xtrain, ytrain=ytrain, xtest=xtest, ytest=ytest)
torch.save(tensors, SAVE_PATH)