In [1]:
import nltk
from nltk.corpus import PlaintextCorpusReader, stopwords
import fasttext
# from convokit import Corpus, download
# from xgboost.sklearn import XGBRegressor
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
import re
# from autosklearn.regression import AutoSklearnRegressor
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable

In [2]:
# Compute device for the notebook: CUDA when PyTorch reports it as available, CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# Request the NLTK data packages this notebook relies on, one package per call.
# 'stopwords' backs the stopwords.words('english') lookup used when cleaning tokens.
nltk.download('stopwords')
# presumably required by nltk.word_tokenize — TODO confirm
nltk.download('punkt')
# presumably required by nltk.WordNetLemmatizer — TODO confirm
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\proto\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\proto\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\proto\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# corpus_oanc = PlaintextCorpusReader('./OANC-GrAF', '.*\.txt')

In [5]:
# full_m1 = fasttext.train_unsupervised('corpus_general_plain.txt', model='cbow')
# full_m1.save_model('full_m1.bin')

In [6]:
# full_m2 = fasttext.train_unsupervised('corpus_general_plain.txt', model='skipgram')
# full_m2.save_model('full_m2.bin')

In [7]:
# sample_m1 = fasttext.train_unsupervised('corpus_specialized_plain.txt', model='cbow')
# sample_m1.save_model('sample_m1.bin')

In [8]:
# sample_m2 = fasttext.train_unsupervised('corpus_specialized_plain.txt', model='skipgram')
# sample_m2.save_model('sample_m2.bin')

In [9]:
# Load the annotated corpus and keep a reproducible random subset of it.
SAMPLE_SIZE = 50000  # upper bound on the number of rows kept for the experiments
SAMPLE_SEED = 42     # fixed seed so that a kernel restart draws the same rows again
data = pd.read_csv('corpus_specialized_annotated.csv')
# Cap the request at the number of rows actually present: sampling without
# replacement raises when asked for more rows than the frame contains.
data = data.sample(min(SAMPLE_SIZE, len(data)), random_state=SAMPLE_SEED)
# Force every entry of the text column to str so the tokenizer always receives a string.
# NOTE(review): missing values would presumably turn into the literal string 'nan' here —
# confirm that this is intended.
data['text'] = data['text'].astype(str)

In [10]:
# Tokenize every document with NLTK's word tokenizer; the result is stored per row in 'tokens'.
data['tokens'] = data['text'].map(nltk.word_tokenize)

In [11]:
# Matches tokens that contain at least one word character (used to drop pure punctuation).
# Written as a raw string so that `\w` reaches the regex engine without relying on
# Python's handling of an invalid string escape (which emits a warning).
r = re.compile(r'.*\w.*')
# Shared lemmatizer instance used by clean_tokens.
lemmatizer = nltk.WordNetLemmatizer()

# Lazily-built cache of the English stop-word list. Fetching the list from the
# corpus reader for every single token is needlessly slow, and a frozenset
# makes each membership test O(1) instead of a linear scan.
_ENGLISH_STOPWORDS = None


def clean_tokens(tokens):
    """Filter and normalise one document's tokens.

    A token is kept when it matches the module-level regex ``r`` and its
    lower-cased form is not an English stop word. Kept tokens are lower-cased
    and lemmatized with the module-level ``lemmatizer``.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A new list with the cleaned tokens, in their original order
        (empty when nothing survives the filter).
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Built on first use so that defining the function does not require
        # the corpus to be available yet.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

    cleaned = []
    for token in tokens:
        lowered = token.lower()
        if r.match(token) and lowered not in _ENGLISH_STOPWORDS:
            cleaned.append(lemmatizer.lemmatize(lowered))
    return cleaned

# Replace the raw token lists with their cleaned counterparts.
data['tokens'] = data['tokens'].apply(clean_tokens)
# Index labels of the documents whose token list ended up empty after cleaning...
empty_rows = data.index[data['tokens'].str.len() == 0]
# ...which are removed from the frame.
data = data.drop(index=empty_rows)

In [12]:
def equalize_length(vectors, dst_len=50, dim=100):
    """Pad or truncate a sequence of vectors to exactly ``dst_len`` entries.

    Args:
        vectors: sequence of vectors (one per token).
        dst_len: number of entries the result must have.
        dim: length of each zero vector appended as padding.

    Returns:
        A list of ``dst_len`` vectors: the first ``dst_len`` inputs when the
        input is long enough, otherwise the inputs followed by zero vectors.
    """
    missing = dst_len - len(vectors)
    if missing > 0:
        # Build every padding row separately: `[[0] * dim] * missing` would
        # repeat one and the same list object, so writing to one padding row
        # would silently change all of them.
        padding = [[0] * dim for _ in range(missing)]
        return list(vectors) + padding
    # Long enough already: keep the first dst_len entries.
    return vectors[:dst_len]

# def get_vectorizer(model):
#     def get_text_repr(tokens):
#         if tokens:
#             vectors = [model.get_word_vector(word) for word in tokens[:50]]
#         else:
#             vectors = []
#         return np.concatenate(equalize_length(vectors))
#     return get_text_repr

def get_vectorizer(model):
    """Create a function that embeds a token sequence with ``model``.

    Args:
        model: object exposing ``get_word_vector(word)``.

    Returns:
        A callable taking an iterable of tokens and returning a tensor with
        one row per token, each row being that token's word vector.
    """
    def get_text_repr(tokens):
        # Look up one vector per token, preserving token order.
        rows = []
        for word in tokens:
            rows.append(model.get_word_vector(word))
        # Stack the per-token vectors into one 2-D array, then copy it into a tensor.
        stacked = np.stack(rows)
        return torch.tensor(stacked)

    return get_text_repr

# def prepare_data(data, model):
#     vectorizer = get_vectorizer(model)
#     X = np.stack(data['text'].apply(vectorizer))
#     y = np.array(data['meta.score'])
#     mask = np.random.rand(len(X)) < 0.8
#     X_train = X[mask]
#     X_test = X[~mask]
#     y_train = y[mask]
#     y_test = y[~mask]
#     return X_train, y_train, X_test, y_test

def prepare_data(data, model, tokens_col='tokens', train_frac=0.8, seed=None):
    """Vectorize the documents and split them randomly into train and test parts.

    Args:
        data: DataFrame holding a column of token lists (``tokens_col``) and
            the target column ``'meta.score'``.
        model: embedding model handed to ``get_vectorizer``.
        tokens_col: column whose entries are embedded. Defaults to the cleaned
            ``'tokens'`` column; embedding the raw ``'text'`` column would make
            the vectorizer iterate over the characters of each string instead
            of over its words.
        train_frac: probability with which each row is assigned to the
            training part.
        seed: optional seed for a reproducible split. With ``None`` the global
            NumPy random state is used.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` as plain Python lists; the X
        lists contain one tensor per document.
    """
    vectorizer = get_vectorizer(model)
    X = data[tokens_col].apply(vectorizer)
    y = data['meta.score']
    # A dedicated RandomState keeps a seeded split independent of other code
    # that draws from the global generator.
    rng = np.random if seed is None else np.random.RandomState(seed)
    # Each row independently lands in the training part with probability train_frac.
    mask = rng.rand(len(X)) < train_frac
    X_train = list(X[mask])
    X_test = list(X[~mask])
    y_train = list(y[mask])
    y_test = list(y[~mask])
    return X_train, y_train, X_test, y_test

In [13]:
# for model_path in ['full_m1.bin', 'full_m2.bin', 'sample_m1.bin', 'sample_m2.bin']:
#     vectorizer = fasttext.FastText.load_model(model_path)
#     X_train, y_train, X_test, y_test = prepare_data(data, vectorizer)
#     regressor = AutoSklearnRegressor(memory_limit=10240)
#     regressor.fit(X_train, y_train, X_test, y_test)
#     y_pred = regressor.predict(X_test)
#     print(model_path)
#     print('\tmse:\t', mean_squared_error(y_test, y_pred))
#     print('\tr2:\t', r2_score(y_test, y_pred))

In [14]:
# class score_predictor(nn.Module):

#     def __init__(self) -> None:
#         super().__init__()
#         self.net = nn.Sequential(
#             nn.Conv1d(100, 100, kernel_size=5),
#             nn.ReLU(),
#             nn.Conv1d(100, 100, kernel_size=5),
#             nn.ReLU(),
#             nn.Conv1d(100, 100, kernel_size=5),
#             nn.Flatten(),
#             nn.Linear(8800, 4400),
#             nn.ReLU(),
#             nn.Linear(4400, 2200),
#             nn.ReLU(),
#             nn.Linear(2200, 1100),
#             nn.ReLU(),
#             nn.Linear(1100, 1)
#         )
    
#     def forward(self, x):
#         return self.net(x)

In [28]:
class MyDataset(Dataset):
    """Map-style dataset that pairs every sample with its target.

    ``X`` and ``y`` are walked in lockstep once, at construction time, and the
    resulting ``(sample, target)`` tuples are kept in ``self.data``.
    """

    def __init__(self, X, y) -> None:
        super().__init__()
        # Materialise the pairs up front so that indexing is a plain list lookup.
        self.data = [(sample, target) for sample, target in zip(X, y)]

    def __len__(self):
        """Return the number of stored (sample, target) pairs."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the (sample, target) tuple stored at ``index``."""
        return self.data[index]

class MyNet(nn.Module):
    """Multi-layer RNN followed by a linear layer that outputs one value per sequence.

    Args:
        input_size: size of each input vector (per time step).
        hidden_size: size of the RNN hidden state.
        num_layers: number of stacked RNN layers.
    """

    def __init__(self, input_size: int = 100, hidden_size: int = 100, num_layers: int = 3) -> None:
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.reg = nn.Linear(hidden_size, 1)

    def forward(self, x, x_lengths):
        """Predict one value per sequence from its last valid RNN output.

        Args:
            x: padded batch, batch dimension first.
            x_lengths: number of valid time steps of every sequence in ``x``,
                in batch order.

        Returns:
            Tensor with one row per sequence and a single column.
        """
        # Pack so that the RNN does not run over the padded positions.
        packed = nn.utils.rnn.pack_padded_sequence(x, x_lengths, batch_first=True, enforce_sorted=False)
        output, _ = self.rnn(packed)
        unpacked, unpacked_len = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
        # Select, for every sequence, the output at its last valid time step.
        # The index tensors are created on the device of the RNN output, so the
        # module does not depend on any notebook-level `device` variable.
        batch_idx = torch.arange(unpacked.size(0), device=unpacked.device)
        last_idx = (unpacked_len - 1).to(unpacked.device)
        last_encoded_states = unpacked[batch_idx, last_idx]
        return self.reg(last_encoded_states)


def my_collate(batch):
    """Collate (sequence, target) pairs into one padded batch.

    Args:
        batch: iterable of ``(sequence_tensor, target)`` pairs.

    Returns:
        A triple of: the sequences padded to a common length (batch dimension
        first), the targets as a float tensor with one column, and the list of
        original sequence lengths in batch order.
    """
    sequences, targets = zip(*batch)
    padded = nn.utils.rnn.pad_sequence(sequences, batch_first=True)
    # Lengths before padding; needed later to pack the batch for the RNN.
    lengths = [seq.size(0) for seq in sequences]
    target_column = torch.tensor(targets, dtype=torch.float)[:, None]
    return padded, target_column, lengths

def to_dl(X, y, batch_size=128, shuffle=True):
    """Wrap samples and targets in a DataLoader that pads each batch.

    Args:
        X: sequence of per-document tensors.
        y: sequence of targets aligned with ``X``.
        batch_size: number of documents per batch.
        shuffle: whether the loader reshuffles the data on every pass.

    Returns:
        A DataLoader over ``MyDataset(X, y)`` that collates with ``my_collate``.
    """
    ds = MyDataset(X, y)
    return DataLoader(
        ds,
        batch_size,
        shuffle=shuffle,
        collate_fn=my_collate,
        # Pinned host memory only helps host-to-GPU copies; requesting it on a
        # machine without CUDA just triggers a warning.
        pin_memory=torch.cuda.is_available(),
    )

In [22]:
def validate(model, dl):
    """Evaluate ``model`` on every batch of ``dl``.

    The function is self-contained: it disables gradient tracking and switches
    the model to eval mode itself, then restores the previous mode.

    Args:
        model: module called as ``model(X_batch, lengths)``.
        dl: iterable yielding ``(X_batch, y_batch, lengths)`` triples.

    Returns:
        ``(mse, r2)`` computed over all batches with
        ``mean_squared_error`` and ``r2_score``.
    """
    # Run on whatever device the model lives on instead of a notebook global.
    model_device = next((p.device for p in model.parameters()), torch.device('cpu'))
    was_training = model.training
    model.eval()
    y_pred = []
    y_true = []
    try:
        # Without no_grad the predictions would carry autograd history, which
        # wastes memory and prevents their conversion to NumPy.
        with torch.no_grad():
            for X_batch, y_batch, lengths in dl:
                X_batch = X_batch.to(model_device)
                y_true.append(y_batch.cpu())
                y_pred.append(model(X_batch, lengths).cpu())
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)
    y_pred = torch.cat(y_pred).numpy()
    y_true = torch.cat(y_true).numpy()
    return mean_squared_error(y_true, y_pred), r2_score(y_true, y_pred)

def fit(model, loss_fn, optimizer, train_dl, val_dl, epochs=50, show_metrics=True):
    """Train ``model`` for a number of epochs, optionally printing metrics.

    Args:
        model: module called as ``model(X_batch, lengths)``.
        loss_fn: callable ``loss_fn(prediction, target)`` returning a scalar loss.
        optimizer: optimizer over the model's parameters.
        train_dl: iterable yielding ``(X_batch, y_batch, lengths)`` training triples.
        val_dl: iterable of the same form, used only for metric reporting.
        epochs: number of passes over ``train_dl``.
        show_metrics: when true, MSE and R2 on both loaders are printed after
            every epoch (computed with ``validate``).

    Returns:
        None; the model is updated in place.
    """
    # Move batches to the device the model lives on instead of relying on a
    # notebook-level global.
    model_device = next((p.device for p in model.parameters()), torch.device('cpu'))
    for epoch in range(epochs):
        # Metric evaluation may leave the model in eval mode; re-enter training mode.
        model.train()
        for X_batch, y_batch, lengths in train_dl:
            X_batch = X_batch.to(model_device)
            y_batch = y_batch.to(model_device)
            # Clear gradients BEFORE the backward pass so that gradients left
            # over from any earlier use of the model cannot leak into the
            # first update.
            optimizer.zero_grad()
            y_pred = model(X_batch, lengths)
            loss = loss_fn(y_pred, y_batch)
            loss.backward()
            optimizer.step()
        if show_metrics:
            with torch.no_grad():
                train_mse, train_r2 = validate(model, train_dl)
                val_mse, val_r2 = validate(model, val_dl)
                print(f'Epoch: {epoch}\ttrain: MSE = {train_mse} R2 = {train_r2}\tval: MSE = {val_mse} R2 = {val_r2}')


In [29]:
# Train and evaluate one freshly initialised MyNet per saved fastText embedding file.
for model_path in ['full_m1.bin', 'full_m2.bin', 'sample_m1.bin', 'sample_m2.bin']:
    # NOTE: despite its name, `vectorizer` holds the loaded fastText model itself;
    # prepare_data wraps it into the actual vectorizing function.
    vectorizer = fasttext.FastText.load_model(model_path)
    # prepare_data draws a new random train/test split on every call, so each
    # embedding model is evaluated on a different split.
    X_train, y_train, X_test, y_test = prepare_data(data, vectorizer)
    train_dl = to_dl(X_train, y_train)
    test_dl = to_dl(X_test, y_test)
    # New network (and optimizer) per embedding so that no weights carry over.
    model = MyNet().to(device)
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters())
    print('\n', model_path, '\n')
    # The test split doubles as the validation set reported after each epoch.
    fit(model, loss_fn, optimizer, train_dl, test_dl)




 full_m1.bin 



  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


AttributeError: __enter__