In [1]:
import nltk
from nltk.corpus import PlaintextCorpusReader, stopwords
import fasttext
from convokit import Corpus, download
# from xgboost.sklearn import XGBRegressor
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import re
# from autosklearn.regression import AutoSklearnRegressor
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
from tqdm import tqdm

In [2]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\proto\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\proto\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\proto\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# corpus_oanc = PlaintextCorpusReader('./OANC-GrAF', '.*\.txt')

In [5]:
# full_m1 = fasttext.train_unsupervised('corpus_general_plain.txt', model='cbow')
# full_m1.save_model('full_m1.bin')

In [6]:
# full_m2 = fasttext.train_unsupervised('corpus_general_plain.txt', model='skipgram')
# full_m2.save_model('full_m2.bin')

In [7]:
# sample_m1 = fasttext.train_unsupervised('corpus_specialized_plain.txt', model='cbow')
# sample_m1.save_model('sample_m1.bin')

In [8]:
# sample_m2 = fasttext.train_unsupervised('corpus_specialized_plain.txt', model='skipgram')
# sample_m2.save_model('sample_m2.bin')

In [9]:
target = 'target'
threshold = 10
# data = Corpus(download('subreddit-communism')).get_utterances_dataframe()[['text', 'meta.score', 'meta.top_level_comment']]
# print(data['meta.top_level_comment'])
# data = data[~data['meta.top_level_comment'].astype(bool)]
# data['target'] = (data['meta.score'] > threshold).astype(int)
# data.drop(columns=['meta.score', 'meta.top_level_comment'])
# data.to_csv('target_test.csv')
data = pd.read_csv('target_test.csv')
data['text'] = data['text'].astype(str)

In [10]:
data['tokens'] = data['text'].apply(nltk.word_tokenize)

In [11]:
r = re.compile('.*\w.*')
lemmatizer = nltk.WordNetLemmatizer()

def clean_tokens(tokens):
    tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if r.match(token) and token.lower() not in stopwords.words('english')]
    return tokens

data['tokens'] = data['tokens'].apply(clean_tokens)
data = data.drop(data[data['tokens'].str.len() == 0].index)

In [12]:
def equalize_length(vectors, dst_len=50):
    if len(vectors) < dst_len:
        return vectors + [[0] * 100] * (dst_len - len(vectors))
    elif len(vectors) > dst_len:
        return vectors[:dst_len]
    else:
        return vectors

# def get_vectorizer(model):
#     def get_text_repr(tokens):
#         if tokens:
#             vectors = [model.get_word_vector(word) for word in tokens[:50]]
#         else:
#             vectors = []
#         return np.concatenate(equalize_length(vectors))
#     return get_text_repr

def get_vectorizer(model):
    def get_text_repr(tokens):
        return torch.tensor(np.stack([model.get_word_vector(word) for word in tokens]))
    return get_text_repr

# def prepare_data(data, model):
#     vectorizer = get_vectorizer(model)
#     X = np.stack(data['text'].apply(vectorizer))
#     y = np.array(data[target])
#     mask = np.random.rand(len(X)) < 0.8
#     X_train = X[mask]
#     X_test = X[~mask]
#     y_train = y[mask]
#     y_test = y[~mask]
#     return X_train, y_train, X_test, y_test

def prepare_data(data, model):
    vectorizer = get_vectorizer(model)
    X = data['text'].apply(vectorizer)
    y = data[target]
    mask = np.random.rand(len(X)) < 0.8
    X_train = list(X[mask])
    X_test = list(X[~mask])
    y_train = list(y[mask])
    y_test = list(y[~mask])
    return X_train, y_train, X_test, y_test

In [13]:
# for model_path in ['full_m1.bin', 'full_m2.bin', 'sample_m1.bin', 'sample_m2.bin']:
#     vectorizer = fasttext.FastText.load_model(model_path)
#     X_train, y_train, X_test, y_test = prepare_data(data, vectorizer)
#     regressor = AutoSklearnRegressor(memory_limit=10240)
#     regressor.fit(X_train, y_train, X_test, y_test)
#     y_pred = regressor.predict(X_test)
#     print(model_path)
#     print('\tmse:\t', mean_squared_error(y_test, y_pred))
#     print('\tr2:\t', r2_score(y_test, y_pred))

In [14]:
# class score_predictor(nn.Module):

#     def __init__(self) -> None:
#         super().__init__()
#         self.net = nn.Sequential(
#             nn.Conv1d(100, 100, kernel_size=5),
#             nn.ReLU(),
#             nn.Conv1d(100, 100, kernel_size=5),
#             nn.ReLU(),
#             nn.Conv1d(100, 100, kernel_size=5),
#             nn.Flatten(),
#             nn.Linear(8800, 4400),
#             nn.ReLU(),
#             nn.Linear(4400, 2200),
#             nn.ReLU(),
#             nn.Linear(2200, 1100),
#             nn.ReLU(),
#             nn.Linear(1100, 1)
#         )
    
#     def forward(self, x):
#         return self.net(x)

In [15]:
class MyDataset(Dataset):

    def __init__(self, X, y) -> None:
        super().__init__()
        self.data = list(zip(X, y))

    def __getitem__(self, index):
        return self.data[index]
    
    def __len__(self):
        return len(self.data)

class MyNet(nn.Module):

    def __init__(self) -> None:
        super().__init__()
        self.rnn = nn.RNN(100, 100, 1, batch_first=True)
        self.reg = nn.Linear(100, 2)
    
    def forward(self, x, x_lengths):
        packed = nn.utils.rnn.pack_padded_sequence(x, x_lengths, batch_first=True, enforce_sorted=False)
        output, _ = self.rnn(packed)
        unpacked, unpacked_len = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
        indices = Variable(torch.LongTensor(np.array(unpacked_len) - 1).view(-1, 1)
                                                                       .expand(unpacked.size(0), unpacked.size(2))
                                                                       .unsqueeze(1)
                                                                       .to(device))
        last_encoded_states = unpacked.gather(dim=1, index=indices).squeeze(dim=1)
        return self.reg(last_encoded_states)


def my_collate(batch):
    X_batch, y_batch = zip(*batch)
    lengths = [sample.shape[0] for sample in X_batch]
    X_batch = nn.utils.rnn.pad_sequence(X_batch, batch_first=True)
    return X_batch, torch.tensor(y_batch), lengths

def to_dl(X, y):
    ds = MyDataset(X, y)
    dl = DataLoader(ds, 128, shuffle=True, collate_fn=my_collate, pin_memory=True)
    return dl

In [16]:
def validate(model, dl, loss_fn):
    y_pred = []
    y_true = []
    for X_batch, y_batch, lengths in dl:
        X_batch = X_batch.to(device)
        y_true.append(y_batch)
        y_pred.append(model(X_batch, lengths).cpu())
    y_pred = torch.concat(y_pred)
    y_true = torch.concat(y_true)
    return f1_score(y_true, torch.argmax(y_pred, axis=1)), loss_fn(y_pred, y_true)

def fit(model, loss_fn, optimizer, train_dl, val_dl, epochs=50, show_metrics=True):
    for epoch in range(epochs):
        for X_batch, y_batch, lengths in tqdm(train_dl):
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            y_pred = model(X_batch, lengths)
            loss = loss_fn(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        if show_metrics:
            with torch.no_grad():
                train_mse, train_r2 = validate(model, train_dl, loss_fn)
                val_mse, val_r2 = validate(model, val_dl, loss_fn)
                print(f'Epoch: {epoch}\ttrain: f1 = {train_mse} loss = {train_r2}\tval: f1 = {val_mse} loss = {val_r2}')


In [17]:
data['target'].value_counts()

0    20949
1    13328
Name: target, dtype: int64

In [18]:
for model_path in ['full_m1.bin', 'full_m2.bin', 'sample_m1.bin', 'sample_m2.bin']:
    vectorizer = fasttext.FastText.load_model(model_path)
    X_train, y_train, X_test, y_test = prepare_data(data, vectorizer)
    train_dl = to_dl(X_train, y_train)
    test_dl = to_dl(X_test, y_test)
    model = MyNet().to(device)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    print('\n', model_path, '\n')
    fit(model, loss_fn, optimizer, train_dl, test_dl)

  0%|          | 0/215 [00:00<?, ?it/s]


 full_m1.bin 



100%|██████████| 215/215 [00:57<00:00,  3.77it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 0	train: acc = 0.2544028520499109 loss = 0.6063907146453857	val: acc = 0.2438884095484613 loss = 0.6059524416923523


100%|██████████| 215/215 [00:54<00:00,  3.93it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 1	train: acc = 0.14112639161755075 loss = 0.6078600883483887	val: acc = 0.13060686015831136 loss = 0.607206404209137


100%|██████████| 215/215 [00:55<00:00,  3.90it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 2	train: acc = 0.10588834210079755 loss = 0.6074954271316528	val: acc = 0.10944935418082938 loss = 0.6060925722122192


100%|██████████| 215/215 [00:55<00:00,  3.89it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 3	train: acc = 0.17404410598127876 loss = 0.6058607697486877	val: acc = 0.15067611075338055 loss = 0.6063243746757507


100%|██████████| 215/215 [00:56<00:00,  3.83it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 4	train: acc = 0.22072577628133183 loss = 0.6099084615707397	val: acc = 0.19672131147540983 loss = 0.6113730669021606


100%|██████████| 215/215 [00:54<00:00,  3.92it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 5	train: acc = 0.5874631663541388 loss = 0.6080914735794067	val: acc = 0.5867490928638625 loss = 0.6091853976249695


100%|██████████| 215/215 [00:53<00:00,  4.05it/s]
  0%|          | 1/215 [00:00<00:28,  7.41it/s]

Epoch: 6	train: acc = 0.25736345438947666 loss = 0.6041103005409241	val: acc = 0.23230280265819128 loss = 0.607782781124115


100%|██████████| 215/215 [00:55<00:00,  3.88it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 7	train: acc = 0.2576349688811695 loss = 0.6021643877029419	val: acc = 0.2283211678832117 loss = 0.6058568954467773


100%|██████████| 215/215 [00:53<00:00,  4.06it/s]
  0%|          | 1/215 [00:00<00:23,  9.17it/s]

Epoch: 8	train: acc = 0.2880505737648595 loss = 0.6032177209854126	val: acc = 0.2731034482758621 loss = 0.6080216765403748


100%|██████████| 215/215 [00:53<00:00,  3.99it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 9	train: acc = 0.1263550135501355 loss = 0.6014016270637512	val: acc = 0.10910330719399931 loss = 0.6058684587478638


100%|██████████| 215/215 [00:51<00:00,  4.16it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 10	train: acc = 0.22593379896750682 loss = 0.6019120812416077	val: acc = 0.19182293267752845 loss = 0.6086992621421814


100%|██████████| 215/215 [00:51<00:00,  4.16it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 11	train: acc = 0.15848561829058427 loss = 0.5999478697776794	val: acc = 0.128500823723229 loss = 0.6082834005355835


100%|██████████| 215/215 [00:52<00:00,  4.07it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 12	train: acc = 0.2546761534511846 loss = 0.6011680364608765	val: acc = 0.22051433638782145 loss = 0.6097519993782043


100%|██████████| 215/215 [00:52<00:00,  4.06it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 13	train: acc = 0.2883221850613155 loss = 0.5983582735061646	val: acc = 0.26065022421524664 loss = 0.6091066002845764


100%|██████████| 215/215 [00:51<00:00,  4.16it/s]
  1%|          | 2/215 [00:00<00:16, 12.87it/s]

Epoch: 14	train: acc = 0.15001245950660352 loss = 0.5981571674346924	val: acc = 0.12212212212212212 loss = 0.6093462705612183


100%|██████████| 215/215 [00:51<00:00,  4.16it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 15	train: acc = 0.24540702016430171 loss = 0.5985466837882996	val: acc = 0.20078384082001807 loss = 0.6140167117118835


100%|██████████| 215/215 [00:52<00:00,  4.10it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 16	train: acc = 0.11911955114372033 loss = 0.5960961580276489	val: acc = 0.08510638297872342 loss = 0.6110628247261047


100%|██████████| 215/215 [00:53<00:00,  4.01it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 17	train: acc = 0.26473811089195703 loss = 0.5945227146148682	val: acc = 0.21715976331360945 loss = 0.6113741397857666


100%|██████████| 215/215 [00:53<00:00,  4.02it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 18	train: acc = 0.2618679620225215 loss = 0.5945419669151306	val: acc = 0.2060931899641577 loss = 0.6139510869979858


100%|██████████| 215/215 [00:51<00:00,  4.14it/s]
  1%|          | 2/215 [00:00<00:15, 13.42it/s]

Epoch: 19	train: acc = 0.24988797610156835 loss = 0.5951078534126282	val: acc = 0.19975859987929995 loss = 0.6157500147819519


100%|██████████| 215/215 [00:49<00:00,  4.35it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 20	train: acc = 0.23826166236134327 loss = 0.5937170386314392	val: acc = 0.18889911070223858 loss = 0.6143258213996887


100%|██████████| 215/215 [00:51<00:00,  4.20it/s]
  0%|          | 1/215 [00:00<00:31,  6.80it/s]

Epoch: 21	train: acc = 0.2320061255742726 loss = 0.5901762843132019	val: acc = 0.18092307692307694 loss = 0.6137416958808899


100%|██████████| 215/215 [00:49<00:00,  4.34it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 22	train: acc = 0.246504949746845 loss = 0.590142011642456	val: acc = 0.19388379204892967 loss = 0.6140758991241455


100%|██████████| 215/215 [00:52<00:00,  4.11it/s]
  0%|          | 1/215 [00:00<00:29,  7.25it/s]

Epoch: 23	train: acc = 0.2465319662243667 loss = 0.58939528465271	val: acc = 0.1875574624578609 loss = 0.6189180612564087


100%|██████████| 215/215 [00:51<00:00,  4.19it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 24	train: acc = 0.24956488838441168 loss = 0.5883033871650696	val: acc = 0.18918918918918917 loss = 0.6176667213439941


100%|██████████| 215/215 [00:52<00:00,  4.09it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 25	train: acc = 0.26462313046053604 loss = 0.5879297256469727	val: acc = 0.20282876918447187 loss = 0.6173470616340637


100%|██████████| 215/215 [00:52<00:00,  4.10it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 26	train: acc = 0.2909116951944425 loss = 0.5860984921455383	val: acc = 0.23070251517779702 loss = 0.6185218691825867


100%|██████████| 215/215 [00:53<00:00,  4.04it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 27	train: acc = 0.2264740335806326 loss = 0.5868477821350098	val: acc = 0.1564885496183206 loss = 0.6205868721008301


100%|██████████| 215/215 [00:52<00:00,  4.13it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 28	train: acc = 0.24152509800907063 loss = 0.5851436853408813	val: acc = 0.17809494260006203 loss = 0.6176840662956238


100%|██████████| 215/215 [00:54<00:00,  3.97it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 29	train: acc = 0.6302839116719243 loss = 0.5869606733322144	val: acc = 0.6043384498287454 loss = 0.6247021555900574


100%|██████████| 215/215 [00:51<00:00,  4.14it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 30	train: acc = 0.6231679094883004 loss = 0.5899041891098022	val: acc = 0.595651048407973 loss = 0.6294434666633606


100%|██████████| 215/215 [00:53<00:00,  4.01it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 31	train: acc = 0.24397752636034786 loss = 0.5825608372688293	val: acc = 0.16791979949874686 loss = 0.6238933801651001


100%|██████████| 215/215 [00:51<00:00,  4.18it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 32	train: acc = 0.27311859878284106 loss = 0.5844873785972595	val: acc = 0.2028811524609844 loss = 0.6274076700210571


100%|██████████| 215/215 [00:53<00:00,  4.05it/s]
  0%|          | 1/215 [00:00<00:28,  7.40it/s]

Epoch: 33	train: acc = 0.25842611527683107 loss = 0.5803289413452148	val: acc = 0.1814254859611231 loss = 0.6259216070175171


100%|██████████| 215/215 [00:53<00:00,  4.00it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 34	train: acc = 0.6280177789147189 loss = 0.5852150321006775	val: acc = 0.5999742168364058 loss = 0.6324883103370667


100%|██████████| 215/215 [00:52<00:00,  4.10it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 35	train: acc = 0.631914624905279 loss = 0.5834996104240417	val: acc = 0.604521209042418 loss = 0.6346371173858643


100%|██████████| 215/215 [00:53<00:00,  3.98it/s]
  0%|          | 1/215 [00:00<00:26,  8.19it/s]

Epoch: 36	train: acc = 0.6333502024291499 loss = 0.5819883346557617	val: acc = 0.602467879404656 loss = 0.6353491544723511


100%|██████████| 215/215 [00:53<00:00,  4.05it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 37	train: acc = 0.2800942979224989 loss = 0.5790876150131226	val: acc = 0.21212121212121213 loss = 0.630531907081604


100%|██████████| 215/215 [00:52<00:00,  4.09it/s]
  0%|          | 1/215 [00:00<00:35,  5.99it/s]

Epoch: 38	train: acc = 0.28308958624237524 loss = 0.5796370506286621	val: acc = 0.20736779560308974 loss = 0.633292019367218


100%|██████████| 215/215 [00:51<00:00,  4.16it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 39	train: acc = 0.27702351806513836 loss = 0.578331708908081	val: acc = 0.20227408737283065 loss = 0.6318071484565735


100%|██████████| 215/215 [00:51<00:00,  4.15it/s]
  0%|          | 1/215 [00:00<00:32,  6.55it/s]

Epoch: 40	train: acc = 0.6276524028121088 loss = 0.5827762484550476	val: acc = 0.596124031007752 loss = 0.6420694589614868


100%|██████████| 215/215 [00:51<00:00,  4.19it/s]
  0%|          | 1/215 [00:00<00:29,  7.17it/s]

Epoch: 41	train: acc = 0.2566542750929368 loss = 0.593201220035553	val: acc = 0.2054054054054054 loss = 0.639641284942627


100%|██████████| 215/215 [00:50<00:00,  4.27it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 42	train: acc = 0.2531454833119868 loss = 0.5886709690093994	val: acc = 0.19065077910174152 loss = 0.6284730434417725


100%|██████████| 215/215 [00:50<00:00,  4.23it/s]
  0%|          | 1/215 [00:00<00:22,  9.46it/s]

Epoch: 43	train: acc = 0.25806451612903225 loss = 0.5836211442947388	val: acc = 0.20108695652173914 loss = 0.6270461082458496


100%|██████████| 215/215 [00:51<00:00,  4.19it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 44	train: acc = 0.2879184861717613 loss = 0.5805403590202332	val: acc = 0.2207868467410452 loss = 0.6291257739067078


100%|██████████| 215/215 [00:52<00:00,  4.08it/s]
  0%|          | 1/215 [00:00<00:26,  8.03it/s]

Epoch: 45	train: acc = 0.2725228601409084 loss = 0.5787485837936401	val: acc = 0.19085365853658537 loss = 0.632729172706604


100%|██████████| 215/215 [00:51<00:00,  4.14it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 46	train: acc = 0.26320175770891735 loss = 0.5767174959182739	val: acc = 0.18600368324125233 loss = 0.6330915093421936


100%|██████████| 215/215 [00:51<00:00,  4.15it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 47	train: acc = 0.28743129351410773 loss = 0.5812234282493591	val: acc = 0.21523081446633344 loss = 0.640630304813385


100%|██████████| 215/215 [00:53<00:00,  4.04it/s]
  0%|          | 0/215 [00:00<?, ?it/s]

Epoch: 48	train: acc = 0.2806365258774539 loss = 0.5744100213050842	val: acc = 0.2044977511244378 loss = 0.637283980846405


100%|██████████| 215/215 [00:52<00:00,  4.09it/s]


Epoch: 49	train: acc = 0.2353207205849099 loss = 0.6094543933868408	val: acc = 0.19382451849587282 loss = 0.6352572441101074




KeyboardInterrupt: 