In [1]:
from Utils import *
import altair as alt
import numpy as np
import pandas as pd
import tqdm
import NumpyLR
import TorchLR

## Load and tokenize the data

In [3]:
# Read the three CSV splits through the project's `load_df` helper.
train_df, dev_df, test_df = (
    load_df(split_file) for split_file in ("train.csv", "dev.csv", "test.csv")
)

# The original note here said that 2-grams achieve the best performance, yet
# the tokenizer is built with argument 1 -- TODO confirm what the argument means.
tokenizer = Tokenizer(1)
processor = Preprocessor(tokenizer)

# The vocabulary is built from the training split only; the same split is then
# converted into the training matrix.
processor.buildVocabulary(train_df)
train_X = processor.buildMatrix(train_df)

# Training labels are the values of the "party_affiliation" column.
train_Y = train_df["party_affiliation"].values


Build Voc: 100%|███████████████████████████████████████████████████████████████████████████| 59999/59999 [00:32<00:00, 1846.16it/s]
Build Matrix: 59999it [00:32, 1843.77it/s]


## NumPy Training

In [None]:
# Build the NumPy logistic-regression model around the fitted preprocessor.
numpy_model = NumpyLR.NumpyLR(processor)
# Alternative configuration kept for reference; the original note says a batch
# size of 1e4 achieves high performance.
# NumpyLR.train(train_X, train_Y, numpy_model, lr=5e-4, num_step=3e3, batch_size=10000, fig_path="NumpyLR.png")
# Active run: batch_size=1 with num_step=1e3.
# NOTE(review): num_step is passed as a float (1e3) -- confirm that
# NumpyLR.train accepts a non-integer step count.
likelihood = NumpyLR.train(train_X, train_Y, numpy_model, lr=5e-5, num_step=1e3, batch_size=1, fig_path="NumpyLR.png")
# Evaluate the trained model on the dev split; the "F1" entry of `score` is
# used further down when the model file name is built.
score = evaluate(numpy_model, dev_df, dev_df["party_affiliation"].values)
# Bare last expression so the notebook displays the evaluation result.
score

In [None]:
# Show what get_token_weights() returns for the NumPy model trained above.
# The trained instance is bound to `numpy_model`; the name `NumpyModel` is never
# defined in this notebook, so the original call raised NameError on a fresh kernel.
numpy_model.get_token_weights()

In [None]:
# NOTE(review): `output` is not defined in any visible cell -- presumably it
# holds the predictions for test_df. TODO confirm where it is produced before
# relying on this cell after a kernel restart.
csv = merge_df(test_df, output)
# NOTE(review): the file is named "TorchLR.csv" although this cell sits in the
# NumPy section -- confirm which model produced `output`.
csv.to_csv("TorchLR.csv")
# Persist the NumPy model; the file name embeds the F1 entry of `score`
# (rounded to 4 decimals, then multiplied by 1000).
# Fixes: the instance is `numpy_model` (the name `NumpyModel` is never defined),
# and the original call was missing its closing parenthesis (SyntaxError).
numpy_model.save("/home/panda/models/NumpyLR{}.pkl".format(1000 * round(score["F1"], 4)))

## PyTorch Training

#### Test the regularization

In [None]:
# Regularisation sweep: one fresh model per weight-decay value.
losses, F1 = [], []
lr = 1e-3
for l2 in (0, 1e-3, 0.1):
    TorchModel = TorchLR.LogisticRegression(processor)
    # NOTE(review): this optimizer is constructed but never handed to
    # TorchLR.train below, so `l2` may have no effect on training -- confirm
    # how train() obtains its optimizer. `torch` is also not imported by name
    # in the visible import cell (presumably it arrives via `from Utils import *`).
    optimizer = torch.optim.SGD(TorchModel.parameters(), lr=lr, weight_decay=l2)
    score = TorchLR.train(
        train_X,
        train_Y,
        TorchModel,
        lr=lr,
        epoch_num=1,
        epoch_step=60,
    )
    # Keep the "loss" and "F1" entries of every run for the plotting cells.
    losses.append(score["loss"])
    F1.append(score["F1"])

#### Test the optimizer type

In [None]:
# Optimizer sweep: one fresh model per optimizer class.
losses, F1 = [], []
lr = 1e-3
for optimizer in (torch.optim.SGD, torch.optim.RMSprop, torch.optim.AdamW):
    TorchModel = TorchLR.LogisticRegression(processor)
    # `optimizer` enters the iteration as the class and is rebound to the
    # instance on the next line.
    # NOTE(review): the instance is never handed to TorchLR.train below, so the
    # optimizer choice may have no effect on training -- confirm how train()
    # obtains its optimizer.
    optimizer = optimizer(TorchModel.parameters(), lr=lr)
    score = TorchLR.train(
        train_X,
        train_Y,
        TorchModel,
        lr=lr,
        epoch_num=1,
        epoch_step=60,
    )
    # Keep the "loss" and "F1" entries of every run for the plotting cells.
    losses.append(score["loss"])
    F1.append(score["F1"])

#### Different tokenizer

In [4]:
# Tokenizer sweep: for each tokenizer setting, rebuild the vocabulary and the
# training matrix, then train a fresh model on it.
# Note that `tokenizer`, `processor` and `train_X` are rebound on every
# iteration, so after this cell they reflect the last setting (level 1).
losses, F1 = [], []
for level in (0, 1):
    tokenizer = Tokenizer(level)
    processor = Preprocessor(tokenizer)
    processor.buildVocabulary(train_df)
    train_X = processor.buildMatrix(train_df)

    TorchModel = TorchLR.LogisticRegression(processor)
    score = TorchLR.train(
        train_X,
        train_Y,
        TorchModel,
        lr=1e-3,
        epoch_num=1,
        epoch_step=60,
    )
    # Keep the "loss" and "F1" entries of every run for the plotting cells.
    losses.append(score["loss"])
    F1.append(score["F1"])

Build Voc: 100%|███████████████████████████████████████████████████████████████████████████| 59999/59999 [00:06<00:00, 9923.27it/s]
Build Matrix: 59999it [00:08, 7272.58it/s]
Training Epoch:   0%|                                                                                        | 0/1 [00:00<?, ?it/s]
Training Step in Epoch 0:   0%|                                                                             | 0/60 [00:00<?, ?it/s][A
Training Step in Epoch 0:   5%|███▍                                                                 | 3/60 [00:00<00:01, 29.31it/s][A
Training Step in Epoch 0:  10%|██████▉                                                              | 6/60 [00:00<00:01, 28.93it/s][A
Training Step in Epoch 0:  15%|██████████▎                                                          | 9/60 [00:00<00:01, 28.48it/s][A
Training Step in Epoch 0:  20%|█████████████▌                                                      | 12/60 [00:00<00:01, 28.62it/s][A
Training Step in E

#### Different learning rate

In [11]:
# Learning-rate sweep. The preprocessor and training matrix are rebuilt once
# with Tokenizer(1) before the loop, so every run sees the same features.
losses, F1 = [], []

tokenizer = Tokenizer(1)
processor = Preprocessor(tokenizer)
processor.buildVocabulary(train_df)
train_X = processor.buildMatrix(train_df)

for lr in (1e-3, 5e-5, 1e-6):
    # Fresh model per learning rate.
    TorchModel = TorchLR.LogisticRegression(processor)
    score = TorchLR.train(
        train_X,
        train_Y,
        TorchModel,
        lr=lr,
        epoch_num=1,
        epoch_step=60,
    )
    # Keep the "loss" and "F1" entries of every run for the plotting cells.
    losses.append(score["loss"])
    F1.append(score["F1"])

Build Voc: 100%|███████████████████████████████████████████████████████████████████████████| 59999/59999 [00:32<00:00, 1874.80it/s]
Build Matrix: 59999it [00:34, 1742.07it/s]
Training Epoch:   0%|                                                                                        | 0/1 [00:00<?, ?it/s]
Training Step in Epoch 0:   0%|                                                                             | 0/60 [00:00<?, ?it/s][A
Training Step in Epoch 0:  10%|██████▉                                                              | 6/60 [00:00<00:00, 59.77it/s][A
Training Step in Epoch 0:  20%|█████████████▌                                                      | 12/60 [00:00<00:00, 50.16it/s][A
Training Step in Epoch 0:  30%|████████████████████▍                                               | 18/60 [00:00<00:00, 49.68it/s][A
Training Step in Epoch 0:  40%|███████████████████████████▏                                        | 24/60 [00:00<00:00, 49.35it/s][A
Training Step in E

#### Plot Loss

In [12]:
# Reshape the collected losses to long form for Altair: after the transpose each
# run is a column; melt() puts the run label in "variable" and the number in
# "value", and reset_index() exposes the row position as "index".
# Use the label set that matches the experiment which filled `losses`:
#   regularisation: ["0", "1e-3", "0.1"]
#   optimizer:      ["SGD", "RMSprop", "AdamW"]
#   tokenizer:      ["tokenizer", "better_tokenizer"]
loss_df = (
    pd.DataFrame(losses)
    .T
    .set_axis(["1e-3", "5e-5", "1e-6"], axis=1)
    .melt(ignore_index=False)
    .reset_index()
)
# One line per run, coloured by its label.
alt.Chart(loss_df).mark_line().encode(
    x=alt.X("index:Q"),
    y=alt.Y("value:Q"),
    color=alt.Color("variable:N"),
)

#### Plot F1

In [13]:
# Reshape the collected F1 values to long form for Altair: after the transpose
# each run is a column; melt() puts the run label in "variable" and the number
# in "value", and reset_index() exposes the row position as "index".
# Use the label set that matches the experiment which filled `F1`:
#   regularisation: ["0", "1e-3", "1e-1"]
#   optimizer:      ["SGD", "RMSprop", "AdamW"]
#   tokenizer:      ["tokenizer", "better_tokenizer"]
F1_df = (
    pd.DataFrame(F1)
    .T
    .set_axis(["1e-3", "5e-5", "1e-6"], axis=1)
    .melt(ignore_index=False)
    .reset_index()
)
# One line per run, coloured by its label.
alt.Chart(F1_df).mark_line().encode(
    x=alt.X("index:Q"),
    y=alt.Y("value:Q"),
    color=alt.Color("variable:N"),
)

## N-gram size

In [13]:
%%time
# Vocabulary statistics for every (min_frequency, tokenizer argument) pair.
# result[min_freq][i] is a dict with:
#   "matrix" - the training matrix built with that configuration
#   "size"   - number of entries in the training vocabulary
#   "max", "min", "mean" - statistics over the column sums of the matrix
#   "pec"    - overlap ratio between dev and train vocabularies (see below)
result = dict()
for min_freq in [5, 10, 100]:
    result[min_freq] = dict()
    for i in range(1, 4):
        info = dict()
        tokenizer = Tokenizer(i)
        train_processor = Preprocessor(tokenizer, min_frequency=min_freq)
        train_processor.buildVocabulary(train_df)
        # NOTE: this rebinds the notebook-level `train_X`; after the cell it
        # holds the matrix of the last configuration only.
        train_X = train_processor.buildMatrix(train_df)
        info["matrix"] = train_X
        # Column sums of the training matrix.
        freq = np.sum(train_X, axis=0)
        info["size"] = len(train_processor.voc)
        info["max"], info["min"], info["mean"] = np.max(freq), np.min(freq), np.mean(freq)
        # (Removed the unused `values = np.array(train_processor.voc.values())`:
        # it was never read.)
        # A second preprocessor builds a vocabulary from the dev split alone.
        dev_processor = Preprocessor(tokenizer, min_frequency=min_freq)
        dev_processor.buildVocabulary(dev_df)
        # Ratio of voc values shared by dev and train to the train vocabulary size.
        # NOTE(review): if `voc` maps token -> index, intersecting .values()
        # compares indices rather than tokens -- confirm the mapping direction.
        info["pec"] = len(set(dev_processor.voc.values()) & set(train_processor.voc.values())) / len(train_processor.voc)
        result[min_freq][i] = info

Build Voc: 100%|███████████████████████████████████████████████████████████████████████████| 59999/59999 [00:36<00:00, 1633.09it/s]
Build Matrix: 59999it [00:36, 1632.70it/s]
Build Voc: 100%|███████████████████████████████████████████████████████████████████████████| 20000/20000 [00:11<00:00, 1692.60it/s]
Build Voc: 100%|███████████████████████████████████████████████████████████████████████████| 59999/59999 [00:35<00:00, 1673.38it/s]
Build Matrix: 59999it [00:38, 1540.83it/s]
Build Voc: 100%|███████████████████████████████████████████████████████████████████████████| 20000/20000 [00:12<00:00, 1643.82it/s]
Build Voc: 100%|███████████████████████████████████████████████████████████████████████████| 59999/59999 [00:37<00:00, 1608.88it/s]
Build Matrix: 59999it [00:40, 1482.26it/s]
Build Voc: 100%|███████████████████████████████████████████████████████████████████████████| 20000/20000 [00:12<00:00, 1615.55it/s]
Build Voc: 100%|███████████████████████████████████████████████████████████████

CPU times: user 13min 13s, sys: 4.61 s, total: 13min 17s
Wall time: 13min 9s


In [12]:
# Scratch check of the vocabulary-overlap numbers, using whatever
# `dev_processor` / `train_processor` currently hold (they are assigned in the
# N-gram statistics loop). Only the last expression is displayed by the
# notebook; the first two are evaluated and their results discarded.
# Overlap ratio (same formula as the "pec" entry):
len(set(dev_processor.voc.values()) & set(train_processor.voc.values())) / len(train_processor.voc)
# Number of shared voc values:
len(set(dev_processor.voc.values()) & set(train_processor.voc.values()))
# Size of the dev vocabulary (this is the value shown as the cell output):
len(dev_processor.voc)

7052

In [14]:
# Display the statistics collected by the N-gram loop; the output includes the
# repr of every stored matrix alongside the size/max/min/mean/pec entries.
result

{5: {1: {'matrix': <59999x15845 sparse matrix of type '<class 'numpy.int64'>'
   	with 9575252 stored elements in Compressed Sparse Row format>,
   'size': 15844,
   'max': 154362,
   'min': 5,
   'mean': 604.3074786999053,
   'pec': 0.6718000504923},
  2: {'matrix': <59999x207860 sparse matrix of type '<class 'numpy.int64'>'
   	with 7965423 stored elements in Compressed Sparse Row format>,
   'size': 207859,
   'max': 59999,
   'min': 5,
   'mean': 38.321095929952854,
   'pec': 0.38429416094564106},
  3: {'matrix': <59999x215576 sparse matrix of type '<class 'numpy.int64'>'
   	with 5310107 stored elements in Compressed Sparse Row format>,
   'size': 215575,
   'max': 59999,
   'min': 5,
   'mean': 24.632180762236985,
   'pec': 0.2834326800417488}},
 10: {1: {'matrix': <59999x12122 sparse matrix of type '<class 'numpy.int64'>'
   	with 9550576 stored elements in Compressed Sparse Row format>,
   'size': 12121,
   'max': 154362,
   'min': 10,
   'mean': 787.8713083649562,
   'pec': 0.