In [1]:
import pandas as pd
from tqdm.notebook import tqdm,trange

In [2]:
def import_labelled_data(path="data/labelled/data.json", group_relevant=True):
    data = pd.read_json(path, encoding="latin-1")
    if group_relevant:
        data["class"] = data["class"].apply(
            lambda x: "relevant" if x != "irrelevant" else x
        )
    return data


print("Loading data...")

data = import_labelled_data(
    path="../../data/level-0.5/data.json", group_relevant=False
)

print("Data loaded.")

data = data.sample(frac=1).reset_index(drop=True)

data = data.sample(2500)



data.info()

Loading data...
Data loaded.
<class 'pandas.core.frame.DataFrame'>
Index: 2500 entries, 4107 to 5998
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   url           2500 non-null   object
 1   text          2500 non-null   object
 2   relevance     2500 non-null   object
 3   multiclasses  2500 non-null   object
dtypes: object(4)
memory usage: 97.7+ KB


In [3]:
from sklearn.model_selection import train_test_split

xTrain, xTest, yTrain, yTest = train_test_split(data["text"], data["relevance"], test_size=0.2, random_state=42)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

print("Vectorizing data...")

vectorizer = CountVectorizer(
    stop_words="english"
)


Vectorizing data...


In [5]:
time = %timeit -n1 -r1 -o vectorizer.fit(data['text'][:500])

print(f"Vectorization took {time.average:.2f} seconds.")
print(f"Vectorization of the whole dataset is estimated to take {time.average * len(data) / 500:.2f} seconds.")

10 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
Vectorization took 10.03 seconds.
Vectorization of the whole dataset is estimated to take 50.16 seconds.


In [6]:
time = %timeit -n1 -r1 -o vectorizer.transform(data['text'][:500])

print(f"Vectorization took {time.average:.2f} seconds.")
print(f"Vectorization of the whole dataset is estimated to take {time.average * len(data) / 500:.2f} seconds.")

8.89 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
Vectorization took 8.89 seconds.
Vectorization of the whole dataset is estimated to take 44.45 seconds.


In [7]:
print("Fitting vectorizer...")
vectorizer.fit(data["text"])
print("Vectorizer fitted.")


print("Vectorizing data...")
xTrainVector = vectorizer.transform(xTrain)
print("Training data vectorized.")
xTestVector = vectorizer.transform(xTest)
print("Testing data vectorized.")

yTrainVector = yTrain.apply(lambda x: 1 if x == "relevant" else 0)
yTestVector = yTest.apply(lambda x: 1 if x == "relevant" else 0)


Fitting vectorizer...
Vectorizer fitted.
Vectorizing data...
Training data vectorized.
Testing data vectorized.


In [8]:
VOCAB_SIZE = len(vectorizer.vocabulary_)
NUM_CLASSES = len(data["relevance"].unique())

In [9]:
import torch
from torch import nn
from torch.nn import functional

class LogisticRegressionClassifier(nn.Module):

    def __init__(self, num_labels=NUM_CLASSES, vocab_size=VOCAB_SIZE, hidden_dim=3):
        super(LogisticRegressionClassifier, self).__init__()

        # two linear layers then sigmoid
        self.linear = nn.Linear(vocab_size, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, num_labels)
        self.sigmoid = nn.Sigmoid()

    def forward(self, vector):
        x = self.linear(vector)
        x = self.linear2(x)
        return self.sigmoid(x)[:,-1]

model = LogisticRegressionClassifier()

In [10]:
xTrainTensor = torch.tensor(xTrainVector.todense(), dtype=torch.float16)
xTestTensor = torch.tensor(xTestVector.todense(), dtype=torch.float16)
yTrainTensor = torch.tensor(yTrainVector.values, dtype=torch.float16)
yTestTensor = torch.tensor(yTestVector.values, dtype=torch.float16)



In [11]:
with torch.no_grad():
    sample = xTrainTensor[0:1]
    print(model(sample))

tensor([0.5295])


In [12]:
from torch.utils.data import DataLoader, TensorDataset

train_data = TensorDataset(xTrainTensor, yTrainTensor)
test_data = TensorDataset(xTestTensor, yTestTensor)

In [13]:
train_loader = DataLoader(train_data, batch_size=32, shuffle=True,)

In [14]:
model = LogisticRegressionClassifier(hidden_dim=3)
print(model)

LogisticRegressionClassifier(
  (linear): Linear(in_features=527099, out_features=3, bias=True)
  (linear2): Linear(in_features=3, out_features=2, bias=True)
  (sigmoid): Sigmoid()
)


In [15]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

loss_fn = nn.BCELoss()

In [16]:
EPOCHS = 25


with trange(EPOCHS) as epochs:
    for epoch in epochs:
        for x, y in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}", unit="batch",leave=False,position=0):
            optimizer.zero_grad()
            y_pred = model(x)
            loss = loss_fn(y_pred, y)

            loss.backward()
            optimizer.step()
        epochs.set_postfix(loss=loss.item())


  0%|          | 0/25 [00:00<?, ?it/s]

Epoch 1/25:   0%|          | 0/63 [00:00<?, ?batch/s]

Epoch 2/25:   0%|          | 0/63 [00:00<?, ?batch/s]

Epoch 3/25:   0%|          | 0/63 [00:00<?, ?batch/s]

Epoch 4/25:   0%|          | 0/63 [00:00<?, ?batch/s]

Epoch 5/25:   0%|          | 0/63 [00:00<?, ?batch/s]

Epoch 6/25:   0%|          | 0/63 [00:00<?, ?batch/s]

Epoch 7/25:   0%|          | 0/63 [00:00<?, ?batch/s]

Epoch 8/25:   0%|          | 0/63 [00:00<?, ?batch/s]

Epoch 9/25:   0%|          | 0/63 [00:00<?, ?batch/s]

Epoch 10/25:   0%|          | 0/63 [00:00<?, ?batch/s]

Epoch 11/25:   0%|          | 0/63 [00:00<?, ?batch/s]

Epoch 12/25:   0%|          | 0/63 [00:00<?, ?batch/s]

Epoch 13/25:   0%|          | 0/63 [00:00<?, ?batch/s]

Epoch 14/25:   0%|          | 0/63 [00:00<?, ?batch/s]

Epoch 15/25:   0%|          | 0/63 [00:00<?, ?batch/s]

Epoch 16/25:   0%|          | 0/63 [00:00<?, ?batch/s]

Epoch 17/25:   0%|          | 0/63 [00:00<?, ?batch/s]

Epoch 18/25:   0%|          | 0/63 [00:00<?, ?batch/s]

Epoch 19/25:   0%|          | 0/63 [00:00<?, ?batch/s]

Epoch 20/25:   0%|          | 0/63 [00:00<?, ?batch/s]

Epoch 21/25:   0%|          | 0/63 [00:00<?, ?batch/s]

Epoch 22/25:   0%|          | 0/63 [00:00<?, ?batch/s]

Epoch 23/25:   0%|          | 0/63 [00:00<?, ?batch/s]

Epoch 24/25:   0%|          | 0/63 [00:00<?, ?batch/s]

Epoch 25/25:   0%|          | 0/63 [00:00<?, ?batch/s]

In [17]:
with torch.no_grad():
    model.eval()
    yPred = model(xTestTensor)
    yPred = yPred.squeeze().round().detach().numpy()

print(yPred)

[0. 0. 1. 0. 1. 1. 0. 1. 1. 0. 1. 0. 0. 1. 1. 0. 0. 1. 1. 1. 0. 0. 1. 1.
 1. 0. 1. 1. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1.
 0. 0. 0. 1. 0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0.
 1. 1. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0.
 1. 0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0.
 1. 0. 1. 0. 1. 1. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 1. 0. 1. 1. 0. 0. 0. 0. 1. 1. 1. 0. 0.
 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 1. 1. 0. 1. 1. 1. 1. 1.
 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0.
 0. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0.
 0. 0. 1. 1. 1. 1. 1. 0. 1. 0. 0. 1. 1. 1. 0. 1. 0. 1. 1. 0. 1. 1. 1. 0.
 1. 0. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 0. 1. 1.

In [18]:
from sklearn.metrics import classification_report

print(classification_report(yTestVector, yPred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       272
           1       1.00      1.00      1.00       228

    accuracy                           1.00       500
   macro avg       1.00      1.00      1.00       500
weighted avg       1.00      1.00      1.00       500



In [23]:
text = """Shark catch rates are higher in pelagic longline fisheries than in any other fishery, and sharks are
typically discarded (bycatch) at sea. The post-release fate of discarded sharks is largely
unobserved and could pose a significant source of unquantified mortality that may change stock
assessment outcomes and prevent sound conservation and management advice. This study
assessed post-release mortality rates of blue (Prionace glauca), bigeye thresher (Alopias
superciliosus), oceanic whitetip (Carcharhinus longimanus), silky (C. falciformis) and shortfin
mako (Isurus oxyrhincus) sharks discarded in the Hawaii deep-set and American Samoa longline
fisheries targeting tuna in the central Pacific Ocean. The impacts on survival rates were
examined considering species, fishery, fishing gear configuration, handling method, animal
condition at capture and at release, and the amount of trailing fishing gear remaining on
discarded sharks. Bayesian survival analysis showed that the condition at release (good vs.
injured), branchline leader material, and the amount of trailing fishing gear left on the animals
were among the factors that had the largest effect on post-release fate—animals captured on
monofilament branchline leaders and released in good condition without trailing fishing gear had
the highest rates of survival. This study shows that fisher behavior can have a significant impact
on pelagic shark post-release mortality. Ensuring that sharks are handled carefully and released
with minimal amounts of trailing fishing gear may reduce fishing mortality on shark populations."""

text = """NumPy is an essential package for high-performance scientific computing and data analysis in the Python ecosystem. It is the foundation of many higher-level tools such as Pandas and scikit-learn.

TensorFlow also uses NumPy arrays as the foundation for building Tensor objects and graph flow for deep learning tasks. These heavily rely on linear algebra operations on large lists, vectors, and matrices of numbers.

NumPy is faster because it uses vectorized implementation, and many of its core functions are written in C."""

In [24]:
torch.save(model.state_dict(), "models/LinearRegression.pth")

In [26]:
v = vectorizer.transform([text])
t = torch.tensor(v.todense(), dtype=torch.float32)

In [27]:
model(t)

tensor([0.3013], grad_fn=<SelectBackward0>)