In [3]:
import pandas as pd
from tqdm.notebook import tqdm

In [4]:
def import_labelled_data(path="data/labelled/data.json", group_relevant=True):
    data = pd.read_json(path, encoding="latin-1")
    if group_relevant:
        data["class"] = data["class"].apply(
            lambda x: "relevant" if x != "irrelevant" else x
        )
    return data


print("Loading data...")

data = import_labelled_data(
    path="../data/labelled/data.json", group_relevant=False
)

# drop null classes
data = data.dropna(subset=["class"])

print("Data loaded.")

data = data.sample(frac=1).reset_index(drop=True)

data = data.sample(10)



data.info()

Loading data...
Data loaded.
<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, 8808 to 7824
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   url     10 non-null     object
 1   text    10 non-null     object
 2   class   10 non-null     object
dtypes: object(3)
memory usage: 320.0+ bytes


In [5]:
from sklearn.feature_extraction.text import CountVectorizer

print("Vectorizing data...")

vectorizer = CountVectorizer(
    stop_words="english"
)

vectors = vectorizer.fit_transform(data["text"])


Vectorizing data...


In [10]:
vectors.shape

(10, 2070)

In [14]:
import torch
from torch import nn
from torch.nn import functional

class LogisticRegressionClassifier(nn.Module):

    def __init__(self, num_labels=2, vocab_size=vectors.shape[1]):
        super(LogisticRegressionClassifier, self).__init__()

        self.linear = nn.Linear(vocab_size, num_labels)

    def forward(self, vector):
        return functional.softmax(self.linear(vector), dim=1)

model = LogisticRegressionClassifier()

In [15]:
with torch.no_grad():
    sample = vectors[0]
    print(model(sample))

TypeError: linear(): argument 'input' (position 1) must be Tensor, not csr_matrix