In [1]:
import pandas as pd
import numpy as np
from src.basic_project import Text_preprocessing
from src.basic_project import preprocessing, extract_data

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
texts, labels = extract_data('data/projects.xlsx', 'data/SciVocCodes.xlsx')
texts = preprocessing(texts)

In [3]:
from gensim.matutils import corpus2dense, corpus2csc
from gensim.corpora import Dictionary

D = Dictionary([sentence.split(' ') for sentence in texts])
n_tokens = len(D)

reviews_bow = [D.doc2bow(doc.split(' ')) for doc in texts]

from gensim.models import TfidfModel

tfidf = TfidfModel(reviews_bow)
reviews_tfidf = tfidf[reviews_bow]

num_docs = len(reviews_bow)

corpus_tfidf_sparse = corpus2csc(reviews_tfidf, num_terms=n_tokens, num_docs=num_docs).T

INFO - 15:42:36: adding document #0 to Dictionary<0 unique tokens: []>
INFO - 15:42:37: adding document #10000 to Dictionary<59683 unique tokens: ['action', 'affected', 'africa', 'african', 'already']...>
INFO - 15:42:38: adding document #20000 to Dictionary<90830 unique tokens: ['action', 'affected', 'africa', 'african', 'already']...>
INFO - 15:42:40: adding document #30000 to Dictionary<117447 unique tokens: ['action', 'affected', 'africa', 'african', 'already']...>
INFO - 15:42:40: built Dictionary<122697 unique tokens: ['action', 'affected', 'africa', 'african', 'already']...> from 32052 documents (total 5701572 corpus positions)
INFO - 15:42:40: Dictionary lifecycle event {'msg': "built Dictionary<122697 unique tokens: ['action', 'affected', 'africa', 'african', 'already']...> from 32052 documents (total 5701572 corpus positions)", 'datetime': '2023-01-03T15:42:40.416152', 'gensim': '4.3.0', 'python': '3.8.10 | packaged by conda-forge | (default, May 11 2021, 07:01:05) \n[GCC 9.3

In [5]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV, train_test_split

corpus_bow_train, corpus_bow_test, y_train, y_test = train_test_split(corpus_tfidf_sparse, labels, test_size=0.1, 
                                                                      random_state=42)

In [6]:
grid_param = {'kernel':('linear', 'rbf'), 'C':[0.1 ,1, 10]}
svc = svm.SVC(class_weight='balanced')
bow_grid = GridSearchCV(svc, grid_param, refit=True, cv = 2, n_jobs=-1)
bow_grid.fit(corpus_bow_train, y_train)

In [7]:
bow_grid.score(corpus_bow_test,y_test)

0.6681222707423581

In [8]:
from sklearn.metrics import balanced_accuracy_score

y_pred = bow_grid.predict(corpus_bow_test)
print(balanced_accuracy_score(y_pred, y_test))

0.6138700940248253


In [10]:
print(balanced_accuracy_score(y_test, y_pred))

0.6597171278603078


In [11]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.61      0.82      0.70       488
           1       0.79      0.60      0.68      1326
           2       0.61      0.73      0.67       659
           3       0.38      0.41      0.40        73
           4       0.64      0.64      0.64       549
           5       0.65      0.76      0.70       111

    accuracy                           0.67      3206
   macro avg       0.61      0.66      0.63      3206
weighted avg       0.69      0.67      0.67      3206



In [14]:
reviews_tfidf

<gensim.interfaces.TransformedCorpus at 0x7f38a84049a0>

In [19]:
import torch
import torchtext
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
        self.glove = torchtext.vocab.GloVe(name='6B', dim=300)
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, index):
        
        if self.texts[index] == None:
            embeddings = torch.zeros(200, 300)
        else:
            embeddings = self.glove.get_vecs_by_tokens(self.texts[index].split(' '), lower_case_backup=True)
        if embeddings.shape[0] > 200:
            embeddings = embeddings[:200]
        else:
            embeddings = torch.cat((torch.zeros(200-embeddings.shape[0], 300), embeddings), dim=0)
            
        return embeddings, int(self.labels[index])

In [20]:
dataset = CustomDataset(projects['title'], projects['euroSciVocCode'].to_numpy())
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [0.9, 0.1])
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=128, shuffle=True)

INFO - 09:39:22: Loading vectors from .vector_cache/glove.6B.300d.txt.pt


In [28]:
class Model(torch.nn.Module):
    
    def __init__(self):
        
        super().__init__()
        
        self.lstm = torch.nn.LSTM(300, 100, 1, batch_first=True)
        self.mlp = torch.nn.Linear(100, 6)
        
    def forward(self, inputs):
        outputs, _ = self.lstm(inputs)
        outputs = self.mlp(outputs[:, -1, :])
        
        return outputs
    

In [29]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [30]:
def accuracy(logits: torch.Tensor, labels: torch.Tensor) -> float:
    """
    This method computes accuracy from logits and labels
    
    Parameters
    ----------
    logits : torch.Tensor
        batch of logits. Dimensions: [batch, number of classes]
    labels : torch.Tensor
        batch of labels. Dimensions: [batch]
        
    Returns
    -------
    float
        accuracy of predictions
    """

    # compute predictions
    predictions = logits.argmax(1).type_as(labels)

    # compute accuracy from predictions
    result = float(predictions.eq(labels).float().mean().cpu().detach().numpy())

    return result

In [31]:
import sklearn
y = projects['euroSciVocCode'].to_numpy()
class_weights=sklearn.utils.class_weight.compute_class_weight('balanced',classes=np.unique(y), y=y)
class_weights=torch.tensor(class_weights,dtype=torch.float)
class_weights

tensor([1.0387, 0.4266, 0.8065, 8.2694, 0.8960, 4.6291])

In [34]:
loss = torch.nn.CrossEntropyLoss()
model = Model().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

for epoch in range(30):
    
    train_losses = []
    train_accuracies = []
    val_losses = []
    val_accuracies = []
    
    model.train()
    
    for texts, labels in train_dataloader:
        texts = texts.to(device)
        labels = labels.to(device)
        
        outputs = model(texts)
        loss_value = loss(outputs, labels)
        
        optimizer.zero_grad()
        loss_value.backward()
        optimizer.step()
        
        train_losses.append(loss_value.item())
        train_accuracies.append(accuracy(outputs, labels))
        
    model.eval()
    with torch.no_grad():
        
        for texts, labels in val_dataloader:
            texts = texts.to(device)
            labels = labels.to(device)
            
            outputs = model(texts)
            loss_value = loss(outputs, labels)
            
            val_losses.append(loss_value.item())
            val_accuracies.append(accuracy(outputs, labels))
            
    print(f'train loss: {np.mean(train_losses)}')
    print(f'val loss: {np.mean(val_losses)}')
    print(f'train accuracy: {np.mean(train_accuracies)}')
    print(f'val accuracy: {np.mean(val_accuracies)}')
        

train loss: 1.1552707181040165
val loss: 1.115949197457387
train accuracy: 0.5490807698363751
val accuracy: 0.5644831731915474


KeyboardInterrupt: 

In [70]:
import sklearn
class_weights=sklearn.utils.class_weight.compute_class_weight('balanced',classes=np.unique(y), y=y)
class_weights=torch.tensor(class_weights,dtype=torch.float)