In [2]:
# machine learning and NLP libraries
#import torchtext
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, hstack
from sklearn import svm
from sklearn.preprocessing import normalize
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import balanced_accuracy_score, classification_report
from gensim.matutils import corpus2dense, corpus2csc
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.models.ldamodel import LdaModel

# other libraries
import warnings

# own modules
from src.basic_project import preprocessing, extract_data, BasicModel

# ignore warnings
warnings.filterwarnings('ignore')

## Data Extraction and Pre-Processing

In [3]:
texts, labels, costs, contributions = extract_data('data/projects.xlsx', 'data/SciVocCodes.xlsx')
texts = preprocessing(texts)

In [4]:
# get texts in right format
D = Dictionary([sentence.split(' ') for sentence in texts])
n_tokens = len(D)
reviews_bow = [D.doc2bow(doc.split(' ')) for doc in texts]

## Basic Vectorization Techniques

### Tf-Idf

In [6]:
# compute tfidf representations
tfidf = TfidfModel(reviews_bow)
reviews_tfidf = tfidf[reviews_bow]

# create sparse corpus
num_docs = len(reviews_bow)
corpus_tfidf_sparse = corpus2csc(reviews_tfidf, num_terms=n_tokens, num_docs=num_docs).T

In [None]:
model = BasicModel(corpus_tfidf_sparse, labels)

In [None]:
print(f'accuracy: {model.accuracy}')

In [None]:
print(model.classification_report)

### Embeddings

In [8]:
embeddings = np.zeros((len(texts), 300))
glove = torchtext.vocab.GloVe(name='6B', dim=300)
for i in range(len(texts)):
    embeddings[i] = glove.get_vecs_by_tokens(texts[i].split(' '), lower_case_backup=True).mean(dim=0).numpy()

In [9]:
model = BasicModel(embeddings, labels)

In [15]:
print(f'accuracy: {model.accuracy}')

accuracy: 0.6768857690045743


In [16]:
print(model.classification_report)

              precision    recall  f1-score   support

           0       0.58      0.82      0.68       488
           1       0.81      0.52      0.63      1326
           2       0.60      0.70      0.64       659
           3       0.25      0.64      0.36        73
           4       0.61      0.59      0.60       549
           5       0.49      0.79      0.60       111

    accuracy                           0.63      3206
   macro avg       0.56      0.68      0.59      3206
weighted avg       0.67      0.63      0.63      3206



## Basic Vectorization Techniques with Additional Features

In [None]:
costs = normalize(np.expand_dims(costs, 1), axis=1)
contributions = normalize(np.expand_dims(costs, 1))

### Tf-Idf

In [None]:
dataset = hstack((corpus_tfidf_sparse, csr_matrix(costs), csr_matrix(contributions)))

In [None]:
model = BasicModel(dataset, labels)

In [None]:
print(f'accuracy: {model.accuracy}')

In [None]:
print(model.classification_report)

### Embeddings

In [None]:
embeddings = np.zeros((len(texts), 300))
glove = torchtext.vocab.GloVe(name='6B', dim=300)
for i in range(len(texts)):
    embeddings[i] = glove.get_vecs_by_tokens(texts[i].split(' '), lower_case_backup=True).mean(dim=0).numpy()

In [37]:
dataset = np.concatenate((embeddings, costs, contributions), axis=1)

In [42]:
model = BasicModel(dataset, labels)

In [43]:
print(f'accuracy: {model.accuracy}')

accuracy: 0.6770114603067362


In [44]:
print(model.classification_report)

              precision    recall  f1-score   support

           0       0.58      0.82      0.68       488
           1       0.81      0.52      0.63      1326
           2       0.60      0.70      0.64       659
           3       0.25      0.64      0.36        73
           4       0.61      0.59      0.60       549
           5       0.49      0.79      0.60       111

    accuracy                           0.63      3206
   macro avg       0.56      0.68      0.59      3206
weighted avg       0.67      0.63      0.63      3206



### ======== LDA MODEL ========

LDA

In [46]:
# compute LDA model
num_topics = 90
ldag = LdaModel(corpus=reviews_bow, id2word=D, num_topics=num_topics)

In [47]:
# create sparse corpus
corpus_ldag = ldag[reviews_bow]
num_docs = len(reviews_bow)
corpus_ldag_sparse = corpus2csc(corpus_ldag, num_terms=n_tokens, num_docs=num_docs).T

In [48]:
model = BasicModel(corpus_ldag_sparse, labels)

In [49]:
print(f'accuracy: {model.accuracy*100:.2f}%')

accuracy: 63.94%


In [51]:
print(model.classification_report)

              precision    recall  f1-score   support

           0       0.57      0.81      0.67       488
           1       0.78      0.50      0.61      1326
           2       0.56      0.67      0.61       659
           3       0.17      0.47      0.25        73
           4       0.58      0.48      0.53       549
           5       0.42      0.81      0.55       111

    accuracy                           0.59      3206
   macro avg       0.51      0.62      0.54      3206
weighted avg       0.64      0.59      0.59      3206

