====== LDA MODEL ======

In [81]:
import pandas as pd
import numpy as np
from src.basic_project import preprocessing, extract_data
from gensim.models.ldamodel import LdaModel
from gensim.matutils import corpus2dense, corpus2csc
from gensim.corpora import Dictionary
#from gensim.models.wrappers import LdaMallet
from sklearn import svm
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import classification_report




In [75]:
# Function to train models
def train_model(x_train, y_train, x_test, y_test):
    grid_param = {'kernel':('linear', 'rbf'), 'C':[0.1 ,1, 10]}
    svc = svm.SVC(class_weight='balanced')
    bow_grid = GridSearchCV(svc, grid_param, refit=True, cv = 2, n_jobs=-1)
    bow_grid.fit(x_train, y_train)
    return bow_grid, bow_grid.score(x_test,y_test)

In [31]:
texts, labels = extract_data('data/projects.xlsx', 'data/SciVocCodes.xlsx')
texts = preprocessing(texts)

 ====== TRAIN MODEL WITH LDA genism ======

In [18]:
# Create a corpus
D = Dictionary([sentence.split(' ') for sentence in texts])
n_tokens = len(D)
reviews_bow = [D.doc2bow(doc.split(' ')) for doc in texts]

In [55]:
num_topics = 200
ldag = LdaModel(corpus=reviews_bow, id2word=D, num_topics=num_topics)

In [71]:
corpus_ldag = ldag[reviews_bow]
num_docs = len(reviews_bow)
corpus_ldag_sparse = corpus2csc(corpus_ldag, num_terms=n_tokens, num_docs=num_docs).T

In [72]:
corpus_train,corpus_test, y_train, y_test = train_test_split(corpus_ldag_sparse, labels, test_size=0.3, random_state=42)

In [76]:
bow_grid,score = train_model(corpus_train, y_train, corpus_test, y_test)

In [77]:
print(score)

0.6132487520798668


In [79]:
y_pred = bow_grid.predict(corpus_test)
print(balanced_accuracy_score(y_pred, y_test))

0.5497852540673918


In [80]:
print(balanced_accuracy_score(y_test, y_pred))

0.5732047411388569


In [82]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.61      0.77      0.68      1507
           1       0.71      0.57      0.63      3794
           2       0.56      0.65      0.61      2045
           3       0.35      0.32      0.33       205
           4       0.57      0.56      0.57      1715
           5       0.50      0.57      0.53       350

    accuracy                           0.61      9616
   macro avg       0.55      0.57      0.56      9616
weighted avg       0.62      0.61      0.61      9616



======= LDA MALLET =======