In [1]:
import os
import re
import nltk
import pandas as pd
from bs4 import BeautifulSoup
from gensim.models import Doc2Vec

from gensim.models.doc2vec import TaggedDocument
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pickle

from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from stop_words import get_stop_words

from sklearn.metrics import accuracy_score, f1_score

import random
# Seeds only Python's built-in `random` module. The train/test split further
# down passes its own random_state=42, so it does not depend on this call.
random.seed(42)

In [2]:
## dimensionality of the document vectors; it is also embedded in the model file names
feature_length = 512

## build the paths of the two trained doc2vec models
## (distributed bag of words = "_db", distributed memory = "_dm")
## NOTE(review): these are absolute, machine-specific paths -- adjust them when running elsewhere
dbow_model_path = "/Users/williampham/Desktop/UvA-thesis-main/models/pdf_split_d2v_gensim_{}_db.mod".format(feature_length)
dmm_model_path = "/Users/williampham/Desktop/UvA-thesis-main/models/pdf_split_d2v_gensim_{}_dm.mod".format(feature_length)

## load both models from disk
model_dbow = Doc2Vec.load(dbow_model_path)
model_dmm = Doc2Vec.load(dmm_model_path)

In [3]:

# Wrap the DBOW and DM models in gensim's ConcatenatedDoc2Vec helper.
# NOTE(review): `model` is not referenced again in the cells visible here (the
# classifier below is trained on TF-IDF features) -- confirm it is still needed.
doc2vec_parts = [model_dbow, model_dmm]
model = ConcatenatedDoc2Vec(doc2vec_parts)

def vec_for_learning(model, tagged_docs):
    """Infer one vector per tagged document and return labels and vectors.

    Parameters
    ----------
    model
        Object exposing ``infer_vector(words)``; it is called once per document.
    tagged_docs
        Container whose ``.values`` attribute is an iterable of documents, each
        with ``.tags`` (the first tag is used as the target) and ``.words``.
        Presumably a pandas Series of ``TaggedDocument`` -- confirm with caller.

    Returns
    -------
    tuple
        ``(targets, regressors)``: two equally long tuples holding the first tag
        and the inferred vector of each document, in input order. Both are empty
        tuples when there are no documents.
    """
    pairs = [(doc.tags[0], model.infer_vector(doc.words)) for doc in tagged_docs.values]
    if not pairs:
        # zip(*[]) yields nothing, so unpacking it into two names would raise
        # ValueError; return an explicit empty result instead.
        return (), ()
    targets, regressors = zip(*pairs)
    return targets, regressors


## clean text
def cleanText(text):
    """Return a normalised, lower-cased version of ``text``.

    Steps, applied in this order:
      1. parse ``text`` with BeautifulSoup's ``html.parser`` and keep only its text;
      2. replace every ``|||`` with a single space;
      3. replace every literal backslash followed by ``n`` with a single space;
      4. replace every ``http`` followed by non-whitespace characters with ``<URL>``;
      5. lower-case the result (so the placeholder ends up as ``<url>``).
    """
    cleaned = BeautifulSoup(text, "html.parser").text
    substitutions = (
        (r'\|\|\|', r' '),
        (r'\\n', r' '),
        (r'http\S+', r'<URL>'),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.lower()

def print_complaint(df, index):
    """Print the text and labels of the first row whose index label equals ``index``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``"labels"`` and ``"text"``.
    index
        Index label to look up. If several rows share the label, only the
        first one is printed.

    Prints the row's text, then ``labels: <value>``. Prints nothing when no row
    carries the requested index label.
    """
    matches = df[df.index == index][["labels", "text"]].values
    # Check for a match *before* taking row 0; indexing an empty result would
    # raise IndexError for an unknown index label.
    if len(matches) == 0:
        return
    label_value, text_value = matches[0]
    print(text_value)
    print('labels:', label_value)

In [4]:
## loading the csv files containing filenames and processed text
## NOTE(review): absolute, machine-specific paths -- adjust them when running elsewhere
df_0 = pd.read_csv('/Users/williampham/Desktop/UvA-thesis-main/data/0.0.csv')
df_1 = pd.read_csv('/Users/williampham/Desktop/UvA-thesis-main/data/1.0.csv')

frames = [df_0, df_1]

## stack both frames; ignore_index=True renumbers the rows 0..n-1 so index labels
## are unique (a bare `df.reset_index()` returns a new frame and leaves `df` as is)
df = pd.concat(frames, ignore_index=True)

## replace missing values with empty strings so the text cleaning below only sees str
df = df.fillna('')

## cleaned copy of the raw text, used as classifier input
df['text_processed'] = df['text'].apply(cleanText)



In [5]:
## target column of the classifier
labels = df['labels']

## Dutch stop words are dropped from the vocabulary (they normally add little meaning)
stop_words = get_stop_words('dutch')

## TF-IDF vectorizer over word 1- to 5-grams; terms seen in fewer than 5 documents are ignored
## NOTE(review): the vectorizer is fit on the full dataset before the train/test split
## below, so test documents influence the vocabulary and IDF weights -- confirm this is intended
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 5),
    stop_words=stop_words,
)

## dense document-term matrix of the cleaned text
features = tfidf.fit_transform(df['text_processed']).toarray()

In [6]:
## split the dataset to train set and test set (70% / 30%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=42)

## ratio of negative to positive training samples, stored as per-class weights
## NOTE(review): `weights` is NOT passed to the model below, which uses
## class_weight='balanced' instead; kept because later cells may read it
a = 1.0
b = len(y_train[y_train==0.0]) / len(y_train[y_train==1.0])
weights = {0.0:a, 1.0:b}

## define the logistic regression model
## NOTE(review): the recorded run of this cell ended with an iteration-limit
## (non-convergence) warning; consider raising max_iter or scaling the features
logreg = LogisticRegression(n_jobs=1, C=1e5, multi_class='ovr', class_weight='balanced')
## train logistic regression model
logreg.fit(X_train, y_train)
## test the logistic regression model
y_pred = logreg.predict(X_test)

## relative directory kept for backward compatibility; nothing in this cell writes to it
os.makedirs('models/logreg', exist_ok=True)
filename = '/Users/williampham/Desktop/UvA-thesis-main/src/models/logreg_tfidf_model_{}__dbow_dm_concate.sav'.format(feature_length)
## make sure the directory that actually receives the pickle exists
os.makedirs(os.path.dirname(filename), exist_ok=True)
## the context manager closes the file even if pickling fails
with open(filename, 'wb') as model_file:
    pickle.dump(logreg, model_file)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:

## accuracy on the held-out test set
test_accuracy = accuracy_score(y_test, y_pred)
print('Testing accuracy %s' % test_accuracy)

## F1 score averaged over classes, weighted by class support
test_f1_weighted = f1_score(y_test, y_pred, average='weighted')
print('Testing F1 score: {}'.format(test_f1_weighted))


Testing accuracy 0.8797129513612029
Testing F1 score: 0.8692540716136863
