In [11]:
from bs4 import BeautifulSoup
import re
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from gensim.models.doc2vec import TaggedDocument

The steps would be:
1. load the extracted text in 2 csv files and concatenate them as one
2. Use nltk library to tokenize the extracted text
3. Build the vocabularies
4. Train a Doc2Vec model
5. Save the Doc2Vec model

In [5]:
df_0 = pd.read_csv('data/0.0.csv')
df_1 = pd.read_csv('data/1.0.csv')
frames = [df_0, df_1]
  
df = pd.concat(frames)
df.reset_index()
df.head()

Unnamed: 0,labels,paths,text
0,0.0,868212__concatenated_p_1.jpg,Handreiking | Veilige Moskee\n\nInhoudsopgave\...
1,0.0,868212__concatenated_p_10.jpg,Handreiking | Veilige Moskee\n\nLandelijke wer...
2,0.0,868212__concatenated_p_100.jpg,Gemeente Amsterdam Datum 28 mei 2015\nKenmerk\...
3,0.0,868212__concatenated_p_101.jpg,Gemeente Amsterdam Datum 28 mei 2015\nKenmerk\...
4,0.0,868212__concatenated_p_102.jpg,Gemeente Amsterdam Datum 28 mei 2015\nKenmerk\...


In [6]:
def cleanText(text):
    text = BeautifulSoup(text, "html.parser").text
    text = re.sub(r'\|\|\|', r' ', text)
    text = re.sub(r'\\n', r' ', text)
    text = re.sub(r'http\S+', r'<URL>', text)
    text = text.lower()
    text = text.replace('x', '')
    return text

def print_complaint(df, index):
    example = df[df.index == index][["labels", "text"]].values[0]
    if len(example) > 0:
        print(example[1])
        print('labels:', example[0])

In [7]:
df.fillna('', inplace=True)
df.head()

Unnamed: 0,labels,paths,text
0,0.0,868212__concatenated_p_1.jpg,Handreiking | Veilige Moskee\n\nInhoudsopgave\...
1,0.0,868212__concatenated_p_10.jpg,Handreiking | Veilige Moskee\n\nLandelijke wer...
2,0.0,868212__concatenated_p_100.jpg,Gemeente Amsterdam Datum 28 mei 2015\nKenmerk\...
3,0.0,868212__concatenated_p_101.jpg,Gemeente Amsterdam Datum 28 mei 2015\nKenmerk\...
4,0.0,868212__concatenated_p_102.jpg,Gemeente Amsterdam Datum 28 mei 2015\nKenmerk\...


In [8]:
df['text_processed'] = df['text'].apply(cleanText)



In [12]:
train, test = train_test_split(df, test_size=0.3, random_state=42)
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens
train_tagged = train.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['text_processed']), tags=[r.labels]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['text_processed']), tags=[r.labels]), axis=1)

In [None]:
import multiprocessing
from tqdm import tqdm
from gensim.models import Doc2Vec

cores = multiprocessing.cpu_count()
d2v_Model = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
d2v_Model.build_vocab([x for x in tqdm(train_tagged.values)])

In [None]:
from sklearn import utils

epochs = 50
for epoch in range(epochs):
    d2v_Model.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    d2v_Model.alpha -= 0.002
    d2v_Model.min_alpha = d2v_Model.alpha

fname = "./model_runs/pdf_split_d2v.mod"
d2v_Model.save(fname)

In [70]:
model = Doc2Vec.load("./model_runs/pdf_split_d2v.mod")
vec_test = model.infer_vector(['title', 'love', 'cat', 'chapter'])
print(vec_test)

[ 1.96141694e-02  2.88925953e-02  3.99991348e-02 -3.03983167e-02
 -1.51108112e-02 -4.21896623e-03  3.59616205e-02  2.17935592e-02
  4.93520051e-02 -1.44191049e-02 -1.54240988e-02 -2.99990624e-02
  2.35345103e-02  2.05198899e-02  1.63179319e-02  2.84231380e-02
 -7.31207430e-04 -2.21286397e-02  2.50699408e-02  1.38327805e-02
  7.60966726e-03 -1.77247263e-03  1.46660432e-02 -6.66098669e-04
 -9.64964274e-03  1.18551124e-02  1.23616708e-02  1.37782237e-02
 -1.37629267e-02  2.07799468e-02 -1.57081075e-02 -1.60328429e-02
  1.40209235e-02  1.50070805e-02 -8.36930121e-04 -5.40789310e-03
 -1.98698835e-03  1.33919120e-02  1.00629404e-02 -7.52525637e-03
  2.49000778e-03  2.81572193e-02 -2.00807527e-02  3.16771269e-02
 -1.47041008e-02 -4.32784203e-04 -1.30706392e-02  1.29079623e-02
 -1.72178373e-02  3.21473479e-02 -2.10126396e-03  8.22720304e-03
  1.53654302e-02  2.51924954e-02  3.79121909e-03 -2.87226140e-02
 -1.69163458e-02 -1.11198146e-03  1.31094456e-03 -3.13359462e-02
 -7.28098443e-04  7.29365