In [None]:
# Vereiste imports

import os, nltk, re, pdftotext, docx2txt
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import interp
from itertools import cycle
from stop_words import get_stop_words
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')

In [None]:
# Turn one directory of documents into a labelled DataFrame.
# Input: directory path (str), category number, category name (str)
# Output: pandas DataFrame

def _extract_pdf_text(path):
    """Return the text of the PDF at ``path``, falling back to OCR when it has none.

    Returns an empty string when the PDF has no text layer and OCR fails.
    """
    with open(path, "rb") as f:
        text = "\n\n".join(pdftotext.PDF(f))
    # Joining the pages of a multi-page PDF with "\n\n" never gives an empty
    # string, even when every page is blank, so test the stripped text instead.
    if text.strip():
        return text
    try:
        # textract is only needed for this OCR fallback, so it is imported here;
        # a missing installation is reported below instead of raising NameError.
        import textract
        raw = textract.process(path, method='tesseract',
                               language='nld', errors='ignore')
        return raw.decode("utf-8")
    except Exception as exc:
        # Best effort: report the failing file (and why) and carry on.
        print(os.path.basename(path), "textract.process & text.decode", repr(exc))
        return ""


def _clean_text(text, filtered_words):
    """Tokenise, stem and filter ``text``; return the list of kept words.

    Steps, in order: drop Dutch stop words, stem, strip every non A-Z/a-z
    character, lower-case, keep words of 4 to 20 letters, drop ``filtered_words``.
    """
    # Imported locally, as the original code did, so the stemmer class name is
    # never shadowed by an instance.
    from nltk.stem.snowball import DutchStemmer
    stemmer = DutchStemmer()
    stop_words = set(get_stop_words('dutch'))
    kept = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        word = re.sub(r"[^A-Za-z]+", '', stemmer.stem(token)).lower()
        # Words of 1-3 letters and words longer than 20 letters are discarded.
        if 4 <= len(word) <= 20 and word not in filtered_words:
            kept.append(word)
    return kept


def process_docs(directory, category_num, category_desc, min_words=100):
    """Read every PDF and DOCX file in ``directory`` and return the cleaned texts.

    Args:
        directory: Path of the folder to scan (not recursive).
        category_num: Numeric id stored in the ``CategoryID`` column.
        category_desc: Category name stored in the ``Category`` column.
        min_words: Documents with this many cleaned words or fewer are skipped
            and reported on stdout.

    Returns:
        pandas DataFrame with columns ``CategoryID``, ``Category`` and ``Text``,
        one row per kept document; ``Text`` is the space-joined cleaned words.
    """
    # Frequently occurring words that carry no information about the category.
    filtered_words = ['creativ','common','kennisnet','copyright','http','licentie']
    rows = []
    # Sorted so that the row order (and any later seeded split) is reproducible.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        extension = os.path.splitext(filename)[1].lower()
        if extension == ".pdf":
            text = _extract_pdf_text(path)
        elif extension == ".docx":
            text = docx2txt.process(path)
        else:
            continue
        words = _clean_text(text, filtered_words)
        # Count words, not characters of the joined string.
        if len(words) <= min_words:
            print(filename, '<= 100 words')
            print(len(words))
            print("")
            continue
        rows.append([category_num, category_desc, ' '.join(words)])
    return pd.DataFrame(rows, columns=['CategoryID', 'Category', 'Text'])

In [None]:
# Build a separate DataFrame for each folder/category.

df = process_docs("/path/to/category/les", 1, "Les")
df2 = process_docs("/path/to/category/opdracht", 2, "Opdracht")
#df3 = process_docs("/path/to/category/toets", 3, "Toets")
df4 = process_docs("/path/to/category/referentie", 4, "Referentie")

# Combine the per-category frames into a single DataFrame.
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
# Add df3 to this list when the "Toets" category above is enabled.

df = pd.concat([df, df2, df4])

# Drop any rows with empty text, renumber the index from 0
# and store the DataFrame as a .pkl file.

df = df[df.Text != '']
df = df.reset_index(drop=True)
df.to_pickle("texts_pdfs.pkl")

In [None]:
# Genereer dictionaries voor plots.

category_id_df = df[['Category', 'CategoryID']].drop_duplicates().sort_values('CategoryID')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['CategoryID', 'Category']].values)

# Check de onderlinge verdeling binnen de dataset.

%matplotlib inline
df.groupby('Category').Text.count().plot.bar(ylim=0)

In [None]:
# Configure the TF-IDF vectoriser (unigrams and bigrams, at most 3000 features).

tfidf = TfidfVectorizer(
    encoding='utf-8',
    stop_words=get_stop_words('dutch'),
    ngram_range=(1, 2),
    min_df=6,
    max_df=0.7,
    max_features=3000,
    sublinear_tf=True,
    norm='l2',
)

In [None]:
# Fit TF-IDF on the documents: dense feature matrix plus the matching labels.

labels = df['CategoryID']
features = tfidf.fit_transform(df['Text']).toarray()
features.shape

In [None]:
# Print, per category, the unigrams and bigrams with the highest chi-squared
# score against "document belongs to this category".

N = 3
# get_feature_names() was removed in scikit-learn 1.2; use the replacement when
# it exists and fall back to the old name on older installations. The
# vocabulary does not change inside the loop, so look it up once.
if hasattr(tfidf, 'get_feature_names_out'):
    all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
    all_feature_names = np.array(tfidf.get_feature_names())

for category, category_id in sorted(category_to_id.items()):
    features_chi2 = chi2(features, labels == category_id)
    # Ascending sort: the strongest terms end up at the tail of the array.
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(category))
    print("  . Most correlated unigrams:\n       . {}".format('\n       . '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n       . {}".format('\n       . '.join(bigrams[-N:])))

In [None]:
# Project every document's TF-IDF vector onto 2 dimensions with t-SNE and
# draw one point per document, coloured by category.

SAMPLE_SIZE = int(len(features))
np.random.seed(2)
indices = np.random.choice(range(len(features)), size=SAMPLE_SIZE, replace=False)

tsne = TSNE(n_components=2, random_state=0)
projected_features = tsne.fit_transform(features[indices])

# Indexed by category id, so the list needs at least max(id) + 1 entries.
colors = ['pink', 'green', 'midnightblue', 'orange', 'darkgrey']
sampled_labels = labels[indices]
for category, category_id in sorted(category_to_id.items()):
    in_category = (sampled_labels == category_id).values
    points = projected_features[in_category]
    plt.scatter(points[:, 0], points[:, 1], s=30, c=colors[category_id], label=category)

plt.title("tf-idf feature vector for each document, projected on 2 dimensions.",
          fontdict=dict(fontsize=12))
plt.legend()

In [None]:
# Create the logistic-regression model.

model2 = LogisticRegression(solver='lbfgs',random_state=42, multi_class='auto')

# Split the data set into a train and a test set (80% / 20%).
# NOTE(review): `features` was produced by fitting TF-IDF on all documents
# before this split, so the test score below is likely optimistic.

X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    features, labels, df.index, test_size=0.2, random_state=42)

# Fit the model on the training data.

model2.fit(X_train, y_train)
print("Accuracy:",model2.score(X_test, y_test))

# Let the model predict the test set.

y_pred_proba = model2.predict_proba(X_test)
y_pred = model2.predict(X_test)

# Plot a confusion matrix of the test-set results.
# The class order is passed explicitly so that rows and columns always line up
# with the tick labels, even when a category is absent from the test split.

class_ids = category_id_df.CategoryID.values
class_names = category_id_df.Category.values
conf_mat = confusion_matrix(y_test, y_pred, labels=class_ids)
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=class_names, yticklabels=class_names)
plt.ylabel('Actual')
plt.xlabel('Predicted')

In [None]:
# Optionally validate the trained model on a folder of separate files.
# The expected category is taken from the file name: a prediction counts as
# correct when the name contains a fragment below and the model returns its id.

from nltk.stem.snowball import DutchStemmer

directory = "/path/to/folder"
correct = 0
incorrect = 0

# Same pre-processing as process_docs(), including the list of frequent,
# irrelevant words, so these files are cleaned like the training data.
filtered_words = ['creativ','common','kennisnet','copyright','http','licentie']
stop_words = set(get_stop_words('dutch'))
stemmer = DutchStemmer()

# File-name fragment -> category id the file should be classified as.
# NOTE(review): plain, case-sensitive substring match, as before; a name
# containing several fragments is accepted for any of them.
expected_ids = {"les": 1, "opdracht": 2, "referentie": 4}

for filename in sorted(os.listdir(directory)):
    path = os.path.join(directory, filename)
    extension = os.path.splitext(filename)[1].lower()
    if extension == ".pdf":
        with open(path, "rb") as f:
            text = "\n\n".join(pdftotext.PDF(f))
        # Pages joined with "\n\n" are not an empty string for a multi-page
        # scan, so test the stripped text before falling back to OCR.
        if not text.strip():
            try:
                # textract is only needed for OCR; imported here so a missing
                # installation is reported instead of raising NameError.
                import textract
                text = textract.process(path, method='tesseract',
                                        language='nld', errors='ignore')
                text = text.decode("utf-8")
            except Exception as exc:
                print(filename, "textract.process & text.decode", repr(exc))
                text = ""
    elif extension == ".docx":
        text = docx2txt.process(path)
    else:
        continue

    # Tokenise, drop Dutch stop words, stem, strip non-alphabetic characters,
    # lower-case, keep words of 4 to 20 letters and drop the irrelevant words.
    keywords = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        word = re.sub(r"[^A-Za-z]+", '', stemmer.stem(token)).lower()
        if 4 <= len(word) <= 20 and word not in filtered_words:
            keywords.append(word)

    extracted_text = [' '.join(keywords)]
    text_features = tfidf.transform(extracted_text)
    predictions = model2.predict(text_features)

    # predictions holds exactly one value because one document was transformed.
    if any(fragment in filename and predictions[0] == category_id
           for fragment, category_id in expected_ids.items()):
        correct += 1
    else:
        print(filename)
        print("Incorrect, guess was", predictions)
        incorrect += 1
        print("")

print("Correct:", correct)
print("Incorrect:", incorrect)

# 1 Les
# 2 Opdracht
# 3 Toets
# 4 Referentie

In [None]:
# Plot the ROC curves (one-vs-rest). Only `classes` needs to be extended when
# more categories (e.g. 3 = Toets) are added to the data set.
# NOTE(review): this cell rebinds X_train/X_test/y_train/y_test from the
# logistic-regression cell; re-run that cell before reusing those names.

classes = [1, 2, 4]
y = label_binarize(labels, classes=classes)
n_classes = len(classes)

random_state = np.random.RandomState(0)
n_samples, n_features = features.shape
X = features

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0)

classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True,
                                 random_state=random_state))
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Compute macro-average ROC curve and ROC area

# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# Then interpolate all ROC curves at these points.
# np.interp replaces scipy.interp, which was only a deprecated alias for it.
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

# Finally average it and compute AUC
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot all ROC curves
plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)

colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'green'])
for i, color in zip(range(n_classes), colors):
    # Label each curve with its real category id (1, 2, 4), not the column
    # index (0, 1, 2) of the binarised label matrix.
    plt.plot(fpr[i], tpr[i], color=color,
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(classes[i], roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve per class')
plt.legend(loc="lower right")
plt.show()