In [None]:
# Vereiste imports

import textract, os, PyPDF2, nltk, re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import interp
from stop_words import get_stop_words
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.manifold import TSNE
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn import svm, datasets
from itertools import cycle
nltk.download('punkt')

In [None]:
# Path to the input CSV file.

file = 'Video-Texts-NL.csv'

# Read the semicolon-separated CSV file into a DataFrame.

df = pd.read_csv(file, sep=';')

# Distinct (Category, CategoryID) pairs ordered by CategoryID, plus lookup
# dictionaries in both directions (name -> id and id -> name).
category_id_df = (
    df[['Category', 'CategoryID']]
    .drop_duplicates()
    .sort_values('CategoryID')
)
category_to_id = {name: ident for name, ident in category_id_df.values}
id_to_category = {ident: name for name, ident in category_id_df.values}
df.sample(5)

In [None]:
# Preprocess the text of every video: tokenize, remove Dutch stop words, stem,
# strip non-alphabetic characters, lowercase and drop over-long tokens.
# NOTE: this cell overwrites df['Text'] in place, so running it twice
# preprocesses the already-preprocessed text again.

from nltk.stem.snowball import DutchStemmer

# Tokens longer than this many characters are discarded.
MAX_WORD_LENGTH = 20

# Loop-invariant helpers, built once instead of once per document.
# The stop words are lowercased and stored in a set so the membership test is
# case-insensitive and O(1).
stop_words = {word.lower() for word in get_stop_words('dutch')}
stemmer = DutchStemmer()


def preprocess_text(text):
    """Return the cleaned, space-joined keywords of a single document.

    Steps: tokenize with ``word_tokenize``; drop Dutch stop words
    (case-insensitively, so capitalised stop words are removed too); stem each
    remaining token; delete every character outside ``A-Za-z``; lowercase;
    drop tokens that are empty or longer than ``MAX_WORD_LENGTH``.

    Parameters
    ----------
    text : str
        Raw text of one document.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    tokens = word_tokenize(text)
    keywords = [word for word in tokens if word.lower() not in stop_words]
    stems = [stemmer.stem(word) for word in keywords]
    # NOTE(review): the ASCII-only character class also strips accented
    # letters (e.g. 'ë', 'é') — confirm this is intended for Dutch text.
    # The pattern removes whitespace as well, so no separate space-collapsing
    # step is needed afterwards.
    cleaned = [re.sub(r"\s*[^A-Za-z]+\s*", '', stem).lower() for stem in stems]
    kept = [word for word in cleaned if word.strip() and len(word) <= MAX_WORD_LENGTH]
    return ' '.join(kept)


text_list = df["Text"].values
new_list = [preprocess_text(text) for text in text_list]
# Assigning the column replaces the raw text directly; no prior drop is needed.
df['Text'] = new_list
df.head()

In [None]:
# Frequency of each category.

df.Category.value_counts()

In [None]:
# Configure the TF-IDF vectorizer: unigrams and bigrams, sublinear term
# frequency, L2-normalised rows, min_df of 5.

tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    sublinear_tf=True,
    norm='l2',
    encoding='latin-1',
)

# Fit the vectorizer on the preprocessed text and convert the result to a
# dense array; the category IDs are the target labels.

features = tfidf.fit_transform(df['Text']).toarray()
labels = df['CategoryID']
features.shape

In [None]:
# Print, for each category, the N unigrams and bigrams with the highest
# chi-squared score against that category (one-vs-rest).

N = 5

# TfidfVectorizer.get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when it exists and fall back to the old method on
# older releases. The names do not change per category, so fetch them once.
_get_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
all_feature_names = np.asarray(_get_names())

for category, category_id in sorted(category_to_id.items()):
    # chi2 returns (scores, p-values); only the scores are used for ranking.
    features_chi2 = chi2(features, labels == category_id)
    indices = np.argsort(features_chi2[0])
    # Ascending order: the most strongly associated terms are at the end.
    feature_names = all_feature_names[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(category))
    print("  . Most correlated unigrams:\n       . {}".format('\n       . '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n       . {}".format('\n       . '.join(bigrams[-N:])))

In [None]:
# Project the TF-IDF vector of each document onto two dimensions with t-SNE
# and draw one scatter series per category.

SAMPLE_SIZE = int(len(features))
np.random.seed(250694)
# Sampling every row without replacement yields a seeded permutation.
indices = np.random.choice(range(len(features)), size=SAMPLE_SIZE, replace=False)
tsne = TSNE(n_components=2, random_state=0)
projected_features = tsne.fit_transform(features[indices])
# The palette is indexed directly by CategoryID.
colors = ['pink', 'green', 'midnightblue', 'orange', 'darkgrey', 'yellow', 'red', 'black']
sampled_labels = labels[indices]
for category, category_id in sorted(category_to_id.items()):
    in_category = (sampled_labels == category_id).values
    points = projected_features[in_category]
    xs = points[:, 0]
    ys = points[:, 1]
    plt.scatter(xs, ys, s=30, c=colors[category_id], label=category)
plt.title(
    "tf-idf feature vector for each category, projected on 2 dimensions.",
    fontdict={'fontsize': 12},
)
plt.legend()

In [None]:
# Train a logistic-regression classifier and evaluate it on held-out data.

# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases
# (1.5+); it is kept here so older releases keep the same behaviour.
model = LogisticRegression(solver='lbfgs', random_state=42, multi_class='auto')

# Split the dataset into a train and a test set. The share held out for
# testing is a tunable constant.

TEST_SIZE = 0.25
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    features, labels, df.index, test_size=TEST_SIZE, random_state=42)

# Fit on the training split only, so the accuracy below is measured on data
# the model has not seen during fitting.

model.fit(X_train, y_train)
print("Accuracy:", model.score(X_test, y_test))

# Let the model predict on the test set.

y_pred_proba = model.predict_proba(X_test)
y_pred = model.predict(X_test)

# Plot a confusion matrix of the test-set results. Passing the category IDs
# explicitly keeps the matrix rows/columns aligned with the tick labels even
# when a category is missing from the test split.

category_ids = category_id_df.CategoryID.values
conf_mat = confusion_matrix(y_test, y_pred, labels=category_ids)
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=category_id_df.Category.values, yticklabels=category_id_df.Category.values)
plt.ylabel('Actual')
plt.xlabel('Predicted')

In [None]:
# Print the terms that contribute most to assigning a document to each
# category, ranked by the logistic-regression coefficients.

# Refit on the complete dataset; this replaces the fit from the evaluation cell.
model.fit(features, labels)

N = 5

# TfidfVectorizer.get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when it exists and fall back to the old method on
# older releases. The names do not change per category, so fetch them once.
_get_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
all_feature_names = np.asarray(_get_names())

# Rows of model.coef_ follow the order of model.classes_, so look the row up
# by class label instead of assuming the IDs are exactly 1..K.
# NOTE(review): with only two classes coef_ has a single row, which this
# per-class lookup does not handle.
class_rows = {class_id: row for row, class_id in enumerate(model.classes_)}

for category, category_id in sorted(category_to_id.items()):
    indices = np.argsort(model.coef_[class_rows[category_id]])
    # Reverse the ascending order so the largest coefficients come first.
    feature_names = all_feature_names[indices][::-1]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1][:N]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2][:N]
    print("# '{}':".format(category))
    print("  . Top unigrams:\n       . {}".format('\n       . '.join(unigrams)))
    print("  . Top bigrams:\n       . {}".format('\n       . '.join(bigrams)))

In [None]:
# Plot one-vs-rest ROC curves per class, plus the micro- and macro-average.
# The classes are derived from the labels, so nothing has to be edited by hand
# when categories are added to the dataset.
# NOTE: with exactly two classes label_binarize returns a single column, which
# the per-class loops below do not handle.

classes = sorted(labels.unique())
y = label_binarize(labels, classes=classes)
n_classes = len(classes)

random_state = np.random.RandomState(0)
n_samples, n_features = features.shape
X = features

# NOTE: this overwrites X_train / X_test / y_train / y_test from the
# logistic-regression cell with a different split and binarized targets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
                                                    random_state=2)

classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True,
                                 random_state=random_state))
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Compute macro-average ROC curve and ROC area

# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# Then interpolate all ROC curves at these points. np.interp is used instead
# of the deprecated scipy.interp alias.
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

# Finally average it and compute AUC
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot all ROC curves
plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)

colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'green', 'red', 'gray', 'purple'])
for i, color in zip(range(n_classes), colors):
    # Label each curve with the category name (falling back to the class ID)
    # rather than the 0-based column index.
    class_label = id_to_category.get(classes[i], classes[i])
    plt.plot(fpr[i], tpr[i], color=color,
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(class_label, roc_auc[i]))

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
plt.show()