In [1]:
import json
import nltk
import os
import string
import unicodedata

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb

from nltk.corpus import stopwords
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [2]:
json_dir = Path(r"../data/dvlog_text")
annotations_file = Path(r"../DVlog/dataset/dvlog_labels_v2.csv")

In [3]:
# load in the annotation labels
df_annotations = pd.read_csv(annotations_file)
df_annotations.head()

Unnamed: 0,video_id,label,gender,dataset
0,0,1,f,train
1,1,1,f,test
2,2,1,m,train
3,3,1,m,train
4,4,1,f,test


In [4]:
# loop over each text file and extract the text
text_ref_dict = {}

for json_file in os.listdir(json_dir):
    
    # get the video_id and setup the path to the file
    video_id = int(json_file.split("_")[0])
    json_path = os.path.join(json_dir, json_file)
    
    with open(json_path) as current_file:
        json_dict = json.loads(current_file.read())

    text_ref_dict[video_id] = json_dict["text"]

In [5]:
# put the annotations back into the dataframe
df_annotations["text"] = df_annotations["video_id"].apply(lambda x: text_ref_dict.get(x))
df_annotations.head()

Unnamed: 0,video_id,label,gender,dataset,text
0,0,1,f,train,So I wanted to come on here and sit down with...
1,1,1,f,test,"Hello guys, it's me again and I'm going to ta..."
2,2,1,m,train,Welcome back to another video today. I'm goin...
3,3,1,m,train,Hi everybody and welcome to and the clouds br...
4,4,1,f,test,"Hey, me and McCrown, me and my hairy hair pit..."


##
train with xgboost and 1 for every word
train with xgboost and 1 for every word and remove certain keywords

In [6]:
# do some preprocessing to shrink the vocabulary (including lemmatization)
def preprocess_string(text: str, stop_words: set = set(stopwords.words("english")), unicode_pattern: str = "NFKD") -> str:
    text = text.replace("\n", " ").strip()  # Remove newlines and trailing whitespace
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove puctuation with lookup table
    text = text.lower()  # Lowercase
    text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    text = text.translate(str.maketrans("", "", string.digits))  # Remove all numbers with lookup table
    
    # Remove excess whitespace in between words
    # E.g. the sentence "for 10 days" becomes "for days" instead of "for  days" with two spaces
    text = " ".join(text.split())
    text = unicodedata.normalize(unicode_pattern, text)  # Strip accents from characters
    
    # Lemmatization
    lemmanizer = nltk.stem.WordNetLemmatizer()
    text = " ".join([lemmanizer.lemmatize(word) for word in text.split()])

    return text

df_annotations["preprocessed"] = df_annotations["text"].apply(preprocess_string)
df_annotations.head()

Unnamed: 0,video_id,label,gender,dataset,text,preprocessed
0,0,1,f,train,So I wanted to come on here and sit down with...,wanted come sit guy kind talk vent really stru...
1,1,1,f,test,"Hello guys, it's me again and I'm going to ta...",hello guy im going talk today survive depressi...
2,2,1,m,train,Welcome back to another video today. I'm goin...,welcome back another video today im going expl...
3,3,1,m,train,Hi everybody and welcome to and the clouds br...,hi everybody welcome cloud break time talked m...
4,4,1,f,test,"Hey, me and McCrown, me and my hairy hair pit...",hey mccrown hairy hair pit im mess im mess hea...


In [7]:
# get the data subsets
df_train = df_annotations[df_annotations["dataset"] == "train"]
df_test = df_annotations[df_annotations["dataset"] == "test"]
df_val = df_annotations[df_annotations["dataset"] == "val"]

In [8]:
# setup the bag of words model
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(df_train["preprocessed"].to_list())

# retrieve the feature names
feature_names = vectorizer.get_feature_names_out()

# set the count vectorizer to 0 or 1
X_train = X_train.toarray()
X_train = np.where(X_train > 1, 1, X_train)
y_train = df_train["label"].to_list()

In [9]:
# setup the evaluation script
def calculate_performance_measures(y_true, y_pred):

    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, fscore, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")

    print(f"Accuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}\nF1-score: {fscore}\n--------")

In [10]:
# train a randomforestclassifier
clf = RandomForestClassifier(max_depth=10, random_state=0)
clf.fit(X_train, y_train)

# evaluate the model on the train set
y_pred = clf.predict(X_train)
calculate_performance_measures(y_train, y_pred)

# evaluate the model on the eval set
X_val = vectorizer.fit_transform(df_train["preprocessed"].to_list())

Accuracy: 0.9965397923875432
Precision: 0.9965397923875432
Recall: 0.9965397923875432
F1-score: 0.9965397923875432


In [None]:
# retrieve the feature importances
importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)


forest_importances = pd.Series(importances, index=feature_names)
forest_importances

# fig, ax = plt.subplots()
# forest_importances.plot.bar(yerr=std, ax=ax)
# ax.set_title("Feature importances using MDI")
# ax.set_ylabel("Mean decrease in impurity")
# fig.tight_layout()