# Preprocessing

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import classification_report

import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
import gensim.downloader as api

In [3]:
fasttext_model = api.load("fasttext-wiki-news-subwords-300") # Download pretrained model
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cristian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
CLASS = 'discussion_type'
DATA_SAVE_PATH = f"./data/cleaned_data.csv"
CONTEXT = True
HISTORY = False
USE_TEXT = 'message'

REMOVE_STOPWORDS = False
REMOVE_NUM = True

other_classes_to_predict = ['discussion_type', 'dialogic_spell', 'uptake', 'question', 'pivot']
other_classes_to_predict.remove(CLASS)
remove_also = ['topic']
remove_also += other_classes_to_predict

In [17]:
def preprocess_text(text):
    # Tokenization, lowercasing, removing stopwords, etc.
    tokens = [word.lower() for word in nltk.word_tokenize(text) if word.isalpha() and word.lower() not in stop_words]
    
    if REMOVE_NUM:
        tokens = [word for word in tokens if word.isalpha()]

    if REMOVE_STOPWORDS:
        tokens = [word for word in tokens if word not in stop_words]

    return ' '.join(tokens)

data = pd.read_csv(DATA_SAVE_PATH)
data[USE_TEXT] = data[USE_TEXT].apply(preprocess_text)

if not HISTORY:
    data.drop(columns=['chat_history'], inplace=True)

if not CONTEXT:
    data = data[[USE_TEXT, CLASS]]

data.drop(columns=remove_also, inplace=True)

In [18]:
# Convert text data into numerical vectors using FastText word embeddings
def get_embedding(text):
    # Initialize an empty vector
    vector = np.zeros(300)
    # Iterate over each word in the text
    for word in text.split():
        # If the word is in the FastText vocabulary, add its embedding to the vector
        if word in fasttext_model:
            vector += fasttext_model[word]
        else:
            print(f"Word '{word}' not in vocabulary")
    # Return the vector
    return vector

mess_embeddings = pd.DataFrame(data[USE_TEXT].apply(get_embedding).tolist())

data = pd.concat([data, mess_embeddings], axis=1)

Word 'ashely' not in vocabulary
Word 'orgininally' not in vocabulary
Word 'uwgyeu' not in vocabulary
Word 'kyra' not in vocabulary
Word 'emilie' not in vocabulary
Word 'emilie' not in vocabulary
Word 'kyra' not in vocabulary
Word 'kyra' not in vocabulary
Word 'darla' not in vocabulary
Word 'experien' not in vocabulary
Word 'amswered' not in vocabulary
Word 'sentemce' not in vocabulary
Word 'acce' not in vocabulary
Word 'lillian' not in vocabulary
Word 'semibarbaric' not in vocabulary
Word 'semibarbaric' not in vocabulary
Word 'barberous' not in vocabulary
Word 'discourager' not in vocabulary
Word 'alexandrea' not in vocabulary
Word 'yoooo' not in vocabulary
Word 'wasssupppp' not in vocabulary
Word 'yoooo' not in vocabulary
Word 'wasssupppp' not in vocabulary
Word 'yooo' not in vocabulary
Word 'explainin' not in vocabulary
Word 'admittitly' not in vocabulary
Word 'kniw' not in vocabulary
Word 'vona' not in vocabulary
Word 'alroight' not in vocabulary
Word 'sypher' not in vocabulary
Word

In [19]:
y = data[CLASS].fillna('None')
if CONTEXT:
    data['course'] = LabelEncoder().fit_transform(data['course'])
    data['book_id'] = data['book_id'].astype(int)
    data['bookclub'] = data['bookclub'].astype(int)
    data['chat_crew'] = data['chat_crew'].astype(bool)
    data['pseudonym'] = LabelEncoder().fit_transform(data['pseudonym'])

    data['time'] = pd.to_datetime(data['time'], errors='coerce')

    data['year'] = data['time'].dt.year
    data['month'] = data['time'].dt.month
    data['day'] = data['time'].dt.day
    data['hour'] = data['time'].dt.hour
    data['minute'] = data['time'].dt.minute
    data['second'] = data['time'].dt.second

    data['page'] = data['page'].fillna(0).astype(int)
    data['response_number'] = data['response_number'].fillna(0).astype(float)

    data.drop(columns=['time', 'message', 'chat', CLASS], inplace=True)
else:
    data.drop(columns=['message', 'chat', CLASS], inplace=True)

In [20]:
data

Unnamed: 0,course,book_id,bookclub,chat_crew,pseudonym,is_answer,page,response_number,0,1,...,296,297,298,299,year,month,day,hour,minute,second
0,1,260,1,True,67,False,10,3.1,-0.036086,0.052419,...,0.074865,0.092050,-0.000639,0.029422,2020.0,10.0,20.0,17.0,6.0,0.0
1,1,260,1,True,67,False,10,3.1,-0.012513,0.045208,...,0.018545,-0.095000,-0.009200,-0.000744,2020.0,10.0,20.0,17.0,6.0,0.0
2,1,260,1,True,67,False,10,3.1,0.018834,0.003210,...,0.168975,0.174576,-0.199167,0.009118,2020.0,10.0,20.0,17.0,6.0,0.0
3,1,260,1,True,69,False,10,3.1,0.010507,0.081092,...,0.065781,0.009381,0.084817,-0.059670,2020.0,10.0,27.0,17.0,58.0,0.0
4,1,260,1,True,69,False,10,3.1,0.008774,-0.003281,...,0.277774,-0.101370,-0.169284,0.017724,2020.0,10.0,27.0,17.0,58.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900,4,306,7,True,3,False,3,0.0,-0.017933,-0.249814,...,0.068942,0.043535,0.021323,-0.106796,2022.0,3.0,1.0,15.0,21.0,13.0
901,4,306,7,True,20,False,2,0.0,0.060934,0.084670,...,-0.051192,0.125032,0.073620,-0.170250,2022.0,3.0,1.0,15.0,21.0,50.0
902,4,306,7,True,20,False,2,0.0,0.163171,0.091426,...,0.198589,-0.260083,-0.024904,-0.386629,2022.0,3.0,1.0,15.0,22.0,32.0
903,4,306,7,True,3,False,3,0.0,0.076193,0.023702,...,-0.010216,-0.081569,0.051319,0.000576,2022.0,3.0,1.0,15.0,22.0,56.0


In [21]:
# Convert labels to numerical labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2, random_state=42, stratify=y)

In [23]:
# Train an XGBoost classifier
xgb_classifier = XGBClassifier(objective='multi:softmax', num_class=len(label_encoder.classes_))
xgb_classifier.fit(X_train, y_train)

In [24]:
# Predict on the test set
y_pred = xgb_classifier.predict(X_test)

# Evaluate the model
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred, average='weighted', zero_division=0))
print('Recall:', recall_score(y_test, y_pred, average='weighted', zero_division=0))
print('F1:', f1_score(y_test, y_pred, average='weighted', zero_division=0))

print(classification_report(y_test, y_pred, target_names=label_encoder.classes_, zero_division=0))

Accuracy: 0.7348066298342542
Precision: 0.7375789265982636
Recall: 0.7348066298342542
F1: 0.7173026975759826
                   precision    recall  f1-score   support

     Deliberation       0.60      0.64      0.62        45
Imaginative Entry       1.00      0.17      0.29         6
            Other       0.00      0.00      0.00         2
        Procedure       0.86      0.46      0.60        13
          Seminar       0.78      0.92      0.85        89
           Social       0.92      0.69      0.79        16
               UX       0.50      0.40      0.44        10

         accuracy                           0.73       181
        macro avg       0.67      0.47      0.51       181
     weighted avg       0.74      0.73      0.72       181

