# Preprocessing

In [130]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import classification_report

import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
import gensim.downloader as api

import warnings
warnings.filterwarnings('ignore')

In [77]:
# Pretrained vectors fetched by name through gensim's downloader API.
fasttext_model = api.load("fasttext-wiki-news-subwords-300")

# stopwords.words() raises LookupError when the NLTK 'stopwords' corpus is not
# installed. Fetch it on demand so this cell also works on a fresh environment
# instead of relying on a manually un-commented download line.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

In [215]:
# Every label column this notebook can be pointed at.
target_columns = [
    'r2_discussion_type',
    'r2_dialogic_spell',
    'r2_uptake',
    'r2_question',
    'from_pivot',
    'to_pivot',
]

# Label predicted in this run (index 5 -> 'to_pivot') and the CSV it is read from.
TARGET_COLUMN = target_columns[5]
DATA_SAVE_PATH = "./cleaned_data/to_pivot_data_all.csv"

In [216]:
def preprocess_text(text):
    """Normalise one message into a space-separated string of kept tokens.

    The text is tokenised with ``nltk.word_tokenize``; a token is kept only if
    it is purely alphabetic and its lowercase form is not in ``stop_words``.
    Kept tokens are lowercased.

    Parameters
    ----------
    text : str
        Raw message. Any non-string value (e.g. ``None`` or the float NaN that
        pandas uses for an empty CSV cell) is treated as an empty message.

    Returns
    -------
    str
        Kept tokens joined by single spaces; ``''`` when nothing survives.
    """
    # word_tokenize raises on non-string input, so missing messages are mapped
    # to an empty string instead of aborting the whole ``apply``.
    if not isinstance(text, str):
        return ''

    kept = []
    for token in nltk.word_tokenize(text):
        lowered = token.lower()  # lowercase once, reuse for filter and output
        if token.isalpha() and lowered not in stop_words:
            kept.append(lowered)
    return ' '.join(kept)

# Read the CSV configured above and replace each raw message with its
# cleaned, stopword-free form.
data = pd.read_csv(DATA_SAVE_PATH).assign(
    message=lambda frame: frame['message'].map(preprocess_text)
)

In [226]:
# Preview the first five rows.
# NOTE(review): the saved output of this cell has execution count 226 and
# already shows the embedding and date-part columns, i.e. it was captured after
# the later cells had run. Re-run top-to-bottom to refresh it.
data.head(n=5)

Unnamed: 0,course,book_id,bookclub,chat_crew,pseudonym,is_answer,page,response_number,0,1,...,296,297,298,299,year,month,day,hour,minute,second
0,1,260,1,True,46,False,10,3.1,-0.036086,0.052419,...,0.074865,0.09205,-0.000639,0.029422,2020.0,10.0,20.0,17.0,6.0,0.0
1,1,260,1,True,46,False,10,3.1,-0.012513,0.045208,...,0.018545,-0.095,-0.0092,-0.000744,2020.0,10.0,20.0,17.0,6.0,0.0
2,1,260,1,True,46,False,10,3.1,0.018834,0.00321,...,0.168975,0.174576,-0.199167,0.009118,2020.0,10.0,20.0,17.0,6.0,0.0
3,1,260,1,True,48,False,10,3.1,0.010507,0.081092,...,0.065781,0.009381,0.084817,-0.05967,2020.0,10.0,27.0,17.0,58.0,0.0
4,1,260,1,True,48,False,10,3.1,0.008774,-0.003281,...,0.277774,-0.10137,-0.169284,0.017724,2020.0,10.0,27.0,17.0,58.0,0.0


In [218]:
# Convert text data into numerical vectors using FastText word embeddings
def get_embedding(text, model=None, verbose=True):
    """Sum the word vectors of every in-vocabulary token of ``text``.

    Parameters
    ----------
    text : str
        Whitespace-separated tokens. A non-string value (e.g. NaN) yields the
        zero vector.
    model : optional
        Any object supporting ``word in model`` and ``model[word]``. When
        omitted, the notebook-level ``fasttext_model`` is used, which is the
        original behaviour.
    verbose : bool, default True
        Print one line per out-of-vocabulary token (original behaviour). Pass
        ``False`` to keep cell output short and to avoid echoing message
        tokens into the saved notebook.

    Returns
    -------
    numpy.ndarray
        1-D vector holding the *sum* (not the mean) of the matched word
        vectors; all zeros when no token is in the vocabulary.
    """
    if model is None:
        model = fasttext_model

    # Use the model's own dimensionality when it exposes one; otherwise fall
    # back to the 300 dimensions this notebook was written for.
    vector = np.zeros(getattr(model, "vector_size", 300))

    # Missing messages have no tokens to look up.
    if not isinstance(text, str):
        return vector

    for word in text.split():
        if word in model:
            vector += model[word]
        elif verbose:
            print(f"Word '{word}' not in vocabulary")
    return vector

# One embedding vector per message, expanded to one column per dimension.
# The frame reuses data's index: concat(axis=1) aligns rows by index label, so
# a frame with a fresh 0..n-1 index would be silently misaligned (NaN-padded)
# if `data` were ever filtered or re-indexed before this cell.
embedding_rows = data['message'].apply(get_embedding).tolist()
mess_embeddings = pd.DataFrame(embedding_rows, index=data.index)

# NOTE: this rebinds `data`; re-running the cell appends the embedding columns again.
data = pd.concat([data, mess_embeddings], axis=1)

Word 'ashely' not in vocabulary
Word 'orgininally' not in vocabulary
Word 'uwgyeu' not in vocabulary
Word 'kyra' not in vocabulary
Word 'emilie' not in vocabulary
Word 'emilie' not in vocabulary
Word 'kyra' not in vocabulary
Word 'kyra' not in vocabulary
Word 'darla' not in vocabulary
Word 'experien' not in vocabulary
Word 'amswered' not in vocabulary
Word 'sentemce' not in vocabulary
Word 'acce' not in vocabulary
Word 'lillian' not in vocabulary
Word 'semibarbaric' not in vocabulary
Word 'semibarbaric' not in vocabulary
Word 'barberous' not in vocabulary
Word 'discourager' not in vocabulary
Word 'alexandrea' not in vocabulary


In [219]:
# Pull the label out before the feature columns are rewritten.
y = data[TARGET_COLUMN]

# Integer-encode the categorical identifiers (a fresh encoder per column).
for categorical in ('course', 'pseudonym'):
    data[categorical] = LabelEncoder().fit_transform(data[categorical])

# Force plain numeric / boolean dtypes on the id-like columns.
for int_col in ('book_id', 'bookclub'):
    data[int_col] = data[int_col].astype(int)
data['chat_crew'] = data['chat_crew'].astype(bool)

# Parse timestamps (unparseable values become NaT) and fan them out into
# one numeric column per calendar/clock component.
timestamps = pd.to_datetime(data['time'], errors='coerce')
data['time'] = timestamps
for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    data[part] = getattr(timestamps.dt, part)

# Missing page / response numbers are treated as 0.
for col, dtype in (('page', int), ('response_number', float)):
    data[col] = data[col].fillna(0).astype(dtype)

In [220]:
# Remove the raw timestamp, the raw text and the label so only model inputs remain.
non_feature_columns = ['time', 'message', TARGET_COLUMN]
data = data.drop(columns=non_feature_columns)

In [227]:
# Preview the first five rows of the final feature matrix.
data.head(n=5)

Unnamed: 0,course,book_id,bookclub,chat_crew,pseudonym,is_answer,page,response_number,0,1,...,296,297,298,299,year,month,day,hour,minute,second
0,1,260,1,True,46,False,10,3.1,-0.036086,0.052419,...,0.074865,0.09205,-0.000639,0.029422,2020.0,10.0,20.0,17.0,6.0,0.0
1,1,260,1,True,46,False,10,3.1,-0.012513,0.045208,...,0.018545,-0.095,-0.0092,-0.000744,2020.0,10.0,20.0,17.0,6.0,0.0
2,1,260,1,True,46,False,10,3.1,0.018834,0.00321,...,0.168975,0.174576,-0.199167,0.009118,2020.0,10.0,20.0,17.0,6.0,0.0
3,1,260,1,True,48,False,10,3.1,0.010507,0.081092,...,0.065781,0.009381,0.084817,-0.05967,2020.0,10.0,27.0,17.0,58.0,0.0
4,1,260,1,True,48,False,10,3.1,0.008774,-0.003281,...,0.277774,-0.10137,-0.169284,0.017724,2020.0,10.0,27.0,17.0,58.0,0.0


In [222]:
# Convert labels to numerical labels.
# `label_encoder` is kept because its `classes_` are used later to size the
# classifier (num_class) and to name the rows of the classification report.
# NOTE(review): the saved report lists a 'nan' class, so missing labels appear
# to be encoded as a class of their own — confirm this is intended.
label_encoder = LabelEncoder()
# `y` is rebound to the integer codes, so this cell is not idempotent:
# re-running it without re-running the cell that defines `y` would fit the
# encoder on the codes instead of the original labels.
y = label_encoder.fit_transform(y)

In [223]:
# Hold out 20% of the rows for evaluation; stratifying on the label keeps the
# class proportions the same in both splits, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

In [225]:
# Fit a multi-class XGBoost model with one output class per encoded label.
n_classes = len(label_encoder.classes_)
xgb_classifier = XGBClassifier(
    objective='multi:softmax',
    num_class=n_classes,
)
xgb_classifier.fit(X_train, y_train)

In [214]:
# Predict on the held-out test set
y_pred = xgb_classifier.predict(X_test)

# Headline metrics. Weighted averages weight each class by its support.
# zero_division=0 makes explicit what the (globally silenced) warning used to
# hide: a class that is never predicted scores 0 for precision / F1.
print(f'Predictions made for -> {TARGET_COLUMN}')
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred, average='weighted', zero_division=0))
print('Recall:', recall_score(y_test, y_pred, average='weighted', zero_division=0))
print('F1:', f1_score(y_test, y_pred, average='weighted', zero_division=0))

# Class names as strings (the encoder's classes can include a non-string NaN).
target_names = [str(label) for label in label_encoder.classes_]

# Pass the full list of encoded class ids: without `labels`, the report only
# covers classes present in y_test / y_pred and raises when that count differs
# from len(target_names), which happens when a rare class is missing from the
# test split.
class_ids = list(range(len(target_names)))

print(classification_report(
    y_test,
    y_pred,
    labels=class_ids,
    target_names=target_names,
    zero_division=0,
))

Predictions made for -> from_pivot
Accuracy: 0.9193548387096774
Precision: 0.8452133194588969
Recall: 0.9193548387096774
F1: 0.8807264841420439
                     precision    recall  f1-score   support

       Deliberation       0.00      0.00      0.00         2
  Imaginative Entry       0.00      0.00      0.00         1
            Seminar       0.00      0.00      0.00         4
Social/Procedure/UX       0.00      0.00      0.00         3
                nan       0.92      1.00      0.96       114

           accuracy                           0.92       124
          macro avg       0.18      0.20      0.19       124
       weighted avg       0.85      0.92      0.88       124



# Scores by Target

### Discussion Type

|       Class         | Precision |  Recall  | F1-Score |
|---------------------|-----------|----------|----------|
| Deliberation        |   0.82    |   0.50   |   0.62   |
| Imaginative Entry   |   0.00    |   0.00   |   0.00   |
| Other               |   0.00    |   0.00   |   0.00   |
| Procedure           |   0.75    |   0.67   |   0.71   |
| Seminar             |   0.76    |   0.97   |   0.85   |
| Social              |   0.82    |   0.64   |   0.72   |
| UX                  |   0.50    |   0.30   |   0.37   |

- **Accuracy**: 0.7419
- **Macro Avg Precision**: 0.52
- **Macro Avg Recall**: 0.44
- **Macro Avg F1-Score**: 0.47
- **Weighted Avg Precision**: 0.71
- **Weighted Avg Recall**: 0.74
- **Weighted Avg F1-Score**: 0.71


### Dialogic Spell

|   Class   | Precision | Recall | F1-Score | Support |
|-----------|-----------|--------|----------|---------|
|     1   |   0.65    |  0.70  |   0.67   |    40   |
|     2   |   0.59    |  0.62  |   0.60   |    26   |
|     3  |   0.57    |  0.31  |   0.40   |    13   |
|     4  |   0.00    |  0.00  |   0.00   |    6    |
|     5  |   1.00    |  0.33  |   0.50   |    3    |
|     6   |   0.00    |  0.00  |   0.00   |    2    |
|     7   |   0.67    |  1.00  |   0.80   |    4    |
|    nan    |   0.55    |  0.73  |   0.63   |    30   |

- **Accuracy**: 0.6048
- **Macro Avg Precision**: 0.50
- **Macro Avg Recall**: 0.46
- **Macro Avg F1-Score**: 0.45
- **Weighted Avg Precision**: 0.57
- **Weighted Avg Recall**: 0.60
- **Weighted Avg F1-Score**: 0.58


### Uptake

|   Class   | Precision | Recall | F1-Score | Support |
|-----------|-----------|--------|----------|---------|
|   Affirm  |    0.50   |  0.46  |   0.48   |    28   |
|  Clarify  |    0.33   |  0.09  |   0.14   |    11   |
|  Disagree |    0.00   |  0.00  |   0.00   |    2    |
| Elaborate |    0.27   |  0.18  |   0.22   |    22   |
|   Filler  |    0.50   |  0.25  |   0.33   |    12   |
|    nan    |    0.53   |  0.80  |   0.63   |    49   |

- **Accuracy**: 0.4839
- **Macro Avg Precision**: 0.35
- **Macro Avg Recall**: 0.30
- **Macro Avg F1-Score**: 0.30
- **Weighted Avg Precision**: 0.45
- **Weighted Avg Recall**: 0.48
- **Weighted Avg F1-Score**: 0.44


### Question

|   Class   | Precision | Recall | F1-Score | Support |
|-----------|-----------|--------|----------|---------|
|   C-HOT   |    0.00   |  0.00  |   0.00   |    4    |
|   C-LOT   |    0.00   |  0.00  |   0.00   |    9    |
|   O-HOT   |    1.00   |  0.50  |   0.67   |    2    |
|   O-LOT   |    0.00   |  0.00  |   0.00   |    2    |
|    nan    |    0.87   |  1.00  |   0.93   |   107   |

- **Accuracy**: 0.871
- **Macro Avg Precision**: 0.37
- **Macro Avg Recall**: 0.30
- **Macro Avg F1-Score**: 0.32
- **Weighted Avg Precision**: 0.767
- **Weighted Avg Recall**: 0.871
- **Weighted Avg F1-Score**: 0.814


### From Pivot

|       Class        | Precision | Recall | F1-Score | Support |
|---------------------|-----------|--------|----------|---------|
|     Deliberation   |    0.00   |  0.00  |   0.00   |    2    |
|  Imaginative Entry |    0.00   |  0.00  |   0.00   |    1    |
|       Seminar      |    0.00   |  0.00  |   0.00   |    4    |
| Social/Procedure/UX|    0.00   |  0.00  |   0.00   |    3    |
|         nan         |    0.92   |  1.00  |   0.96   |   114   |

- **Accuracy**: 0.919
- **Macro Avg Precision**: 0.18
- **Macro Avg Recall**: 0.20
- **Macro Avg F1-Score**: 0.19
- **Weighted Avg Precision**: 0.845
- **Weighted Avg Recall**: 0.919
- **Weighted Avg F1-Score**: 0.881


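### To Pivot

> **Note (needs verification):** every figure in this table is identical to the *From Pivot* table above, and the saved output of the evaluation cell is labelled `from_pivot`. Re-run the notebook with `TARGET_COLUMN = 'to_pivot'` to confirm these numbers.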

|       Class        | Precision | Recall | F1-Score | Support |
|---------------------|-----------|--------|----------|---------|
|     Deliberation   |    0.00   |  0.00  |   0.00   |    2    |
|  Imaginative Entry |    0.00   |  0.00  |   0.00   |    1    |
|       Seminar      |    0.00   |  0.00  |   0.00   |    4    |
| Social/Procedure/UX|    0.00   |  0.00  |   0.00   |    3    |
|         nan         |    0.92   |  1.00  |   0.96   |   114   |

- **Accuracy**: 0.919
- **Macro Avg Precision**: 0.18
- **Macro Avg Recall**: 0.20
- **Macro Avg F1-Score**: 0.19
- **Weighted Avg Precision**: 0.845
- **Weighted Avg Recall**: 0.919
- **Weighted Avg F1-Score**: 0.881
