# Imports

In [1]:
import pandas as pd
import numpy as np
import os
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import string
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hookvan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# SETUP

In [65]:
topics = os.listdir("../data_raw/topics")
df_topics_list = []
for topic in topics:
    files = os.listdir(f"../data_raw/topics/{topic}")
    df_topic = pd.DataFrame(columns=["poem", "labels"])
    i = 0
    for filename in files:
        with open(f"../data_raw/topics/{topic}/{filename}", encoding="utf8") as f:
            df_topic.loc[i] = {"poem": f.read(), "labels": topic}
        i += 1
    df_topics_list.append(df_topic)

df_topics = pd.concat(df_topics_list, ignore_index=True)

In [66]:
stop_words = stopwords.words("english")
df = df_topics
df["poem"] = df["poem"].str.replace("\n", " ").str.lower().str.translate(str.maketrans('', '', string.punctuation + "‘’")).replace("\d+",  "", regex=True)
df["poem"] = df["poem"].apply(lambda poem: " ".join([word for word in poem.split() if word not in stop_words]))
df = df[df["poem"].str.len() > 20].reset_index(drop=True)
indices_to_remove = [3992, 9431, 11216, 12517, 12604]
df = df.drop(indices_to_remove).reset_index(drop=True)

In [67]:
df

Unnamed: 0,poem,labels
0,though watched many mourners weep oer real dea...,hope
1,hope timid friend sat without grated den watch...,hope
2,hope lies tomorrow betrayed yesterday every ne...,hope
3,dont give hope dont give hope still whole slew...,hope
4,remember hope goes long way long little still ...,hope
...,...,...
14288,left dump cafe ankling need hack im johnnie wa...,chicago
14289,got name people say cuz el train tracks went r...,chicago
14290,tidy house dust especially living room forget ...,chicago
14291,bridges chicago bridges paris bridges amsterda...,chicago


In [68]:
filtered_df = df #.groupby('labels').head(20)
filtered_df

Unnamed: 0,poem,labels
0,though watched many mourners weep oer real dea...,hope
1,hope timid friend sat without grated den watch...,hope
2,hope lies tomorrow betrayed yesterday every ne...,hope
3,dont give hope dont give hope still whole slew...,hope
4,remember hope goes long way long little still ...,hope
...,...,...
14288,left dump cafe ankling need hack im johnnie wa...,chicago
14289,got name people say cuz el train tracks went r...,chicago
14290,tidy house dust especially living room forget ...,chicago
14291,bridges chicago bridges paris bridges amsterda...,chicago


In [69]:
np.unique(filtered_df['labels'], return_counts =True)

(array(['alone', 'america', 'angel', 'anger', 'animal', 'baby', 'beach',
        'beautiful', 'beauty', 'believe', 'birth', 'brother', 'butterfly',
        'car', 'carpe diem', 'change', 'chicago', 'childhood', 'children',
        'christmas', 'cinderella', 'city', 'courage', 'crazy', 'culture',
        'dance', 'dark', 'daughter', 'death', 'depression', 'despair',
        'destiny', 'dream', 'evil', 'faith', 'family', 'father', 'fear',
        'fire', 'food', 'football', 'freedom', 'friend', 'frog', 'funeral',
        'funny', 'future', 'girl', 'god', 'graduation', 'greed', 'green',
        'hair', 'happiness', 'happy', 'hate', 'heaven', 'hero', 'home',
        'hope', 'house', 'hunting', 'husband', 'identity', 'innocence',
        'january', 'joy', 'june', 'justice', 'kiss', 'laughter', 'life',
        'lonely', 'loss', 'lost', 'love', 'lust', 'marriage', 'memory',
        'mirror', 'money', 'moon', 'mother', 'murder', 'music', 'nature',
        'night', 'ocean', 'paris', 'passion', 

In [70]:
filtered_df[filtered_df['labels'] == 'love']['poem'].iloc[2]

'hours laying bed almost time get work head im laughing wishing fantasy finally choosing really want every second relive moments enjoying every second heard love forever since smile ever lasted face ever come close every taken place times seems pointless like never meant times wait see still wait hang slim glimpse hope losing something couldnt cope years past seasons came went thoughts sleepless nights spent even day still remember first kiss still love love always miss'

# XGB Classifier

In [71]:
#vectorizer = CountVectorizer()
#X = vectorizer.fit_transform(filtered_df['poem'])

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=1000) 
X = tfidf_vectorizer.fit_transform(filtered_df['poem'])


# encoding labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(filtered_df['labels'])


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
model = xgb.XGBClassifier()
model.fit(X_train, y_train)

In [None]:
new_poem = ["I miss you "]

In [None]:
new_poem_transformed = tfidf_vectorizer.transform(new_poem)
predicted_label = model.predict(new_poem_transformed)

predicted_label_decoded = label_encoder.inverse_transform(predicted_label)
print("Predicted Label:", predicted_label_decoded[0])


In [None]:
# Get probs of all labels ( for multilabel?)

probabilities = model.predict_proba(new_poem_transformed)
labels = label_encoder.classes_

prob_dict = {label: prob for label, prob in zip(labels, probabilities[0])}

sorted_probabilities = sorted(prob_dict.items(), key=lambda x: x[1], reverse=True)

# Print probabilities for each label in descending order
for label, probability in sorted_probabilities:
    print(f"Probability of '{label}': {probability:.4f}")

In [None]:
from sklearn.metrics import accuracy_score
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


# XGBOOST + BERT

In [64]:
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

data = filtered_df.groupby('labels').head(2)  
X = data['poem']

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['labels'])

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 128 

tokenized_texts = [tokenizer.tokenize(text)[:max_length] for text in X]
input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts]
input_ids = [ids[:max_length] + [0] * (max_length - len(ids)) for ids in input_ids]  

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertModel.from_pretrained('bert-base-uncased').to(device)
model.eval()

bert_embeddings = []
batch_size = 8

with torch.no_grad():
    for i in range(0, len(input_ids), batch_size):
        batch_input_ids = torch.tensor(input_ids[i:i+batch_size]).to(device)
        outputs = model(batch_input_ids)
        bert_embeddings.extend(outputs[0][:, 0, :].cpu().numpy()) 

X_features = np.array(bert_embeddings)
print(X_features)
print(y)
X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.2, random_state=42)

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'multi:softmax',
    'num_class': len(np.unique(y)),
    'max_depth': 6,
    'eta': 0.3,
    'subsample': 0.8,
    'colsample_bytree': 0.8
}

num_rounds = 100
xgb_model = xgb.train(params, dtrain, num_rounds)

# Predictions
y_pred = xgb_model.predict(dtest)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

[[ 0.295126   -0.05824259  0.42334542 ... -0.50325745  0.32414806
  -0.1545983 ]
 [ 0.11506277 -0.17986692  0.23144366 ... -0.5288107   0.31901187
  -0.22553356]
 [ 0.07985287 -0.32657608 -0.17349988 ... -0.30127302  0.45985243
  -0.19309199]
 ...
 [-0.07277451 -0.27822712 -0.08247866 ... -0.3087592   1.006625
  -0.27042916]
 [-0.19649386 -0.30289686 -0.01229891 ... -0.43488362  0.68383104
   0.07407475]
 [-0.25862762 -0.33636153  0.02183051 ... -0.37830243  0.90625334
  -0.06640918]]
[ 59  59 133 133  45  45  67  67  50  50  14  14 102 102  22  22  20  20
  99  99 106 106 100 100 114 114  51  51  52  52  65  65 126 126 109 109
 131 131  43  43   4   4  77  77  13  13  34  34  29  29   0   0  37  37
 138 138 110 110 121 121  58  58  62  62   9   9  91  91  90  90 113 113
 132 132 142 142  44  44  98  98  28  28 136 136  83  83  68  68  18  18
  69  69   3   3   5   5  40  40  26  26  10  10   7   7 120 120  76  76
  49  49  75  75  54  54 122 122  63  63 130 130  57  57  92  92  60  60

# XGBOOST test