In [None]:
!pip install xgboost scikit-learn pandas - U


# 1. Preprocessing and Cleaning

In [None]:
# Read (text, label) pairs from the tab-separated file.
# Line 0 is treated as a header and skipped; reading stops after the line with
# index LAST_LINE_INDEX, so at most that many data rows are collected.
LAST_LINE_INDEX = 230

texts = []
labels = []
# Explicit encoding so the text decodes the same way on every platform.
# NOTE(review): assumes the export is UTF-8 — adjust if the file was saved otherwise.
with open('Knjiga.tsv', encoding='utf-8') as input_file:
    for i, line in enumerate(input_file):
        if i == 0:
            continue  # header row
        line = line.strip()
        if line:
            # Split on the LAST tab only: the label is the final column, and the
            # text itself may contain tab characters.
            text, separator, label = line.rpartition("\t")
            if not separator:
                raise ValueError(
                    "line {} of Knjiga.tsv has no tab-separated label: {!r}".format(i + 1, line))
            texts.append(text)
            labels.append(label)
        # Blank lines are skipped above but still count toward the line limit.
        if i == LAST_LINE_INDEX:
            break


In [None]:
# Inspect the texts collected by the loading cell.
texts


In [None]:
# Inspect the labels as read from the file (same order as `texts`).
labels


### Print unique labels

In [None]:
# Collapse the labels to their distinct values so casing and spelling variants stand out.
set(labels)


In [None]:
# Known misspellings (already lowercase) mapped to the canonical label they stand for.
label_replace_dict = dict(
    neutealno="neutralno",
    pozitivni="pozitivno",
    pozitivnk="pozitivno",
)


In [None]:
# Normalize casing so labels that differ only by capitalization become identical.
lower_cased_label = list(map(str.lower, labels))
lower_cased_label


In [None]:
# Swap every known misspelling for its canonical label; any other value passes through unchanged.
labels = [label_replace_dict.get(raw_label, raw_label)
          for raw_label in lower_cased_label]


In [None]:
# Distinct label values at this point in the notebook.
set(labels)


## 1a. Build a DataFrame to inspect the label distribution

In [None]:
import pandas as pd

# Put texts and labels side by side, then count how many samples each label has.
df = pd.DataFrame(dict(text=texts, label=labels))
df.loc[:, "label"].value_counts()


In [None]:
# (number of rows, number of columns) of the DataFrame.
df.shape

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the string labels into integer class ids. `le` keeps the mapping so that
# predicted ids can be converted back to label names with inverse_transform.
le = LabelEncoder()
labels = le.fit_transform(labels)
print(le.classes_)
print(labels)


# 2. Training


In [None]:
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Hold out 20% of the samples for evaluation; the fixed random_state keeps the
# split identical from run to run.
split = train_test_split(texts, labels, test_size=0.2, random_state=42)
texts_train, texts_test, labels_train, labels_test = split

# TF-IDF features: the vectorizer is fitted on the training texts only and then
# applied unchanged to the held-out texts.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)

# XGBoost classifier with default hyperparameters; n_jobs=-1 is passed through
# as the parallelism setting.
model = xgb.XGBClassifier(n_jobs=-1)
model.fit(X_train, labels_train)

# Predicted class ids for the held-out set (scored in the next cell).
predictions = model.predict(X_test)


In [None]:
# Evaluate the model performance
accuracy = accuracy_score(labels_test, predictions)
print("Accuracy: {:.2f}%".format(accuracy * 100))
f1 = f1_score(labels_test, predictions, average='macro')
print("F1-Score: {:.2f}%".format(f1 * 100))


# 3. Predicting on New Text

In [None]:
# Classify two unseen sentences with the fitted vectorizer and model.
predict_texts = ["volim kavu", "ne volim kavu"]
X_predict = vectorizer.transform(predict_texts)
# Use a dedicated name here: rebinding `predictions` would overwrite the
# test-set predictions, and re-running the evaluation cell afterwards would
# then score the wrong array.
new_text_predictions = model.predict(X_predict)
# Map the predicted class ids back to their label names.
le.inverse_transform(new_text_predictions)


# 4. Reusing the Model for Inference/Prediction

In [None]:
import pickle
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer

model_file_name = "xgb_reg.pkl"
vectorizer_file_name = 'vectorizer.pk'
# The label encoder is persisted as well: without it a fresh process can only
# produce integer class ids, not label names.
label_encoder_file_name = "label_encoder.pkl"

# save — each file is written inside a `with` block so it is flushed and closed
# before it is read back below
with open(model_file_name, "wb") as model_out:
    pickle.dump(model, model_out)
with open(vectorizer_file_name, "wb") as vectorizer_out:
    pickle.dump(vectorizer, vectorizer_out)
with open(label_encoder_file_name, "wb") as label_encoder_out:
    pickle.dump(le, label_encoder_out)

# load
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
with open(model_file_name, "rb") as model_in:
    xgb_model_loaded = pickle.load(model_in)
with open(vectorizer_file_name, "rb") as vectorizer_in:
    vectorizer_loaded = pickle.load(vectorizer_in)
with open(label_encoder_file_name, "rb") as label_encoder_in:
    le_loaded = pickle.load(label_encoder_in)

# predict — uses only the reloaded objects, as a separate inference process would
predictions_loaded = xgb_model_loaded.predict(
    vectorizer_loaded.transform(predict_texts))
le_loaded.inverse_transform(predictions_loaded)
