In [64]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl (297.1 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.1/297.1 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-2.0.3


In [37]:
with open("Knjiga.tsv", encoding="utf8") as dat:
    linije_teksta=dat.readlines()
dat.closed
dat.closed
dat.closed

True

In [54]:
## razdvajanje teksta od oznaka
tekst=[]
oznake=[]
for linija in linije_teksta:
    try:
        linija_l=linija.split('\t')
        oznake.append(linija_l[1].rstrip())
        tekst.append(linija_l[0])
    except:
        continue

In [55]:
oznake=[x if x != "p" else "pozitivno" for x in oznake]
print(set(oznake))

{'pozitivno', 'neutralno', 'negativno'}


In [56]:
import pandas as pd

df = pd.DataFrame({"tekst": tekst, "oznake": oznake})
df["oznake"].value_counts()

oznake
neutralno    2709
pozitivno     364
negativno      84
Name: count, dtype: int64

In [70]:
# from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(oznake)
print(le.classes_)
oznake = le.transform(oznake)
print(oznake[:20])

[0 1 2]
[1 1 1 1 1 0 1 1 1 1 0 2 1 0 1 1 1 1 1 1]


## Treniranje modela

In [108]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Split the data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(
    tekst, oznake, test_size=0.3, random_state=42, stratify=oznake)

# Convert text data into numerical features using TF-IDF
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

## XGBoost

In [109]:
import xgboost as xgb

# Train the XGBoost model
model = xgb.XGBClassifier(n_jobs=-1)
model.fit(X_train, Y_train)

# Make predictions on the test set
pred = model.predict(X_test)

In [110]:
# Evaluate the model performance
točnost = accuracy_score(Y_test, pred)
print("Točnost: {:.2f}%".format(točnost * 100))
f1 = f1_score(Y_test, pred, average='macro')
print("F1: {:.2f}%".format(f1 * 100))

Točnost: 86.39%
F1: 39.68%


### Predviđanje

In [114]:
rečenice=["Volim kavu", "Mrzim kavu"]
X_vektori=vectorizer.transform(rečenice)
pred=model.predict(X_vektori)
le.inverse_transform(pred)

array([1, 1])

## Pohrana modela

In [83]:
import pickle
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer

model_file_name = "xgb_reg.pkl"
vectorizer_file_name = 'vectorizer.pk'

# save
pickle.dump(model, open(model_file_name, "wb"))
with open(vectorizer_file_name, 'wb') as fin:
    pickle.dump(vectorizer, fin)

# load
xgb_model_učitano = pickle.load(open(model_file_name, "rb"))
vectorizer_učitano = pickle.load(open(vectorizer_file_name, "rb"))

# predict
pred_učitano = xgb_model_učitano.predict(
    vectorizer_učitano.transform(rečenice))
le.inverse_transform(pred_učitano)

array([1, 1])

## SVM

In [169]:
from sklearn import svm
model=svm.SVC(kernel="linear", C=2)
model.fit(X_train, Y_train)

pred=model.predict(X_test)

In [170]:
točnost = accuracy_score(Y_test, pred)
print("Točnost: {:.2f}%".format(točnost * 100))
f1 = f1_score(Y_test, pred, average='macro')
print("F1: {:.2f}%".format(f1 * 100))

Točnost: 86.08%
F1: 40.19%


## Predviđanje

In [171]:
rečenice=["Volim kavu", "Ne volim kavu"]
X_vektori=vectorizer.transform(rečenice)
pred=model.predict(X_vektori)
le.inverse_transform(pred)

array([1, 1])

## KNN

In [196]:
from sklearn.neighbors import KNeighborsClassifier
model=KNeighborsClassifier(n_neighbors=4)
model.fit(X_train, Y_train)

pred=model.predict(X_test)

In [197]:
točnost = accuracy_score(Y_test, pred)
print("Točnost: {:.2f}%".format(točnost * 100))
f1 = f1_score(Y_test, pred, average='macro')
print("F1: {:.2f}%".format(f1 * 100))

Točnost: 85.86%
F1: 35.46%


### Predviđanje

In [198]:
rečenice=["Volim kavu", "Ne volim kavu"]
X_vektori=vectorizer.transform(rečenice)
pred=model.predict(X_vektori)
le.inverse_transform(pred)

array([1, 1])