In [94]:
#!pip install xgboost

In [95]:
with open("/content/drive/MyDrive/Knjiga.tsv", encoding="utf8") as dat:
    linije_teksta=dat.readlines()
dat.closed
dat.closed
dat.closed

True

In [96]:
##from google.colab import drive
##drive.mount('/content/drive')

In [97]:
## razdvajanje teksta od oznaka
tekst=[]
oznake=[]
for linija in linije_teksta:
    try:
        linija_l=linija.split('\t')
        oznake.append(linija_l[1].rstrip())
        tekst.append(linija_l[0])
    except:
        continue

In [98]:
oznake=[x if x != "p" else "pozitivno" for x in oznake]
print(set(oznake))

{'pozitivno', 'negativno', 'neutralno'}


In [99]:
import pandas as pd

df = pd.DataFrame({"tekst": tekst, "oznake": oznake})
df["oznake"].value_counts()

oznake
neutralno    2709
pozitivno     364
negativno      84
Name: count, dtype: int64

## Treniranje modela

In [100]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

vectorizer = TfidfVectorizer()
tekst=vectorizer.fit_transform(tekst)

skaliranje = StandardScaler(with_mean=False)
tekst=skaliranje.fit_transform(tekst)
le = LabelEncoder()
le.fit(df["oznake"])
print(dict(zip(le.classes_,le.transform(le.classes_))))
oznake = le.transform(df["oznake"])

# Split the data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(
    tekst, oznake, test_size=0.3, random_state=42, stratify=oznake)

# Convert text data into numerical features using TF-IDF
#vectorizer = TfidfVectorizer()
#X_train = vectorizer.fit_transform(X_train)
#X_test = vectorizer.transform(X_test)

{'negativno': 0, 'neutralno': 1, 'pozitivno': 2}


## XGBoost

In [101]:
import xgboost as xgb

# Train the XGBoost model
model = xgb.XGBClassifier(n_jobs=-1)
model.fit(X_train, Y_train)

# Make predictions on the test set
pred = model.predict(X_test)

In [102]:
# Evaluate the model performance
from sklearn.metrics import accuracy_score, f1_score
točnost = accuracy_score(Y_test, pred)
print("Točnost: {:.2f}%".format(točnost * 100))
f1 = f1_score(Y_test, pred, average='macro')
print("F1: {:.2f}%".format(f1 * 100))

Točnost: 86.39%
F1: 39.70%


### Predviđanje

In [103]:
rečenice=["Volim kavu", "Ne volim kavu"]
X_vektori=vectorizer.transform(rečenice)
pred=model.predict(X_vektori)
le.inverse_transform(pred)

array(['neutralno', 'neutralno'], dtype=object)

## Pohrana modela

In [104]:
import pickle
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer

model_file_name = "xgb_reg.pkl"
vectorizer_file_name = 'vectorizer.pk'

# save
pickle.dump(model, open(model_file_name, "wb"))
with open(vectorizer_file_name, 'wb') as fin:
    pickle.dump(vectorizer, fin)

# load
xgb_model_učitano = pickle.load(open(model_file_name, "rb"))
vectorizer_učitano = pickle.load(open(vectorizer_file_name, "rb"))

# predict
pred_učitano = xgb_model_učitano.predict(
    vectorizer_učitano.transform(rečenice))
le.inverse_transform(pred_učitano)

array(['neutralno', 'neutralno'], dtype=object)

## SVM

In [105]:
from sklearn import svm
model=svm.SVC(kernel="linear", C=2)
model.fit(X_train, Y_train)

pred=model.predict(X_test)

In [106]:
from sklearn.metrics import accuracy_score, f1_score
točnost = accuracy_score(Y_test, pred)
print("Točnost: {:.2f}%".format(točnost * 100))
f1 = f1_score(Y_test, pred, average='macro')
print("F1: {:.2f}%".format(f1 * 100))

Točnost: 85.76%
F1: 31.38%


### Predviđanje

In [107]:
rečenice=["Volim kavu", "Ne volim kavu"]
X_vektori=vectorizer.transform(rečenice)
pred=model.predict(X_vektori)
le.inverse_transform(pred)

array(['neutralno', 'neutralno'], dtype=object)

## KNN

In [108]:
from sklearn.neighbors import KNeighborsClassifier
model=KNeighborsClassifier(n_neighbors=4)
model.fit(X_train, Y_train)

pred=model.predict(X_test)

In [109]:
from sklearn.metrics import accuracy_score, f1_score
točnost = accuracy_score(Y_test, pred)
print("Točnost: {:.2f}%".format(točnost * 100))
f1 = f1_score(Y_test, pred, average='macro')
print("F1: {:.2f}%".format(f1 * 100))

Točnost: 85.76%
F1: 30.78%


### Predviđanje

In [110]:
rečenice=["Volim kavu", "Doria Russell napisala je tri scenarija, od svega na kraju nije bilo ništa."]
X_vektori=vectorizer.transform(rečenice)
pred=model.predict(X_vektori)
le.inverse_transform(pred)

array(['neutralno', 'neutralno'], dtype=object)

## Naive Bayes

In [111]:
from sklearn.naive_bayes import GaussianNB
model=GaussianNB()
model.fit(X_train.toarray(), Y_train)

pred=model.predict(X_test.toarray())

In [112]:
from sklearn.metrics import accuracy_score, f1_score
točnost = accuracy_score(Y_test, pred)
print("Točnost: {:.2f}%".format(točnost * 100))
f1 = f1_score(Y_test, pred, average='macro')
print("F1: {:.2f}%".format(f1 * 100))

Točnost: 76.05%
F1: 33.87%


In [113]:
import tensorflow as tf

model=tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation="relu"),
    tf.keras.layers.Dense(16, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

In [114]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=["accuracy"])

In [117]:
model.fit(X_train, Y_train, batch_size=32, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7aa572784460>

In [118]:
from sklearn.metrics import accuracy_score, f1_score
točnost = accuracy_score(Y_test, pred)
print("Točnost: {:.2f}%".format(točnost * 100))
f1 = f1_score(Y_test, pred, average='macro')
print("F1: {:.2f}%".format(f1 * 100))

Točnost: 76.05%
F1: 33.87%
