In [None]:
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split 
from sklearn import metrics 
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
import ast
from gensim.models import Word2Vec


In [2]:
df = pd.read_csv("final_merged.csv")

In [3]:
# Grupa 1: Pozitivne emocije
positive_emotions = [
    'cheerfulness', 'joy', 'contentment', 'love', 'warmth',
    'positive_emotion', 'fun', 'giving', 'friends'
]

# Grupa 2: Negativne emocije
negative_emotions = [
    'sadness', 'disgust', 'suffering', 'negative_emotion',
    'weakness', 'neglect'
]

# Grupa 3: Socijalne emocije
social_emotions = [
    'pride', 'shame', 'politeness', 'affection', 'leader',
    'dominant_personality', 'childish', 'trust', 'sympathy'
]

# Grupa 4: Intenzivne emocije
intense_emotions = [
    'surprise', 'rage', 'horror', 'fear', 'exasperation',
    'nervousness', 'irritability', 'torment', 'pain', 'hate', 'anger'
]

# Grupa 5: Kognitivno-emotivne emocije
cognitive_emotions = [
    'anticipation', 'confusion', 'envy', 'disappointment',
    'optimism', 'zest', 'achievement'
]

# Kreiranje novih kolona kao zbir postojećih
df['emotion_positive'] = df[[f'empath_result.{x}' for x in positive_emotions]].sum(axis=1)
df['emotion_negative'] = df[[f'empath_result.{x}' for x in negative_emotions]].sum(axis=1)
df['emotion_social']   = df[[f'empath_result.{x}' for x in social_emotions]].sum(axis=1)
df['emotion_intense']  = df[[f'empath_result.{x}' for x in intense_emotions]].sum(axis=1)
df['emotion_cognitive'] = df[[f'empath_result.{x}' for x in cognitive_emotions]].sum(axis=1)


In [4]:
feature_cols_users = ['followers_count', 'favourites_count','friends_count',
       'statuses_count', 'listed_count', 'cred','BotScore',
       'normalize_influence']

In [5]:
feature_cols_empath = ['emotion_positive', 'emotion_negative',
       'emotion_social', 'emotion_intense', 'emotion_cognitive']

In [6]:

df['tweet_tokens'] = df['tweet_tokens'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# TF-IDF + User/Empath Features

In [None]:
from scipy.sparse import hstack


vectorizer = TfidfVectorizer(max_features=5000)
X_text = vectorizer.fit_transform(df['tweet_new_x'])

X_users = df[feature_cols_users].values
X_empath = df[feature_cols_empath].values
X_users_empath = df[feature_cols_users + feature_cols_empath].values

X_tfidf_users = hstack([X_text, X_users])
X_tfidf_empath = hstack([X_text, X_empath])
X_tfidf_users_empath = hstack([X_text, X_users_empath])

y = df['BinaryNumTarget'].astype(int)

X_train_users, X_test_users, y_train_users, y_test_users = train_test_split(X_tfidf_users, y, test_size=0.2, random_state=1)
X_train_empath, X_test_empath, y_train_empath, y_test_empath = train_test_split(X_tfidf_empath, y, test_size=0.2, random_state=1)
X_train_users_empath, X_test_users_empath, y_train_users_empath, y_test_users_empath = train_test_split(X_tfidf_users_empath, y, test_size=0.2, random_state=1)

models = {
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Decision Tree": DecisionTreeClassifier(),
    "Naive Bayes": MultinomialNB(),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "SVM": SVC(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "Logistic Regression": LogisticRegression(max_iter=1000)
}

def train_and_evaluate(X_train, X_test, y_train, y_test, title):
    print(f"\nRezultati za: {title}")
    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        prec = metrics.precision_score(y_test, y_pred)
        rec = metrics.recall_score(y_test, y_pred)
        macro_f1 = metrics.f1_score(y_test, y_pred, average='macro')
        weighted_f1 = metrics.f1_score(y_test, y_pred, average='weighted')

        results[name] = (acc, prec, rec, macro_f1, weighted_f1)
        print(f"{name}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1 Macro={macro_f1:.4f}, F1 Weighted={weighted_f1:.4f}")

        report = classification_report(y_test, y_pred, output_dict=True)
        df_report = pd.DataFrame(report).transpose()

        print("Klasa 0:", round(df_report.loc['0', 'f1-score'], 3))
        print("Klasa 1:", round(df_report.loc['1', 'f1-score'], 3))
    return results




In [8]:

results_users = train_and_evaluate(X_train_users, X_test_users, y_train_users, y_test_users, "TF-IDF + User features")



Rezultati za: TF-IDF + User features
Random Forest: Accuracy=0.9728, Precision=0.9753, Recall=0.9720, F1 Macro=0.9728, F1 Weighted=0.9728
Klasa 0: 0.972
Klasa 1: 0.974
Decision Tree: Accuracy=0.9602, Precision=0.9619, Recall=0.9609, F1 Macro=0.9601, F1 Weighted=0.9602
Klasa 0: 0.959
Klasa 1: 0.961
Naive Bayes: Accuracy=0.9356, Precision=0.9319, Recall=0.9443, F1 Macro=0.9355, F1 Weighted=0.9356
Klasa 0: 0.933
Klasa 1: 0.938
KNN: Accuracy=0.6952, Precision=0.7036, Recall=0.7072, F1 Macro=0.6949, F1 Weighted=0.6952
Klasa 0: 0.684
Klasa 1: 0.705
SVM: Accuracy=0.9183, Precision=0.8936, Recall=0.9554, F1 Macro=0.9179, F1 Weighted=0.9181
Klasa 0: 0.912
Klasa 1: 0.923


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost: Accuracy=0.9628, Precision=0.9501, Recall=0.9794, F1 Macro=0.9627, F1 Weighted=0.9628
Klasa 0: 0.961
Klasa 1: 0.965
Logistic Regression: Accuracy=0.9749, Precision=0.9727, Recall=0.9788, F1 Macro=0.9749, F1 Weighted=0.9749
Klasa 0: 0.974
Klasa 1: 0.976


In [9]:

results_empath = train_and_evaluate(X_train_empath, X_test_empath, y_train_empath, y_test_empath, "TF-IDF + Empath features")



Rezultati za: TF-IDF + Empath features
Random Forest: Accuracy=0.9781, Precision=0.9802, Recall=0.9772, F1 Macro=0.9780, F1 Weighted=0.9781
Klasa 0: 0.977
Klasa 1: 0.979
Decision Tree: Accuracy=0.9679, Precision=0.9669, Recall=0.9710, F1 Macro=0.9679, F1 Weighted=0.9679
Klasa 0: 0.967
Klasa 1: 0.969
Naive Bayes: Accuracy=0.9453, Precision=0.9387, Recall=0.9565, F1 Macro=0.9452, F1 Weighted=0.9453
Klasa 0: 0.943
Klasa 1: 0.948
KNN: Accuracy=0.7303, Precision=0.6569, Recall=0.9994, F1 Macro=0.7034, F1 Weighted=0.7063
Klasa 0: 0.614
Klasa 1: 0.793
SVM: Accuracy=0.9890, Precision=0.9873, Recall=0.9914, F1 Macro=0.9890, F1 Weighted=0.9890
Klasa 0: 0.989
Klasa 1: 0.989


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost: Accuracy=0.9624, Precision=0.9501, Recall=0.9785, F1 Macro=0.9623, F1 Weighted=0.9624
Klasa 0: 0.961
Klasa 1: 0.964
Logistic Regression: Accuracy=0.9744, Precision=0.9726, Recall=0.9780, F1 Macro=0.9744, F1 Weighted=0.9744
Klasa 0: 0.973
Klasa 1: 0.975


In [10]:

results_users_empath = train_and_evaluate(X_train_users_empath, X_test_users_empath, y_train_users_empath, y_test_users_empath, "TF-IDF + User + Empath features")


Rezultati za: TF-IDF + User + Empath features
Random Forest: Accuracy=0.9727, Precision=0.9735, Recall=0.9736, F1 Macro=0.9727, F1 Weighted=0.9727
Klasa 0: 0.972
Klasa 1: 0.974
Decision Tree: Accuracy=0.9599, Precision=0.9620, Recall=0.9603, F1 Macro=0.9599, F1 Weighted=0.9599
Klasa 0: 0.959
Klasa 1: 0.961
Naive Bayes: Accuracy=0.9356, Precision=0.9319, Recall=0.9442, F1 Macro=0.9355, F1 Weighted=0.9356
Klasa 0: 0.933
Klasa 1: 0.938
KNN: Accuracy=0.6963, Precision=0.7046, Recall=0.7086, F1 Macro=0.6959, F1 Weighted=0.6963
Klasa 0: 0.685
Klasa 1: 0.707
SVM: Accuracy=0.9178, Precision=0.8938, Recall=0.9542, F1 Macro=0.9175, F1 Weighted=0.9177
Klasa 0: 0.912
Klasa 1: 0.923


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost: Accuracy=0.9614, Precision=0.9486, Recall=0.9782, F1 Macro=0.9613, F1 Weighted=0.9614
Klasa 0: 0.959
Klasa 1: 0.963
Logistic Regression: Accuracy=0.9750, Precision=0.9729, Recall=0.9788, F1 Macro=0.9750, F1 Weighted=0.9750
Klasa 0: 0.974
Klasa 1: 0.976


# Word2Vec + User/Empath

In [None]:
sentences = df['tweet_tokens'].tolist()

w2v_model = Word2Vec(sentences, vector_size=300, window=10, min_count=5, workers=4, sg=1, negative=10)

def vectorize_tweet(tokens):
    vectors = [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]
    if len(vectors) == 0:
        return np.zeros(w2v_model.vector_size)
    else:
        return np.mean(vectors, axis=0)

X_w2v = np.array([vectorize_tweet(tokens) for tokens in df['tweet_tokens']])
X_users = df[feature_cols_users].values
X_empath = df[feature_cols_empath].values

X_w2v_users = np.hstack([X_w2v, X_users])
X_w2v_empath = np.hstack([X_w2v, X_empath])
X_w2v_users_empath = np.hstack([X_w2v, X_users, X_empath])

y = df['BinaryNumTarget'].astype(int)

X_train_w2v_users, X_test_w2v_users, y_train_w2v_users, y_test_w2v_users = train_test_split(X_w2v_users, y, test_size=0.2, random_state=1)
X_train_w2v_empath, X_test_w2v_empath, y_train_w2v_empath, y_test_w2v_empath = train_test_split(X_w2v_empath, y, test_size=0.2, random_state=1)
X_train_w2v_users_empath, X_test_w2v_users_empath, y_train_w2v_users_empath, y_test_w2v_users_empath = train_test_split(X_w2v_users_empath, y, test_size=0.2, random_state=1)

models = {
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Decision Tree": DecisionTreeClassifier(),
    "Naive Bayes": GaussianNB(), 
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "SVM": SVC(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "Logistic Regression": LogisticRegression(max_iter=1000)
}


In [16]:
train_and_evaluate(X_train_w2v_users, X_test_w2v_users, y_train_w2v_users, y_test_w2v_users, "Word2Vec + User")



Rezultati za: Word2Vec + User
Random Forest: Accuracy=0.9565, Precision=0.9518, Recall=0.9645, F1 Macro=0.9564, F1 Weighted=0.9565
Klasa 0: 0.955
Klasa 1: 0.958
Decision Tree: Accuracy=0.8415, Precision=0.8470, Recall=0.8456, F1 Macro=0.8413, F1 Weighted=0.8415
Klasa 0: 0.836
Klasa 1: 0.846
Naive Bayes: Accuracy=0.7899, Precision=0.8106, Recall=0.7736, F1 Macro=0.7899, F1 Weighted=0.7899
Klasa 0: 0.788
Klasa 1: 0.792
KNN: Accuracy=0.8254, Precision=0.8392, Recall=0.8184, F1 Macro=0.8253, F1 Weighted=0.8254
Klasa 0: 0.822
Klasa 1: 0.829
SVM: Accuracy=0.8887, Precision=0.8834, Recall=0.9035, F1 Macro=0.8885, F1 Weighted=0.8886
Klasa 0: 0.884
Klasa 1: 0.893


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost: Accuracy=0.9654, Precision=0.9644, Recall=0.9687, F1 Macro=0.9654, F1 Weighted=0.9654
Klasa 0: 0.964
Klasa 1: 0.967
Logistic Regression: Accuracy=0.9066, Precision=0.9072, Recall=0.9123, F1 Macro=0.9065, F1 Weighted=0.9066
Klasa 0: 0.903
Klasa 1: 0.91


{'Random Forest': (0.9564812399865866,
  0.9517634485215533,
  0.9645461766192505,
  0.9564151652677509,
  0.9564694793189279),
 'Decision Tree': (0.8414993107045717,
  0.8469660808562957,
  0.8456206224276122,
  0.8413450085482101,
  0.84150336643628),
 'Naive Bayes': (0.7898953016133239,
  0.8105613557270389,
  0.7736298649722002,
  0.7898801379497538,
  0.789937267717327),
 'KNN': (0.8254033309735832,
  0.839244724176231,
  0.8183984403206007,
  0.825339021095346,
  0.8254462874747749),
 'SVM': (0.8887067327396699,
  0.8834368822366563,
  0.9035309408621561,
  0.8884933770660555,
  0.8886494865410425),
 'XGBoost': (0.965423450948247,
  0.9644166486952771,
  0.9687342046357138,
  0.9653826866510717,
  0.9654207067718898),
 'Logistic Regression': (0.9065911546629905,
  0.9072238977452247,
  0.9122680337930537,
  0.9064774088110275,
  0.9065817973653396)}

In [17]:

train_and_evaluate(X_train_w2v_empath, X_test_w2v_empath, y_train_w2v_empath, y_test_w2v_empath, "Word2Vec + Empath")



Rezultati za: Word2Vec + Empath
Random Forest: Accuracy=0.9576, Precision=0.9540, Recall=0.9644, F1 Macro=0.9576, F1 Weighted=0.9576
Klasa 0: 0.956
Klasa 1: 0.959
Decision Tree: Accuracy=0.8418, Precision=0.8485, Recall=0.8441, F1 Macro=0.8417, F1 Weighted=0.8418
Klasa 0: 0.837
Klasa 1: 0.846
Naive Bayes: Accuracy=0.7880, Precision=0.8068, Recall=0.7747, F1 Macro=0.7880, F1 Weighted=0.7880
Klasa 0: 0.786
Klasa 1: 0.79
KNN: Accuracy=0.9873, Precision=0.9877, Recall=0.9877, F1 Macro=0.9873, F1 Weighted=0.9873
Klasa 0: 0.987
Klasa 1: 0.988
SVM: Accuracy=0.9730, Precision=0.9713, Recall=0.9765, F1 Macro=0.9730, F1 Weighted=0.9730
Klasa 0: 0.972
Klasa 1: 0.974


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost: Accuracy=0.9670, Precision=0.9662, Recall=0.9700, F1 Macro=0.9670, F1 Weighted=0.9670
Klasa 0: 0.966
Klasa 1: 0.968
Logistic Regression: Accuracy=0.9064, Precision=0.9067, Recall=0.9125, F1 Macro=0.9063, F1 Weighted=0.9064
Klasa 0: 0.903
Klasa 1: 0.91


{'Random Forest': (0.9576362755691344,
  0.954,
  0.9644017618600621,
  0.9575761969856464,
  0.9576272935234842),
 'Decision Tree': (0.8417973844032938,
  0.8485156420120491,
  0.844104267456134,
  0.8416612386662947,
  0.8418098399217994),
 'Naive Bayes': (0.787995081783971,
  0.8067523873975487,
  0.7747129756661131,
  0.7879669765112134,
  0.7880451073503681),
 'KNN': (0.9872946085919744,
  0.9877238590410168,
  0.9876525380893927,
  0.9872816106668623,
  0.9872946237410711),
 'SVM': (0.9729870710533179,
  0.9712726228095375,
  0.9765326016318868,
  0.9729543174535458,
  0.9729844409462698),
 'XGBoost': (0.9669883378665375,
  0.9661943465439113,
  0.969961730088815,
  0.9669500924722063,
  0.9669860758169275),
 'Logistic Regression': (0.9064048586012892,
  0.9067231111430006,
  0.9124846559318363,
  0.9062881879471014,
  0.9063940169776689)}

In [18]:

train_and_evaluate(X_train_w2v_users_empath, X_test_w2v_users_empath, y_train_w2v_users_empath, y_test_w2v_users_empath, "Word2Vec + User + Empath")


Rezultati za: Word2Vec + User + Empath
Random Forest: Accuracy=0.9564, Precision=0.9532, Recall=0.9628, F1 Macro=0.9563, F1 Weighted=0.9564
Klasa 0: 0.955
Klasa 1: 0.958
Decision Tree: Accuracy=0.8414, Precision=0.8475, Recall=0.8446, F1 Macro=0.8412, F1 Weighted=0.8414
Klasa 0: 0.836
Klasa 1: 0.846
Naive Bayes: Accuracy=0.7903, Precision=0.8103, Recall=0.7752, F1 Macro=0.7903, F1 Weighted=0.7904
Klasa 0: 0.788
Klasa 1: 0.792
KNN: Accuracy=0.8263, Precision=0.8398, Recall=0.8198, F1 Macro=0.8262, F1 Weighted=0.8263
Klasa 0: 0.823
Klasa 1: 0.83
SVM: Accuracy=0.8888, Precision=0.8838, Recall=0.9032, F1 Macro=0.8886, F1 Weighted=0.8887
Klasa 0: 0.884
Klasa 1: 0.893


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost: Accuracy=0.9655, Precision=0.9651, Recall=0.9682, F1 Macro=0.9655, F1 Weighted=0.9655
Klasa 0: 0.964
Klasa 1: 0.967
Logistic Regression: Accuracy=0.9067, Precision=0.9078, Recall=0.9118, F1 Macro=0.9066, F1 Weighted=0.9067
Klasa 0: 0.903
Klasa 1: 0.91


{'Random Forest': (0.9564067215619062,
  0.9531774966044749,
  0.9628131995089898,
  0.9563462404219727,
  0.9563982457264801),
 'Decision Tree': (0.841387533067551,
  0.8474858716128097,
  0.8446097191132934,
  0.8412422167365702,
  0.841395944234492),
 'Naive Bayes': (0.7903424121614069,
  0.8102641509433962,
  0.7752184273232724,
  0.7903227348672184,
  0.7903877456199414),
 'KNN': (0.8262975520697492,
  0.839781048894149,
  0.8197703805328904,
  0.8262300443601247,
  0.8263396646296285),
 'SVM': (0.8887812511643504,
  0.8837784371909001,
  0.9032421113437793,
  0.8885716619387107,
  0.8887263330253297),
 'XGBoost': (0.9654979693729274,
  0.9650903332613546,
  0.9681565455989602,
  0.9654588691357231,
  0.9654960641564967),
 'Logistic Regression': (0.9067401915123514,
  0.9078360891445003,
  0.9118347895154885,
  0.906630481032293,
  0.9067329172305214)}

# FastText + User/Empath

In [None]:
from gensim.models import FastText


sentences = df['tweet_tokens'].tolist()

fasttext_model = FastText(sentences, vector_size=100, window=5, min_count=1, sg=1)
fasttext_model.save('fasttext_model.model')

def get_vector(tokens, model):
    word_vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(word_vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

X_fasttext = np.array([get_vector(tokens, fasttext_model) for tokens in df['tweet_tokens']])
X_users = df[feature_cols_users].values
X_empath = df[feature_cols_empath].values

X_fasttext_users = np.hstack([X_fasttext, X_users])
X_fasttext_empath = np.hstack([X_fasttext, X_empath])
X_fasttext_users_empath = np.hstack([X_fasttext, X_users, X_empath])

y = df['BinaryNumTarget'].astype(int)

X_train_ft_users, X_test_ft_users, y_train_ft_users, y_test_ft_users = train_test_split(X_fasttext_users, y, test_size=0.2, random_state=1)
X_train_ft_empath, X_test_ft_empath, y_train_ft_empath, y_test_ft_empath = train_test_split(X_fasttext_empath, y, test_size=0.2, random_state=1)
X_train_ft_users_empath, X_test_ft_users_empath, y_train_ft_users_empath, y_test_ft_users_empath = train_test_split(X_fasttext_users_empath, y, test_size=0.2, random_state=1)

models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "SVM": SVC(kernel='linear'),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": GaussianNB()
}

In [20]:

train_and_evaluate(X_train_ft_users, X_test_ft_users, y_train_ft_users, y_test_ft_users, "FastText + User ")




Rezultati za: FastText + User 
Decision Tree: Accuracy=0.8340, Precision=0.8433, Recall=0.8331, F1 Macro=0.8339, F1 Weighted=0.8340
Klasa 0: 0.83
Klasa 1: 0.838
Random Forest: Accuracy=0.9377, Precision=0.9392, Recall=0.9402, F1 Macro=0.9377, F1 Weighted=0.9377
Klasa 0: 0.936
Klasa 1: 0.94
KNN: Accuracy=0.8035, Precision=0.8188, Recall=0.7951, F1 Macro=0.8034, F1 Weighted=0.8035
Klasa 0: 0.8
Klasa 1: 0.807
SVM: Accuracy=0.7943, Precision=0.7927, Recall=0.8142, F1 Macro=0.7938, F1 Weighted=0.7941
Klasa 0: 0.784
Klasa 1: 0.803


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost: Accuracy=0.9395, Precision=0.9364, Recall=0.9470, F1 Macro=0.9394, F1 Weighted=0.9394
Klasa 0: 0.937
Klasa 1: 0.942
Logistic Regression: Accuracy=0.7911, Precision=0.7923, Recall=0.8066, F1 Macro=0.7907, F1 Weighted=0.7910
Klasa 0: 0.782
Klasa 1: 0.799
Naive Bayes: Accuracy=0.7507, Precision=0.7672, Recall=0.7419, F1 Macro=0.7506, F1 Weighted=0.7507
Klasa 0: 0.747
Klasa 1: 0.754


{'Decision Tree': (0.8340102090241812,
  0.8433479532163742,
  0.8330565383782222,
  0.8339004262910586,
  0.8340370978385259),
 'Random Forest': (0.9377398561794403,
  0.9391950375072129,
  0.9402122896960069,
  0.9376737623194482,
  0.9377387218454587),
 'KNN': (0.8034576549051753,
  0.8187834622248662,
  0.7950754567116759,
  0.8034004041900352,
  0.8035077805313133),
 'SVM': (0.7942546294571333,
  0.792688927943761,
  0.8142104123041375,
  0.7938180614833894,
  0.7941217148351676),
 'XGBoost': (0.9394537799470919,
  0.9363844066828502,
  0.9469997833778612,
  0.9393668909883337,
  0.9394403532083861),
 'Logistic Regression': (0.7910503371958717,
  0.7922547698418327,
  0.8065564300671528,
  0.7906929703606458,
  0.7909697765928001),
 'Naive Bayes': (0.7506613510190394,
  0.7671918166206227,
  0.7419308253303487,
  0.7506051214231352,
  0.7507249755742065)}

In [21]:

train_and_evaluate(X_train_ft_empath, X_test_ft_empath, y_train_ft_empath, y_test_ft_empath, "FastText + Empath")




Rezultati za: FastText + Empath
Decision Tree: Accuracy=0.8363, Precision=0.8442, Recall=0.8373, F1 Macro=0.8362, F1 Weighted=0.8363
Klasa 0: 0.832
Klasa 1: 0.841
Random Forest: Accuracy=0.9370, Precision=0.9368, Recall=0.9414, F1 Macro=0.9369, F1 Weighted=0.9370
Klasa 0: 0.935
Klasa 1: 0.939
KNN: Accuracy=0.9760, Precision=0.9763, Recall=0.9771, F1 Macro=0.9759, F1 Weighted=0.9760
Klasa 0: 0.975
Klasa 1: 0.977
SVM: Accuracy=0.7954, Precision=0.7922, Recall=0.8181, F1 Macro=0.7949, F1 Weighted=0.7952
Klasa 0: 0.785
Klasa 1: 0.805


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost: Accuracy=0.9408, Precision=0.9397, Recall=0.9460, F1 Macro=0.9408, F1 Weighted=0.9408
Klasa 0: 0.939
Klasa 1: 0.943
Logistic Regression: Accuracy=0.7911, Precision=0.7913, Recall=0.8085, F1 Macro=0.7907, F1 Weighted=0.7910
Klasa 0: 0.782
Klasa 1: 0.8
Naive Bayes: Accuracy=0.7485, Precision=0.7648, Recall=0.7402, F1 Macro=0.7484, F1 Weighted=0.7485
Klasa 0: 0.745
Klasa 1: 0.752


{'Decision Tree': (0.8363202801892768,
  0.8442050087361678,
  0.8373167737742797,
  0.8361937262916062,
  0.8363394498815512),
 'Random Forest': (0.9369946719326353,
  0.9368352974992814,
  0.941367607769514,
  0.9369195816260021,
  0.9369892388326954),
 'KNN': (0.9759678080405381,
  0.9763347763347764,
  0.9771102606686404,
  0.9759425292779862,
  0.975967488424),
 'SVM': (0.795409665039681,
  0.7921968955390855,
  0.818109610802224,
  0.7949195938428276,
  0.7952404560060113),
 'XGBoost': (0.9408323708036812,
  0.9397460727350979,
  0.9459888800635425,
  0.9407579904644181,
  0.9408251752293256),
 'Logistic Regression': (0.7911248556205522,
  0.79125150166066,
  0.8085060293161961,
  0.7907332652743315,
  0.791022994910338),
 'Naive Bayes': (0.7484630574909646,
  0.7647717099373321,
  0.740197848220088,
  0.7484031924687902,
  0.748527405159727)}

In [22]:

train_and_evaluate(X_train_ft_users_empath, X_test_ft_users_empath, y_train_ft_users_empath, y_test_ft_users_empath, "FastText + User + Empath")



Rezultati za: FastText + User + Empath
Decision Tree: Accuracy=0.8329, Precision=0.8417, Recall=0.8329, F1 Macro=0.8328, F1 Weighted=0.8330
Klasa 0: 0.828
Klasa 1: 0.837
Random Forest: Accuracy=0.9368, Precision=0.9369, Recall=0.9410, F1 Macro=0.9368, F1 Weighted=0.9368
Klasa 0: 0.935
Klasa 1: 0.939
KNN: Accuracy=0.8047, Precision=0.8202, Recall=0.7960, F1 Macro=0.8046, F1 Weighted=0.8047
Klasa 0: 0.801
Klasa 1: 0.808
SVM: Accuracy=0.7950, Precision=0.7937, Recall=0.8144, F1 Macro=0.7945, F1 Weighted=0.7948
Klasa 0: 0.785
Klasa 1: 0.804


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost: Accuracy=0.9406, Precision=0.9393, Recall=0.9460, F1 Macro=0.9405, F1 Weighted=0.9406
Klasa 0: 0.938
Klasa 1: 0.943
Logistic Regression: Accuracy=0.7913, Precision=0.7924, Recall=0.8070, F1 Macro=0.7910, F1 Weighted=0.7912
Klasa 0: 0.782
Klasa 1: 0.8
Naive Bayes: Accuracy=0.7517, Precision=0.7681, Recall=0.7430, F1 Macro=0.7516, F1 Weighted=0.7517
Klasa 0: 0.748
Klasa 1: 0.755


{'Decision Tree': (0.8329296918663139,
  0.8416636264137176,
  0.8329121236190339,
  0.8328110365973695,
  0.8329535888715279),
 'Random Forest': (0.9368456350832743,
  0.9368799424874191,
  0.941006570871543,
  0.9367713405312572,
  0.9368407090314559),
 'KNN': (0.8046872089124036,
  0.8201770701584703,
  0.7960141526464004,
  0.8046320426248359,
  0.8047371154430635),
 'SVM': (0.794962554491598,
  0.7936664320900774,
  0.8143548270633258,
  0.794538014623074,
  0.7948369324485347),
 'XGBoost': (0.9406088155296397,
  0.9393417939341794,
  0.9459888800635425,
  0.9405332078819613,
  0.9406010730537018),
 'Logistic Regression': (0.7913111516822534,
  0.7923993193420307,
  0.806989674344718,
  0.7909509996945101,
  0.7912287111464594),
 'Naive Bayes': (0.7516673497522263,
  0.7681397432069275,
  0.7430139360242617,
  0.7516105099415922,
  0.7517307695409385)}