In [1]:
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split 
from sklearn import metrics 
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
import ast
from gensim.models import Word2Vec


In [2]:
df = pd.read_csv("final_merged.csv")

In [3]:
# Grupa 1: Pozitivne emocije
positive_emotions = [
    'cheerfulness', 'joy', 'contentment', 'love', 'warmth',
    'positive_emotion', 'fun', 'giving', 'friends'
]

# Grupa 2: Negativne emocije
negative_emotions = [
    'sadness', 'disgust', 'suffering', 'negative_emotion',
    'weakness', 'neglect'
]

# Grupa 3: Socijalne emocije
social_emotions = [
    'pride', 'shame', 'politeness', 'affection', 'leader',
    'dominant_personality', 'childish', 'trust', 'sympathy'
]

# Grupa 4: Intenzivne emocije
intense_emotions = [
    'surprise', 'rage', 'horror', 'fear', 'exasperation',
    'nervousness', 'irritability', 'torment', 'pain', 'hate', 'anger'
]

# Grupa 5: Kognitivno-emotivne emocije
cognitive_emotions = [
    'anticipation', 'confusion', 'envy', 'disappointment',
    'optimism', 'zest', 'achievement'
]

# Kreiranje novih kolona kao zbir postojećih
df['emotion_positive'] = df[[f'empath_result.{x}' for x in positive_emotions]].sum(axis=1)
df['emotion_negative'] = df[[f'empath_result.{x}' for x in negative_emotions]].sum(axis=1)
df['emotion_social']   = df[[f'empath_result.{x}' for x in social_emotions]].sum(axis=1)
df['emotion_intense']  = df[[f'empath_result.{x}' for x in intense_emotions]].sum(axis=1)
df['emotion_cognitive'] = df[[f'empath_result.{x}' for x in cognitive_emotions]].sum(axis=1)


In [4]:
feature_cols_users = ['followers_count', 'favourites_count','friends_count',
       'statuses_count', 'listed_count', 'cred','BotScore',
       'normalize_influence']

In [5]:
feature_cols_empath = ['emotion_positive', 'emotion_negative',
       'emotion_social', 'emotion_intense', 'emotion_cognitive']

In [6]:

df['tweet_tokens'] = df['tweet_tokens'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# TF-IDF + User/Empath Features

In [7]:
from scipy.sparse import hstack


vectorizer = TfidfVectorizer(max_features=5000)
X_text = vectorizer.fit_transform(df['tweet_new_x'])

X_users = df[feature_cols_users].values
X_empath = df[feature_cols_empath].values
X_users_empath = df[feature_cols_users + feature_cols_empath].values

X_tfidf_users = hstack([X_text, X_users])
X_tfidf_empath = hstack([X_text, X_empath])
X_tfidf_users_empath = hstack([X_text, X_users_empath])

y = df['BinaryNumTarget'].astype(int)

X_train_users, X_test_users, y_train_users, y_test_users = train_test_split(X_tfidf_users, y, test_size=0.2, random_state=1)
X_train_empath, X_test_empath, y_train_empath, y_test_empath = train_test_split(X_tfidf_empath, y, test_size=0.2, random_state=1)
X_train_users_empath, X_test_users_empath, y_train_users_empath, y_test_users_empath = train_test_split(X_tfidf_users_empath, y, test_size=0.2, random_state=1)

models = {
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Decision Tree": DecisionTreeClassifier(),
    "Naive Bayes": MultinomialNB(),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "SVM": SVC(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "Logistic Regression": LogisticRegression(max_iter=1000)
}

def train_and_evaluate(X_train, X_test, y_train, y_test, title):
    print(f"\nRezultati za: {title}")
  
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(name)
        report = classification_report(y_test, y_pred, digits=3)
        print(report)
  




In [8]:

results_users = train_and_evaluate(X_train_users, X_test_users, y_train_users, y_test_users, "TF-IDF + User features")



Rezultati za: TF-IDF + User features
Random Forest
              precision    recall  f1-score   support

           0      0.972     0.973     0.973     12990
           1      0.975     0.974     0.974     13849

    accuracy                          0.973     26839
   macro avg      0.973     0.973     0.973     26839
weighted avg      0.973     0.973     0.973     26839

Decision Tree
              precision    recall  f1-score   support

           0      0.958     0.959     0.959     12990
           1      0.962     0.961     0.961     13849

    accuracy                          0.960     26839
   macro avg      0.960     0.960     0.960     26839
weighted avg      0.960     0.960     0.960     26839

Naive Bayes
              precision    recall  f1-score   support

           0      0.940     0.926     0.933     12990
           1      0.932     0.944     0.938     13849

    accuracy                          0.936     26839
   macro avg      0.936     0.935     0.936     26

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost
              precision    recall  f1-score   support

           0      0.977     0.945     0.961     12990
           1      0.950     0.979     0.965     13849

    accuracy                          0.963     26839
   macro avg      0.964     0.962     0.963     26839
weighted avg      0.963     0.963     0.963     26839

Logistic Regression
              precision    recall  f1-score   support

           0      0.977     0.971     0.974     12990
           1      0.973     0.979     0.976     13849

    accuracy                          0.975     26839
   macro avg      0.975     0.975     0.975     26839
weighted avg      0.975     0.975     0.975     26839



In [9]:

results_empath = train_and_evaluate(X_train_empath, X_test_empath, y_train_empath, y_test_empath, "TF-IDF + Empath features")



Rezultati za: TF-IDF + Empath features
Random Forest
              precision    recall  f1-score   support

           0      0.975     0.980     0.978     12990
           1      0.981     0.977     0.979     13849

    accuracy                          0.978     26839
   macro avg      0.978     0.978     0.978     26839
weighted avg      0.978     0.978     0.978     26839

Decision Tree
              precision    recall  f1-score   support

           0      0.968     0.964     0.966     12990
           1      0.967     0.970     0.968     13849

    accuracy                          0.967     26839
   macro avg      0.967     0.967     0.967     26839
weighted avg      0.967     0.967     0.967     26839

Naive Bayes
              precision    recall  f1-score   support

           0      0.953     0.933     0.943     12990
           1      0.939     0.957     0.948     13849

    accuracy                          0.945     26839
   macro avg      0.946     0.945     0.945     

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost
              precision    recall  f1-score   support

           0      0.976     0.945     0.961     12990
           1      0.950     0.978     0.964     13849

    accuracy                          0.962     26839
   macro avg      0.963     0.962     0.962     26839
weighted avg      0.963     0.962     0.962     26839

Logistic Regression
              precision    recall  f1-score   support

           0      0.976     0.971     0.973     12990
           1      0.973     0.978     0.975     13849

    accuracy                          0.974     26839
   macro avg      0.974     0.974     0.974     26839
weighted avg      0.974     0.974     0.974     26839



In [10]:

results_users_empath = train_and_evaluate(X_train_users_empath, X_test_users_empath, y_train_users_empath, y_test_users_empath, "TF-IDF + User + Empath features")


Rezultati za: TF-IDF + User + Empath features
Random Forest
              precision    recall  f1-score   support

           0      0.970     0.972     0.971     12990
           1      0.973     0.972     0.973     13849

    accuracy                          0.972     26839
   macro avg      0.972     0.972     0.972     26839
weighted avg      0.972     0.972     0.972     26839

Decision Tree
              precision    recall  f1-score   support

           0      0.960     0.958     0.959     12990
           1      0.960     0.962     0.961     13849

    accuracy                          0.960     26839
   macro avg      0.960     0.960     0.960     26839
weighted avg      0.960     0.960     0.960     26839

Naive Bayes
              precision    recall  f1-score   support

           0      0.940     0.926     0.933     12990
           1      0.932     0.944     0.938     13849

    accuracy                          0.936     26839
   macro avg      0.936     0.935     0.9

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost
              precision    recall  f1-score   support

           0      0.976     0.943     0.959     12990
           1      0.949     0.978     0.963     13849

    accuracy                          0.961     26839
   macro avg      0.962     0.961     0.961     26839
weighted avg      0.962     0.961     0.961     26839

Logistic Regression
              precision    recall  f1-score   support

           0      0.977     0.971     0.974     12990
           1      0.973     0.979     0.976     13849

    accuracy                          0.975     26839
   macro avg      0.975     0.975     0.975     26839
weighted avg      0.975     0.975     0.975     26839



# Word2Vec + User/Empath

In [11]:
sentences = df['tweet_tokens'].tolist()

w2v_model = Word2Vec(sentences, vector_size=300, window=10, min_count=5, workers=4, sg=1, negative=10)

def vectorize_tweet(tokens):
    vectors = [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]
    if len(vectors) == 0:
        return np.zeros(w2v_model.vector_size)
    else:
        return np.mean(vectors, axis=0)

X_w2v = np.array([vectorize_tweet(tokens) for tokens in df['tweet_tokens']])
X_users = df[feature_cols_users].values
X_empath = df[feature_cols_empath].values

X_w2v_users = np.hstack([X_w2v, X_users])
X_w2v_empath = np.hstack([X_w2v, X_empath])
X_w2v_users_empath = np.hstack([X_w2v, X_users, X_empath])

y = df['BinaryNumTarget'].astype(int)

X_train_w2v_users, X_test_w2v_users, y_train_w2v_users, y_test_w2v_users = train_test_split(X_w2v_users, y, test_size=0.2, random_state=1)
X_train_w2v_empath, X_test_w2v_empath, y_train_w2v_empath, y_test_w2v_empath = train_test_split(X_w2v_empath, y, test_size=0.2, random_state=1)
X_train_w2v_users_empath, X_test_w2v_users_empath, y_train_w2v_users_empath, y_test_w2v_users_empath = train_test_split(X_w2v_users_empath, y, test_size=0.2, random_state=1)

models = {
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Decision Tree": DecisionTreeClassifier(),
    "Naive Bayes": GaussianNB(), 
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "SVM": SVC(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "Logistic Regression": LogisticRegression(max_iter=1000)
}


In [12]:
train_and_evaluate(X_train_w2v_users, X_test_w2v_users, y_train_w2v_users, y_test_w2v_users, "Word2Vec + User")



Rezultati za: Word2Vec + User
Random Forest
              precision    recall  f1-score   support

           0      0.960     0.949     0.954     12990
           1      0.952     0.963     0.958     13849

    accuracy                          0.956     26839
   macro avg      0.956     0.956     0.956     26839
weighted avg      0.956     0.956     0.956     26839

Decision Tree
              precision    recall  f1-score   support

           0      0.842     0.845     0.843     12990
           1      0.854     0.851     0.853     13849

    accuracy                          0.848     26839
   macro avg      0.848     0.848     0.848     26839
weighted avg      0.848     0.848     0.848     26839

Naive Bayes
              precision    recall  f1-score   support

           0      0.768     0.801     0.784     12990
           1      0.805     0.773     0.789     13849

    accuracy                          0.786     26839
   macro avg      0.787     0.787     0.786     26839
wei

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost
              precision    recall  f1-score   support

           0      0.968     0.963     0.965     12990
           1      0.965     0.970     0.968     13849

    accuracy                          0.967     26839
   macro avg      0.967     0.966     0.966     26839
weighted avg      0.967     0.967     0.967     26839

Logistic Regression
              precision    recall  f1-score   support

           0      0.909     0.903     0.906     12990
           1      0.910     0.915     0.912     13849

    accuracy                          0.909     26839
   macro avg      0.909     0.909     0.909     26839
weighted avg      0.909     0.909     0.909     26839



In [13]:

train_and_evaluate(X_train_w2v_empath, X_test_w2v_empath, y_train_w2v_empath, y_test_w2v_empath, "Word2Vec + Empath")



Rezultati za: Word2Vec + Empath
Random Forest
              precision    recall  f1-score   support

           0      0.959     0.948     0.954     12990
           1      0.952     0.962     0.957     13849

    accuracy                          0.955     26839
   macro avg      0.956     0.955     0.955     26839
weighted avg      0.955     0.955     0.955     26839

Decision Tree
              precision    recall  f1-score   support

           0      0.844     0.846     0.845     12990
           1      0.855     0.854     0.854     13849

    accuracy                          0.850     26839
   macro avg      0.850     0.850     0.850     26839
weighted avg      0.850     0.850     0.850     26839

Naive Bayes
              precision    recall  f1-score   support

           0      0.768     0.796     0.782     12990
           1      0.802     0.774     0.788     13849

    accuracy                          0.785     26839
   macro avg      0.785     0.785     0.785     26839
w

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost
              precision    recall  f1-score   support

           0      0.969     0.964     0.966     12990
           1      0.966     0.971     0.969     13849

    accuracy                          0.967     26839
   macro avg      0.967     0.967     0.967     26839
weighted avg      0.967     0.967     0.967     26839

Logistic Regression
              precision    recall  f1-score   support

           0      0.908     0.905     0.906     12990
           1      0.911     0.914     0.912     13849

    accuracy                          0.909     26839
   macro avg      0.909     0.909     0.909     26839
weighted avg      0.909     0.909     0.909     26839



In [14]:

train_and_evaluate(X_train_w2v_users_empath, X_test_w2v_users_empath, y_train_w2v_users_empath, y_test_w2v_users_empath, "Word2Vec + User + Empath")


Rezultati za: Word2Vec + User + Empath
Random Forest
              precision    recall  f1-score   support

           0      0.961     0.949     0.955     12990
           1      0.953     0.964     0.958     13849

    accuracy                          0.957     26839
   macro avg      0.957     0.957     0.957     26839
weighted avg      0.957     0.957     0.957     26839

Decision Tree
              precision    recall  f1-score   support

           0      0.844     0.847     0.846     12990
           1      0.856     0.853     0.855     13849

    accuracy                          0.850     26839
   macro avg      0.850     0.850     0.850     26839
weighted avg      0.850     0.850     0.850     26839

Naive Bayes
              precision    recall  f1-score   support

           0      0.768     0.801     0.784     12990
           1      0.805     0.773     0.789     13849

    accuracy                          0.787     26839
   macro avg      0.787     0.787     0.787     

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost
              precision    recall  f1-score   support

           0      0.969     0.963     0.966     12990
           1      0.965     0.971     0.968     13849

    accuracy                          0.967     26839
   macro avg      0.967     0.967     0.967     26839
weighted avg      0.967     0.967     0.967     26839

Logistic Regression
              precision    recall  f1-score   support

           0      0.909     0.904     0.906     12990
           1      0.910     0.915     0.912     13849

    accuracy                          0.909     26839
   macro avg      0.909     0.909     0.909     26839
weighted avg      0.909     0.909     0.909     26839



# FastText + User/Empath

In [15]:
from gensim.models import FastText


sentences = df['tweet_tokens'].tolist()

fasttext_model = FastText(sentences, vector_size=100, window=5, min_count=1, sg=1)
fasttext_model.save('fasttext_model.model')

def get_vector(tokens, model):
    word_vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(word_vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

X_fasttext = np.array([get_vector(tokens, fasttext_model) for tokens in df['tweet_tokens']])
X_users = df[feature_cols_users].values
X_empath = df[feature_cols_empath].values

X_fasttext_users = np.hstack([X_fasttext, X_users])
X_fasttext_empath = np.hstack([X_fasttext, X_empath])
X_fasttext_users_empath = np.hstack([X_fasttext, X_users, X_empath])

y = df['BinaryNumTarget'].astype(int)

X_train_ft_users, X_test_ft_users, y_train_ft_users, y_test_ft_users = train_test_split(X_fasttext_users, y, test_size=0.2, random_state=1)
X_train_ft_empath, X_test_ft_empath, y_train_ft_empath, y_test_ft_empath = train_test_split(X_fasttext_empath, y, test_size=0.2, random_state=1)
X_train_ft_users_empath, X_test_ft_users_empath, y_train_ft_users_empath, y_test_ft_users_empath = train_test_split(X_fasttext_users_empath, y, test_size=0.2, random_state=1)

models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "SVM": SVC(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": GaussianNB()
}

In [16]:

train_and_evaluate(X_train_ft_users, X_test_ft_users, y_train_ft_users, y_test_ft_users, "FastText + User ")




Rezultati za: FastText + User 
Decision Tree
              precision    recall  f1-score   support

           0      0.819     0.830     0.824     12990
           1      0.838     0.828     0.833     13849

    accuracy                          0.829     26839
   macro avg      0.828     0.829     0.829     26839
weighted avg      0.829     0.829     0.829     26839

Random Forest
              precision    recall  f1-score   support

           0      0.941     0.930     0.936     12990
           1      0.935     0.946     0.940     13849

    accuracy                          0.938     26839
   macro avg      0.938     0.938     0.938     26839
weighted avg      0.938     0.938     0.938     26839

KNN
              precision    recall  f1-score   support

           0      0.790     0.815     0.802     12990
           1      0.821     0.797     0.809     13849

    accuracy                          0.806     26839
   macro avg      0.806     0.806     0.806     26839
weighted a

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost
              precision    recall  f1-score   support

           0      0.946     0.940     0.943     12990
           1      0.944     0.950     0.947     13849

    accuracy                          0.945     26839
   macro avg      0.945     0.945     0.945     26839
weighted avg      0.945     0.945     0.945     26839

Logistic Regression
              precision    recall  f1-score   support

           0      0.787     0.766     0.776     12990
           1      0.786     0.806     0.796     13849

    accuracy                          0.786     26839
   macro avg      0.786     0.786     0.786     26839
weighted avg      0.786     0.786     0.786     26839

Naive Bayes
              precision    recall  f1-score   support

           0      0.729     0.753     0.741     12990
           1      0.761     0.737     0.749     13849

    accuracy                          0.745     26839
   macro avg      0.745     0.745     0.745     26839
weighted avg      0.745     0.745 

In [17]:

train_and_evaluate(X_train_ft_empath, X_test_ft_empath, y_train_ft_empath, y_test_ft_empath, "FastText + Empath")




Rezultati za: FastText + Empath
Decision Tree
              precision    recall  f1-score   support

           0      0.821     0.833     0.827     12990
           1      0.841     0.829     0.835     13849

    accuracy                          0.831     26839
   macro avg      0.831     0.831     0.831     26839
weighted avg      0.831     0.831     0.831     26839

Random Forest
              precision    recall  f1-score   support

           0      0.941     0.933     0.937     12990
           1      0.938     0.945     0.941     13849

    accuracy                          0.939     26839
   macro avg      0.939     0.939     0.939     26839
weighted avg      0.939     0.939     0.939     26839

KNN
              precision    recall  f1-score   support

           0      0.975     0.975     0.975     12990
           1      0.976     0.976     0.976     13849

    accuracy                          0.975     26839
   macro avg      0.975     0.975     0.975     26839
weighted 

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost
              precision    recall  f1-score   support

           0      0.947     0.937     0.942     12990
           1      0.942     0.951     0.946     13849

    accuracy                          0.944     26839
   macro avg      0.944     0.944     0.944     26839
weighted avg      0.944     0.944     0.944     26839

Logistic Regression
              precision    recall  f1-score   support

           0      0.788     0.765     0.776     12990
           1      0.785     0.807     0.796     13849

    accuracy                          0.787     26839
   macro avg      0.787     0.786     0.786     26839
weighted avg      0.787     0.787     0.787     26839

Naive Bayes
              precision    recall  f1-score   support

           0      0.727     0.749     0.738     12990
           1      0.758     0.736     0.747     13849

    accuracy                          0.743     26839
   macro avg      0.743     0.743     0.743     26839
weighted avg      0.743     0.743 

In [18]:

train_and_evaluate(X_train_ft_users_empath, X_test_ft_users_empath, y_train_ft_users_empath, y_test_ft_users_empath, "FastText + User + Empath")



Rezultati za: FastText + User + Empath
Decision Tree
              precision    recall  f1-score   support

           0      0.818     0.828     0.823     12990
           1      0.837     0.827     0.832     13849

    accuracy                          0.828     26839
   macro avg      0.827     0.828     0.827     26839
weighted avg      0.828     0.828     0.828     26839

Random Forest
              precision    recall  f1-score   support

           0      0.940     0.932     0.936     12990
           1      0.937     0.944     0.940     13849

    accuracy                          0.938     26839
   macro avg      0.938     0.938     0.938     26839
weighted avg      0.938     0.938     0.938     26839

KNN
              precision    recall  f1-score   support

           0      0.790     0.815     0.803     12990
           1      0.821     0.797     0.809     13849

    accuracy                          0.806     26839
   macro avg      0.806     0.806     0.806     26839
we

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost
              precision    recall  f1-score   support

           0      0.943     0.942     0.942     12990
           1      0.945     0.947     0.946     13849

    accuracy                          0.944     26839
   macro avg      0.944     0.944     0.944     26839
weighted avg      0.944     0.944     0.944     26839

Logistic Regression
              precision    recall  f1-score   support

           0      0.786     0.767     0.776     12990
           1      0.786     0.805     0.795     13849

    accuracy                          0.786     26839
   macro avg      0.786     0.786     0.786     26839
weighted avg      0.786     0.786     0.786     26839

Naive Bayes
              precision    recall  f1-score   support

           0      0.731     0.753     0.742     12990
           1      0.762     0.739     0.750     13849

    accuracy                          0.746     26839
   macro avg      0.746     0.746     0.746     26839
weighted avg      0.747     0.746 