In [1]:
import pandas as pd

# Load the pre-cleaned corpus from disk and preview its first rows.
# The preview shows `text` and `label` columns plus a leftover "Unnamed: 0"
# column (presumably an index written out when the CSV was saved -- TODO confirm).
data = pd.read_csv(filepath_or_buffer="../../data/cleaned_data.csv")
data.head(n=5)

Unnamed: 0,text,label
0,·ûì·û∂·ûô·û∑·ûÄ·û∂ ·ûò·ûá·üí·ûà·ûò·ûé·üí·ûå·ûõ ·ûü·û∑·ûë·üí·ûí·û∑ ·ûò·ûì·ûª·ûü·üí·ûü ·ûÄ·ûò·üí·ûñ·ûª·ûá·û∂ ·û¢·üí·ûì·ûÄ·ûü·üí·ûö...,neutral
1,·ûÄ·û∂·ûö·ûÉ·ûª·üÜ ·ûÄ·ûâ·üí·ûâ·û∂ ·ûü·üÅ·ûÑ ·ûí·û∂·ûö·û∏ ·ûÄ·û∂·ûì·üã·ûè·üÇ ·ûô·ûº·ûö ·ûö·ûä·üí·ûã·û∂·ûó·û∑·ûî·û∂·ûõ ·û†·üä...,positive
2,·ûî·üí·ûö·ûó·ûñ ·ûî·ûÑ·üí·û†·ûæ·ûî ·ûî·ûì·üí·ûë·ûî·üã ·ûÅ·üí·ûë·ûÑ·üã ·ûÖ·üÜ·ûé·û∂·ûô ·ûá·û∂·ûÑ ·ü°·ü†·ûò·üâ·û∫·ûì ·ûä·ûª·ûõ...,neutral
3,1956 ·ûî·û∂·ûì ·ûî·ûÑ·üí·û†·û∂·ûâ ·ûï·ûº·ûü·üí·ûú·üê·ûö ·ûî·û∂·ûì ·ûï·üí·ûë·üÅ·ûö ·ûä·ûæ·ûò ·ûî·üÇ·ûÄ ·û¢·û∂·ûö ...,neutral
4,·ûä·ûö·û∂·ûî·ûé·û∂ ·ûò·û∑·ûì ·ûî·û∂·ûì ·ûè·û∂·üÜ·ûÑ·ûÖ·û∑·ûè·üí·ûè ·ûÅ·û∑·ûè·ûÅ·üÜ ·ûî·üí·ûö·ûπ·ûÑ ·ûö·üÄ·ûì ·ûî·üí·ûö·ûπ·ûÑ...,negative


In [2]:
from sklearn.model_selection import train_test_split

# Model input is the cleaned text column; the target is the label column.
X, y = data["text"], data["label"]

# Hold out 20% of the rows for testing and train on the remaining 80%.
# Stratifying on y keeps the class distribution the same in both partitions,
# and the fixed random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier


# 2. Vectorize text
# The TF-IDF vocabulary and weights are fitted on the training split only; the
# test split is then transformed with that same fitted vectorizer, so nothing
# from the test set influences the features.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 3. Define models
# Every estimator that accepts it is given class_weight='balanced' to counter the
# class imbalance visible in the report supports. Every estimator that draws on
# randomness gets a fixed random_state so a re-run reproduces the same numbers.
models = {
    "MultinomialNB": MultinomialNB(),
    "LogisticRegression": LogisticRegression(max_iter=500, class_weight='balanced'),
    # random_state added: LinearSVC was the only stochastic model left unseeded.
    "LinearSVC": LinearSVC(class_weight='balanced', max_iter=1000, random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
    "DecisionTree": DecisionTreeClassifier(class_weight='balanced', random_state=42),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=100, random_state=42)
}

# 4. Train & evaluate
# Each model is fitted in place (the fitted estimators stay in `models` for later
# cells) and scored on the held-out test split.
for name, model in models.items():
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)

    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)  # rows = true class, columns = predicted class

    print(f"=== {name} ===")
    print("Accuracy:", acc)
    print("\nClassification Report:")
    # zero_division=0 reports 0.00 for a class that is never predicted (the same
    # value the default shows) without emitting UndefinedMetricWarning noise.
    print(classification_report(y_test, y_pred, zero_division=0))

    print("Confusion Matrix:")
    print(cm)

    print("-" * 50)



=== MultinomialNB ===
Accuracy: 0.616191904047976

Classification Report:
              precision    recall  f1-score   support

    negative       0.66      0.16      0.26       651
     neutral       0.00      0.00      0.00       184
    positive       0.61      0.97      0.75      1166

    accuracy                           0.62      2001
   macro avg       0.43      0.38      0.34      2001
weighted avg       0.57      0.62      0.52      2001

Confusion Matrix:
[[ 105    1  545]
 [  15    0  169]
 [  38    0 1128]]
--------------------------------------------------
=== LogisticRegression ===
Accuracy: 0.5812093953023488

Classification Report:
              precision    recall  f1-score   support

    negative       0.54      0.56      0.55       651
     neutral       0.21      0.44      0.29       184
    positive       0.76      0.62      0.68      1166

    accuracy                           0.58      2001
   macro avg       0.50      0.54      0.51      2001
weighted avg   

In [4]:
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

# Collect one summary row per fitted model so they can be ranked side by side.
results = []

for name, model in models.items():
    # The models were already fitted in the previous cell; only predict here.
    y_pred = model.predict(X_test_vec)
    acc = accuracy_score(y_test, y_pred)
    macro_f1 = f1_score(y_test, y_pred, average="macro")
    results.append(dict(model=name, accuracy=acc, macro_f1=macro_f1))

# Rank by macro F1 (every class counts equally), best model first.
df_results = pd.DataFrame(results)
print(df_results.sort_values(by="macro_f1", ascending=False))


                model  accuracy  macro_f1
1  LogisticRegression  0.581209  0.505332
2           LinearSVC  0.602199  0.500015
4        DecisionTree  0.525237  0.434113
3        RandomForest  0.631684  0.426798
5    GradientBoosting  0.615692  0.352832
0       MultinomialNB  0.616192  0.336527


In [5]:
# Select the row with the highest macro F1: idxmax() returns that row's index
# label, which .loc then uses to pull the full row as a Series.
best_idx = df_results["macro_f1"].idxmax()
best_overall = df_results.loc[best_idx]

print("üèÜ Best model overall", best_overall, sep="\n")


üèÜ Best model overall
model       LogisticRegression
accuracy              0.581209
macro_f1              0.505332
Name: 1, dtype: object


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier


# 2. Vectorize text (TF-IDF + n-gram)
# Same pipeline as the earlier experiment, but with a richer, pruned vocabulary.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),   # use both unigrams and bigrams as features
    min_df=2,             # drop terms that occur in fewer than 2 documents
    max_df=0.9            # drop terms that occur in more than 90% of documents
)

# Fit on the training split only, then reuse the fitted vectorizer for the test
# split so nothing from the test set influences the features.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


# 3. Define models
# Fresh, unfitted estimators with the same settings as the earlier experiment, so
# the only thing that differs between the two runs is the feature extraction.
# Every estimator that draws on randomness gets a fixed random_state so a re-run
# reproduces the same numbers.
models = {
    "MultinomialNB": MultinomialNB(),
    "LogisticRegression": LogisticRegression(max_iter=500, class_weight='balanced'),
    # random_state added: LinearSVC was the only stochastic model left unseeded.
    "LinearSVC": LinearSVC(class_weight='balanced', max_iter=1000, random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
    "DecisionTree": DecisionTreeClassifier(class_weight='balanced', random_state=42),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=100, random_state=42)
}

# 4. Train & evaluate
# Each model is fitted in place (the fitted estimators stay in `models` for later
# cells) and scored on the held-out test split.
for name, model in models.items():
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)

    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)  # rows = true class, columns = predicted class

    print(f"=== {name} ===")
    print("Accuracy:", acc)
    print("\nClassification Report:")
    # zero_division=0 reports 0.00 for a class that is never predicted (the same
    # value the default shows) without emitting UndefinedMetricWarning noise.
    print(classification_report(y_test, y_pred, zero_division=0))

    print("Confusion Matrix:")
    print(cm)

    print("-" * 50)


=== MultinomialNB ===
Accuracy: 0.6291854072963519

Classification Report:
              precision    recall  f1-score   support

    negative       0.75      0.18      0.30       651
     neutral       0.00      0.00      0.00       184
    positive       0.62      0.98      0.76      1166

    accuracy                           0.63      2001
   macro avg       0.46      0.39      0.35      2001
weighted avg       0.60      0.63      0.54      2001

Confusion Matrix:
[[ 120    0  531]
 [  13    0  171]
 [  27    0 1139]]
--------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


=== LogisticRegression ===
Accuracy: 0.6226886556721639

Classification Report:
              precision    recall  f1-score   support

    negative       0.55      0.59      0.57       651
     neutral       0.27      0.41      0.33       184
    positive       0.77      0.67      0.72      1166

    accuracy                           0.62      2001
   macro avg       0.53      0.56      0.54      2001
weighted avg       0.65      0.62      0.63      2001

Confusion Matrix:
[[383  90 178]
 [ 46  76  62]
 [264 115 787]]
--------------------------------------------------
=== LinearSVC ===
Accuracy: 0.6301849075462269

Classification Report:
              precision    recall  f1-score   support

    negative       0.54      0.51      0.53       651
     neutral       0.30      0.27      0.28       184
    positive       0.72      0.75      0.74      1166

    accuracy                           0.63      2001
   macro avg       0.52      0.51      0.52      2001
weighted avg       0.62    

In [7]:
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

# Collect one summary row per model fitted on the n-gram TF-IDF features.
results = []

for name, model in models.items():
    # The models were already fitted in the previous cell; only predict here.
    y_pred = model.predict(X_test_vec)
    acc = accuracy_score(y_test, y_pred)
    macro_f1 = f1_score(y_test, y_pred, average="macro")
    results.append(dict(model=name, accuracy=acc, macro_f1=macro_f1))

# Rank by macro F1 (every class counts equally), best model first.
df_results = pd.DataFrame(results)
print(df_results.sort_values(by="macro_f1", ascending=False))


                model  accuracy  macro_f1
1  LogisticRegression  0.622689  0.538187
2           LinearSVC  0.630185  0.515438
3        RandomForest  0.641679  0.441019
4        DecisionTree  0.521739  0.432811
5    GradientBoosting  0.623188  0.360277
0       MultinomialNB  0.629185  0.351166


In [8]:
# Select the row with the highest macro F1 from the n-gram run: idxmax() returns
# that row's index label, which .loc then uses to pull the full row as a Series.
best_idx = df_results["macro_f1"].idxmax()
best_overall = df_results.loc[best_idx]

print("üèÜ Best model overall", best_overall, sep="\n")


üèÜ Best model overall
model       LogisticRegression
accuracy              0.622689
macro_f1              0.538187
Name: 1, dtype: object
