In [10]:
import pandas as pd

data = pd.read_csv("../../data/cleaned_data.csv")
data.head()

Unnamed: 0,text,label
0,·ûì·û∂·ûô·û∑·ûÄ·û∂ ·ûò·ûá·üí·ûà·ûò·ûé·üí·ûå·ûõ ·ûü·û∑·ûë·üí·ûí·û∑ ·ûò·ûì·ûª·ûü·üí·ûü ·ûÄ·ûò·üí·ûñ·ûª·ûá·û∂ ·û¢·üí·ûì·ûÄ·ûü·üí·ûö...,neutral
1,·ûÄ·û∂·ûö·ûÉ·ûª·üÜ ·ûÄ·ûâ·üí·ûâ·û∂ ·ûü·üÅ·ûÑ ·ûí·û∂·ûö·û∏ ·ûÄ·û∂·ûì·üã·ûè·üÇ ·ûô·ûº·ûö ·ûö·ûä·üí·ûã·û∂·ûó·û∑·ûî·û∂·ûõ ·û†·üä...,positive
2,·ûî·üí·ûö·ûó·ûñ ·ûî·ûÑ·üí·û†·ûæ·ûî ·ûî·ûì·üí·ûë·ûî·üã ·ûÅ·üí·ûë·ûÑ·üã ·ûÖ·üÜ·ûé·û∂·ûô ·ûá·û∂·ûÑ ·ü°·ü†·ûò·üâ·û∫·ûì ·ûä·ûª·ûõ...,neutral
3,1956 ·ûî·û∂·ûì ·ûî·ûÑ·üí·û†·û∂·ûâ ·ûï·ûº·ûü·üí·ûú·üê·ûö ·ûî·û∂·ûì ·ûï·üí·ûë·üÅ·ûö ·ûä·ûæ·ûò ·ûî·üÇ·ûÄ ·û¢·û∂·ûö ...,neutral
4,·ûä·ûö·û∂·ûî·ûé·û∂ ·ûò·û∑·ûì ·ûî·û∂·ûì ·ûè·û∂·üÜ·ûÑ·ûÖ·û∑·ûè·üí·ûè ·ûÅ·û∑·ûè·ûÅ·üÜ ·ûî·üí·ûö·ûπ·ûÑ ·ûö·üÄ·ûì ·ûî·üí·ûö·ûπ·ûÑ...,negative


In [11]:
from sklearn.model_selection import train_test_split

# Features and labels
X = data['text']  # cleaned text
y = data['label']      # labels

# Split dataset: 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y  # stratify keeps class distribution
)


In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier


# 2. Vectorize text using BoW
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 3. Define models
models = {
    "MultinomialNB": MultinomialNB(),
    "LogisticRegression": LogisticRegression(max_iter=500, class_weight='balanced'),
    "LinearSVC": LinearSVC(class_weight='balanced', max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
    "DecisionTree": DecisionTreeClassifier(class_weight='balanced', random_state=42),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=100, random_state=42)
}

# 4. Train & evaluate
for name, model in models.items():
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)

    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    print(f"=== {name} ===")
    print("Accuracy:", acc)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print("Confusion Matrix:")
    print(cm)

    print("-" * 50)


=== MultinomialNB ===
Accuracy: 0.6431784107946027

Classification Report:
              precision    recall  f1-score   support

    negative       0.56      0.49      0.52       651
     neutral       0.46      0.06      0.11       184
    positive       0.68      0.82      0.74      1166

    accuracy                           0.64      2001
   macro avg       0.57      0.46      0.46      2001
weighted avg       0.62      0.64      0.61      2001

Confusion Matrix:
[[318   5 328]
 [ 49  11 124]
 [200   8 958]]
--------------------------------------------------
=== LogisticRegression ===
Accuracy: 0.567216391804098

Classification Report:
              precision    recall  f1-score   support

    negative       0.52      0.54      0.53       651
     neutral       0.22      0.47      0.30       184
    positive       0.76      0.60      0.67      1166

    accuracy                           0.57      2001
   macro avg       0.50      0.54      0.50      2001
weighted avg       0.63 

In [13]:
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

results = []

for name, model in models.items():
    y_pred = model.predict(X_test_vec)

    acc = accuracy_score(y_test, y_pred)
    macro_f1 = f1_score(y_test, y_pred, average="macro")

    results.append({
        "model": name,
        "accuracy": acc,
        "macro_f1": macro_f1
    })

df_results = pd.DataFrame(results)
print(df_results.sort_values("macro_f1", ascending=False))


                model  accuracy  macro_f1
1  LogisticRegression  0.567216  0.497819
2           LinearSVC  0.582209  0.480756
0       MultinomialNB  0.643178  0.457242
3        RandomForest  0.629185  0.442789
4        DecisionTree  0.532234  0.439783
5    GradientBoosting  0.612194  0.330540


In [14]:
best_overall = df_results.loc[df_results["macro_f1"].idxmax()]

print("üèÜ Best model overall")
print(best_overall)


üèÜ Best model overall
model       LogisticRegression
accuracy              0.567216
macro_f1              0.497819
Name: 1, dtype: object


In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier


# 2. Vectorize text using BoW + n-grams
vectorizer = CountVectorizer(
    ngram_range=(1, 2),   # unigram + bigram
    min_df=2,             # ignore very rare words
    max_df=0.9            # ignore very common words
)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 3. Define models
models = {
    "MultinomialNB": MultinomialNB(),
    "LogisticRegression": LogisticRegression(max_iter=500, class_weight='balanced'),
    "LinearSVC": LinearSVC(class_weight='balanced', max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
    "DecisionTree": DecisionTreeClassifier(class_weight='balanced', random_state=42),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=100, random_state=42)
}

# 4. Train & evaluate
# 5. Train & evaluate
for name, model in models.items():
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)

    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    print(f"=== {name} ===")
    print("Accuracy:", acc)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print("Confusion Matrix:")
    print(cm)

    print("-" * 50)


=== MultinomialNB ===
Accuracy: 0.656671664167916

Classification Report:
              precision    recall  f1-score   support

    negative       0.58      0.55      0.56       651
     neutral       0.56      0.05      0.10       184
    positive       0.69      0.81      0.75      1166

    accuracy                           0.66      2001
   macro avg       0.61      0.47      0.47      2001
weighted avg       0.64      0.66      0.63      2001

Confusion Matrix:
[[355   5 291]
 [ 48  10 126]
 [214   3 949]]
--------------------------------------------------
=== LogisticRegression ===
Accuracy: 0.5987006496751625

Classification Report:
              precision    recall  f1-score   support

    negative       0.52      0.55      0.53       651
     neutral       0.26      0.39      0.31       184
    positive       0.74      0.66      0.70      1166

    accuracy                           0.60      2001
   macro avg       0.51      0.53      0.51      2001
weighted avg       0.63 

In [16]:
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

results = []

for name, model in models.items():
    y_pred = model.predict(X_test_vec)

    acc = accuracy_score(y_test, y_pred)
    macro_f1 = f1_score(y_test, y_pred, average="macro")

    results.append({
        "model": name,
        "accuracy": acc,
        "macro_f1": macro_f1
    })

df_results = pd.DataFrame(results)
print(df_results.sort_values("macro_f1", ascending=False))


                model  accuracy  macro_f1
1  LogisticRegression  0.598701  0.514242
2           LinearSVC  0.601699  0.498400
0       MultinomialNB  0.656672  0.469517
3        RandomForest  0.641179  0.462361
4        DecisionTree  0.537231  0.444169
5    GradientBoosting  0.616192  0.336439


In [17]:
best_overall = df_results.loc[df_results["macro_f1"].idxmax()]

print("üèÜ Best model overall")
print(best_overall)


üèÜ Best model overall
model       LogisticRegression
accuracy              0.598701
macro_f1              0.514242
Name: 1, dtype: object
