In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import spacy

In [2]:
##Function get dataframe from text file
def load_txt_data(filepath):
    
    texts = []
    labels = []
    labels_numeric =[]
    
    with open(filepath, "r", encoding="utf-8") as file:
        lines = file.readlines()
        
        
        for line in lines[1:]:  
            parts = line.strip().split("\t")  
            
            if len(parts) >= 2:
                tweet = " ".join(parts[1:-2])  
                label = parts[-1]  
                texts.append(tweet)
                
                
                sentiment_mapping = {
                    "0: neutral or mixed emotional state can be inferred": 'neutral',
                    "1: slightly positive emotional state can be inferred": 'neutral',
                    "2: moderately positive emotional state can be inferred": 'positive',
                    "3: very positive emotional state can be inferred": 'positive',
                    "-1: slightly negative emotional state can be inferred": 'neutral',
                    "-2: moderately negative emotional state can be inferred": 'negative',
                    "-3: very negative emotional state can be inferred": 'negative'
                }
                
                
                labels.append(sentiment_mapping.get(label, np.nan))
    
    df = pd.DataFrame({"Tweet": texts, "Sentiment": labels})
    
    df.dropna(inplace=True)
    df["Sentiment"] = df["Sentiment"].astype(str)
    
    return df

In [5]:
# Loading training data
train_data = load_txt_data("train.txt")
train_data.shape
print(len(train_data['Tweet']))

1181


In [6]:
tweet = train_data['Tweet'][0]

In [7]:
tweet

'@liamch88 yeah! :) playing well '

In [8]:
mytext=train_data['Tweet'][0]

In [9]:
print(mytext)

@liamch88 yeah! :) playing well 


In [10]:
train_data[:5]

Unnamed: 0,Tweet,Sentiment
0,@liamch88 yeah! :) playing well,neutral
1,At least I don't have a guy trying to discoura...,neutral
2,UPLIFT: If you're still discouraged it means y...,neutral
3,"...at your age, the heyday in the blood is tam...",neutral
4,i was so embarrassed when she saw us i was lik...,negative


In [11]:
train_data['Sentiment'].value_counts()

Sentiment
neutral     586
negative    378
positive    217
Name: count, dtype: int64

In [12]:
#Loading test datasets
test_data = load_txt_data("test.txt") 
test_data.shape

(937, 2)

### TASK 1: Extract Unigram Features ###

In [28]:
### TASK 1: Extract Unigram Features ###

X_train, y_train = train_data["Tweet"], train_data["Sentiment"]
X_test, y_test = test_data["Tweet"], test_data["Sentiment"]

vectorizer = CountVectorizer(ngram_range=(1, 1))  # Unigrams
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)


print(f"Number of features in training set: {X_train_features.shape[1]}")
print(f"Number of features in test set: {X_test_features.shape[1]}")

Number of features in training set: 5038
Number of features in test set: 5038


In [31]:
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
def multinomialNB_model(X_train_features, y_train, X_test_features, y_test):
    nb_classifier = MultinomialNB()
    nb_classifier.fit(X_train_features, y_train)


    y_pred = nb_classifier.predict(X_test_features)


    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy: {accuracy:.4f}")


    class_report = classification_report(y_test, y_pred, digits=4, zero_division=0)

    print("Classification Report:\n", class_report)
    
    micro_precision = precision_score(y_test, y_pred, average='micro')
    micro_recall = recall_score(y_test, y_pred, average='micro')
    micro_f1 = f1_score(y_test, y_pred, average='micro')

    
    print(f"Micro Precision: {micro_precision:.4f}")
    print(f"Micro Recall: {micro_recall:.4f}")
    print(f"Micro F1-Score: {micro_f1:.4f}")

In [32]:
multinomialNB_model(X_train_features, y_train, X_test_features, y_test)

Model Accuracy: 0.5571
Classification Report:
               precision    recall  f1-score   support

    negative     0.5825    0.4346    0.4978       260
     neutral     0.5361    0.8263    0.6503       449
    positive     0.7451    0.1667    0.2724       228

    accuracy                         0.5571       937
   macro avg     0.6212    0.4759    0.4735       937
weighted avg     0.5998    0.5571    0.5160       937

Micro Precision: 0.5571
Micro Recall: 0.5571
Micro F1-Score: 0.5571


### Task 2 (feature selection 1): a. Remove features with low variance 0.001 ###

In [33]:
from sklearn.feature_selection import VarianceThreshold

In [34]:
def variance_removal(threshold):
    print("Threshold: ", threshold)
    selector = VarianceThreshold(threshold = threshold)

    X_train_features_filtered_var_thr = selector.fit(X_train_features).transform(X_train_features)
    print ("Train feature space before filtering: ", X_train_features.shape)
    print ("Train feature space after filtering: ", X_train_features_filtered_var_thr.shape)

    X_test_features_filtered_var_thr = selector.transform(X_test_features)
    print ("Test feature space before filtering: ", X_test_features.shape)
    print ("Test feature space after filtering: ", X_test_features_filtered_var_thr.shape)
    
    return X_train_features_filtered_var_thr, X_test_features_filtered_var_thr

In [35]:
### a. Task 2 (feature selection 1): Remove features with low variance 0.001 ###
X_train_features_filtered_var_thr_one, X_test_features_filtered_var_thr_one = variance_removal(0.001)


Threshold:  0.001
Train feature space before filtering:  (1181, 5038)
Train feature space after filtering:  (1181, 1596)
Test feature space before filtering:  (937, 5038)
Test feature space after filtering:  (937, 1596)


In [36]:
multinomialNB_model(X_train_features_filtered_var_thr_one, y_train, X_test_features_filtered_var_thr_one, y_test)

Model Accuracy: 0.5603
Classification Report:
               precision    recall  f1-score   support

    negative     0.5184    0.4885    0.5030       260
     neutral     0.5587    0.7105    0.6255       449
    positive     0.6529    0.3465    0.4527       228

    accuracy                         0.5603       937
   macro avg     0.5766    0.5151    0.5271       937
weighted avg     0.5704    0.5603    0.5495       937

Micro Precision: 0.5603
Micro Recall: 0.5603
Micro F1-Score: 0.5603


### Task 2 (feature selection 1): b. Remove features with low variance 0.005 ###

In [37]:
### b. Task 2 (feature selection 1): Remove features with low variance 0.005 ###
X_train_features_filtered_var_thr_five, X_test_features_filtered_var_thr_five = variance_removal(0.005)

Threshold:  0.005
Train feature space before filtering:  (1181, 5038)
Train feature space after filtering:  (1181, 490)
Test feature space before filtering:  (937, 5038)
Test feature space after filtering:  (937, 490)


In [38]:
multinomialNB_model(X_train_features_filtered_var_thr_five, y_train, X_test_features_filtered_var_thr_five, y_test)

Model Accuracy: 0.5390
Classification Report:
               precision    recall  f1-score   support

    negative     0.4773    0.4846    0.4809       260
     neutral     0.5493    0.6325    0.5880       449
    positive     0.6090    0.4167    0.4948       228

    accuracy                         0.5390       937
   macro avg     0.5452    0.5113    0.5212       937
weighted avg     0.5438    0.5390    0.5356       937

Micro Precision: 0.5390
Micro Recall: 0.5390
Micro F1-Score: 0.5390


### Task 3 (feature selection 2): Select top k-best features using information gain (mutual information) k=1000 ###

In [39]:
### Task 3 (feature selection 2): Select top k-best features using information gain (mutual information) k=1000 ###

from sklearn.feature_selection import SelectKBest, chi2

def mutual_information_k(k):
    selector = SelectKBest(chi2, k=k)
    X_train_features_filtered_kbest = selector.fit_transform(X_train_features, y_train)
    print ("Train feature space before filtering: ", X_train_features.shape)
    print ("Train feature space after filtering: ", X_train_features_filtered_kbest.shape)

    X_test_features_filtered_kbest = selector.transform(X_test_features)
    print ("Test feature space before filtering: ", X_test_features.shape)
    print ("Test feature space after filtering: ", X_test_features_filtered_kbest.shape)
    
    return X_train_features_filtered_kbest, X_test_features_filtered_kbest

In [40]:
X_train_features_filtered_kbest_one, X_test_features_filtered_kbest_one = mutual_information_k(1000)

Train feature space before filtering:  (1181, 5038)
Train feature space after filtering:  (1181, 1000)
Test feature space before filtering:  (937, 5038)
Test feature space after filtering:  (937, 1000)


In [41]:
multinomialNB_model(X_train_features_filtered_kbest_one, y_train, X_test_features_filtered_kbest_one, y_test)

Model Accuracy: 0.5229
Classification Report:
               precision    recall  f1-score   support

    negative     0.5305    0.4346    0.4778       260
     neutral     0.5169    0.7840    0.6230       449
    positive     0.5814    0.1096    0.1845       228

    accuracy                         0.5229       937
   macro avg     0.5429    0.4427    0.4284       937
weighted avg     0.5364    0.5229    0.4760       937

Micro Precision: 0.5229
Micro Recall: 0.5229
Micro F1-Score: 0.5229


### Task 3 (feature selection 2): Select top k-best features using information gain (mutual information) k=2000 ###

In [42]:
### Task 3 (feature selection 2): Select top k-best features using information gain (mutual information) k=2000 ###
X_train_features_filtered_kbest_two, X_test_features_filtered_kbest_two = mutual_information_k(2000)

Train feature space before filtering:  (1181, 5038)
Train feature space after filtering:  (1181, 2000)
Test feature space before filtering:  (937, 5038)
Test feature space after filtering:  (937, 2000)


In [43]:
multinomialNB_model(X_train_features_filtered_kbest_two, y_train, X_test_features_filtered_kbest_two, y_test)

Model Accuracy: 0.5486
Classification Report:
               precision    recall  f1-score   support

    negative     0.6331    0.3385    0.4411       260
     neutral     0.5253    0.8797    0.6578       449
    positive     0.6739    0.1360    0.2263       228

    accuracy                         0.5486       937
   macro avg     0.6108    0.4514    0.4417       937
weighted avg     0.5914    0.5486    0.4927       937

Micro Precision: 0.5486
Micro Recall: 0.5486
Micro F1-Score: 0.5486


### Task 4 (feature selection 3): Lexicon-based feature selection ###

In [44]:
### Task 4 (feature selection 3): Lexicon-based feature selection ###
# Load positive and negative words from Hu & Liu lexicon
def load_lexicon(filepath):
    with open(filepath, "r", encoding="utf-8") as file:
        words = set(word.strip() for word in file.readlines())
    return words

# Update these paths with the actual file locations
positive_words = load_lexicon("positive_words_list.txt")
negative_words = load_lexicon("negative_words_list.txt")

In [45]:
def lexicon_filter(text):
    tokens = nltk.word_tokenize(text.lower())  # Tokenize and lowercase
    return " ".join([word for word in tokens if word in positive_words or word in negative_words])

train_data["filtered_text"] = train_data["Tweet"].apply(lexicon_filter)
test_data["filtered_text"] = test_data["Tweet"].apply(lexicon_filter)

print("Example before filtering:", train_data["Tweet"].iloc[0])
print("Example after filtering:", train_data["filtered_text"].iloc[0])
print(train_data.shape)

Example before filtering: @liamch88 yeah! :) playing well 
Example after filtering: well
(1181, 3)


In [46]:
X_train_lexi, y_train = train_data["filtered_text"], train_data["Sentiment"]
X_test_lexi, y_test = test_data["filtered_text"], test_data["Sentiment"]

vectorizer = CountVectorizer(ngram_range=(1, 1))  # Unigrams
X_train_features_lexi = vectorizer.fit_transform(X_train_lexi)
X_test_features_lexi = vectorizer.transform(X_test_lexi)

print ("Train feature space : ", X_train_features_lexi.shape)

print ("Test feature space : ", X_test_features_lexi.shape)

Train feature space :  (1181, 678)
Test feature space :  (937, 678)


In [47]:
multinomialNB_model(X_train_features_lexi, y_train, X_test_features_lexi, y_test)

Model Accuracy: 0.5656
Classification Report:
               precision    recall  f1-score   support

    negative     0.6307    0.4269    0.5092       260
     neutral     0.5413    0.7884    0.6419       449
    positive     0.6075    0.2851    0.3881       228

    accuracy                         0.5656       937
   macro avg     0.5931    0.5001    0.5130       937
weighted avg     0.5822    0.5656    0.5433       937

Micro Precision: 0.5656
Micro Recall: 0.5656
Micro F1-Score: 0.5656


### Task 6: Extract and select best unigrams ###

In [48]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

vectorizer = CountVectorizer(ngram_range=(1,1))
X_train_unigrams = vectorizer.fit_transform(train_data["Tweet"])
X_test_unigrams = vectorizer.transform(test_data["Tweet"])
mutual_info = mutual_info_classif(X_train_unigrams, y_train, discrete_features=True)

#top 2000 unigrams
selector = SelectKBest(score_func=mutual_info_classif, k=2000)
X_train_selected = selector.fit_transform(X_train_unigrams, y_train)

### Task 7: Train and evaluate a Naive Bayes classifier using cross-validation ###
### Task 8: Train a linear SVM classifier ###
### Task 9: Train a logistic regression classifier ###

In [49]:
import math
import numpy as np
from scipy import stats
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

# 5-fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

metrics_nb = {'accuracy': [], 'precision': [], 'recall': [], 'f1': []}
metrics_svm = {'accuracy': [], 'precision': [], 'recall': [], 'f1': []}
metrics_lr = {'accuracy': [], 'precision': [], 'recall': [], 'f1': []}

i = 0

for train_index, test_index in skf.split(X_train_selected, y_train):
    X_train, X_test = X_train_selected[train_index], X_train_selected[test_index]
    Y_train, Y_test = y_train[train_index], y_train[test_index]

    #### Task 7: Naive Bayes ####
    nb_model = MultinomialNB()
    nb_model.fit(X_train, Y_train)
    preds_nb = nb_model.predict(X_test)

    metrics_nb['accuracy'].append(accuracy_score(Y_test, preds_nb))
    metrics_nb['precision'].append(precision_score(Y_test, preds_nb, average="weighted"))
    metrics_nb['recall'].append(recall_score(Y_test, preds_nb, average="weighted"))
    metrics_nb['f1'].append(f1_score(Y_test, preds_nb, average="weighted"))

    #### Task 8: Linear SVM ####
    svm_model = LinearSVC()
    svm_model.fit(X_train, Y_train)
    preds_svm = svm_model.predict(X_test)

    metrics_svm['accuracy'].append(accuracy_score(Y_test, preds_svm))
    metrics_svm['precision'].append(precision_score(Y_test, preds_svm, average="weighted"))
    metrics_svm['recall'].append(recall_score(Y_test, preds_svm, average="weighted"))
    metrics_svm['f1'].append(f1_score(Y_test, preds_svm, average="weighted"))

    #### Task 9: Logistic Regression ####
    lr_model = LogisticRegression()
    lr_model.fit(X_train, Y_train)
    preds_lr = lr_model.predict(X_test)

    metrics_lr['accuracy'].append(accuracy_score(Y_test, preds_lr))
    metrics_lr['precision'].append(precision_score(Y_test, preds_lr, average="weighted"))
    metrics_lr['recall'].append(recall_score(Y_test, preds_lr, average="weighted"))
    metrics_lr['f1'].append(f1_score(Y_test, preds_lr, average="weighted"))

    print(f"Fold {i+1} Results:")
    print(f"  Naive Bayes - Accuracy: {metrics_nb['accuracy'][-1]:.4f}, Weighted Precision: {metrics_nb['precision'][-1]:.4f}, Weighted Recall: {metrics_nb['recall'][-1]:.4f}, Weighted F1: {metrics_nb['f1'][-1]:.4f}")
    print(f"  SVM - Accuracy: {metrics_svm['accuracy'][-1]:.4f}, Weighted Precision: {metrics_svm['precision'][-1]:.4f}, Weighted Recall: {metrics_svm['recall'][-1]:.4f}, Weighted F1: {metrics_svm['f1'][-1]:.4f}")
    print(f"  Logistic Regression - Accuracy: {metrics_lr['accuracy'][-1]:.4f}, Weighted Precision: {metrics_lr['precision'][-1]:.4f}, Weighted Recall: {metrics_lr['recall'][-1]:.4f}, Weighted F1: {metrics_lr['f1'][-1]:.4f}\n")
#     print(f"  Naive Bayes - F1: {metrics_nb['f1'][-1]:.4f}")
#     print(f"  SVM - F1: {metrics_svm['f1'][-1]:.4f}")
#     print(f"  Logistic Regression - F1: {metrics_lr['f1'][-1]:.4f}\n")
    
    i += 1

def confidence_interval(scores):
    mean_acc = np.mean(scores)
    std_err = stats.sem(scores)  
    z_score = 1.959 
    margin_of_error = z_score * std_err
    return mean_acc - margin_of_error, mean_acc + margin_of_error

ci_nb = confidence_interval(metrics_nb['accuracy'])
ci_svm = confidence_interval(metrics_svm['accuracy'])
ci_lr = confidence_interval(metrics_lr['accuracy'])

print(f"Naive Bayes Accuracy CI: {ci_nb}")
print(f"SVM Accuracy CI: {ci_svm}")
print(f"Logistic Regression Accuracy CI: {ci_lr}")


def report_results(model_name, metrics):
    print(f"\n{model_name} Results:")
    print(f"  Mean Accuracy: {np.mean(metrics['accuracy']):.4f}")
    print(f"  Mean Precision: {np.mean(metrics['precision']):.4f}")
    print(f"  Mean Recall: {np.mean(metrics['recall']):.4f}")
    print(f"  Mean F1-score: {np.mean(metrics['f1']):.4f}")

report_results("Naive Bayes", metrics_nb)
report_results("SVM", metrics_svm)
report_results("Logistic Regression", metrics_lr)


Fold 1 Results:
  Naive Bayes - Accuracy: 0.6582, Weighted Precision: 0.7064, Weighted Recall: 0.6582, Weighted F1: 0.6301
  SVM - Accuracy: 0.6751, Weighted Precision: 0.6849, Weighted Recall: 0.6751, Weighted F1: 0.6638
  Logistic Regression - Accuracy: 0.6414, Weighted Precision: 0.6553, Weighted Recall: 0.6414, Weighted F1: 0.6215

Fold 2 Results:
  Naive Bayes - Accuracy: 0.7373, Weighted Precision: 0.7826, Weighted Recall: 0.7373, Weighted F1: 0.7165
  SVM - Accuracy: 0.6483, Weighted Precision: 0.6490, Weighted Recall: 0.6483, Weighted F1: 0.6401
  Logistic Regression - Accuracy: 0.6398, Weighted Precision: 0.6468, Weighted Recall: 0.6398, Weighted F1: 0.6289

Fold 3 Results:
  Naive Bayes - Accuracy: 0.6653, Weighted Precision: 0.7261, Weighted Recall: 0.6653, Weighted F1: 0.6353
  SVM - Accuracy: 0.6695, Weighted Precision: 0.6840, Weighted Recall: 0.6695, Weighted F1: 0.6612
  Logistic Regression - Accuracy: 0.6356, Weighted Precision: 0.6505, Weighted Recall: 0.6356, Weighte

### Task 10: Model comparison using paired t-test

In [50]:
##Task 10: Model comparison using paired t-test
from scipy import stats

svm_nb_ttest = stats.ttest_ind(metrics_svm['f1'], metrics_nb['f1'])
nb_lr_ttest = stats.ttest_ind(metrics_nb['f1'],metrics_lr['f1'])
svm_lr_ttest = stats.ttest_ind(metrics_svm['f1'],metrics_lr['f1'])
print("LinearSVC vs. Naive Bayes t-test result: ", svm_nb_ttest)
print("Naive Bayes vs. Logistic Regression t-test result: ", svm_lr_ttest)
print("LinearSVC vs. Logistic Regression t-test result: ", nb_lr_ttest) 

LinearSVC vs. Naive Bayes t-test result:  TtestResult(statistic=-0.19455101877313374, pvalue=0.8505941842536587, df=8.0)
Naive Bayes vs. Logistic Regression t-test result:  TtestResult(statistic=2.1193282956991033, pvalue=0.06689509685880549, df=8.0)
LinearSVC vs. Logistic Regression t-test result:  TtestResult(statistic=1.3720799611337335, pvalue=0.20727756354838042, df=8.0)


### Task 11

In [51]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report
from sklearn.datasets import make_classification

svm = LinearSVC(dual=False, max_iter=5000)

param_grid = {'C': [0.01, 0.1, 1, 10, 100, 1000]}
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='f1_weighted')
grid_search.fit(X_train_features, y_train)
best_C = grid_search.best_params_['C']
print(f'Best regularization parameter (C): {best_C}')

best_svm = LinearSVC(C=best_C, dual=False, max_iter=5000)
best_svm.fit(X_train_features, y_train)

y_pred = best_svm.predict(X_test_features)

report = classification_report(y_test, y_pred, output_dict=True)
weighted_precision = report['weighted avg']['precision']
weighted_recall = report['weighted avg']['recall']
weighted_f1 = report['weighted avg']['f1-score']

print(f'Weighted Precision: {weighted_precision:.4f}')
print(f'Weighted Recall: {weighted_recall:.4f}')
print(f'Weighted F1 Score: {weighted_f1:.4f}')


Best regularization parameter (C): 1
Weighted Precision: 0.5689
Weighted Recall: 0.5550
Weighted F1 Score: 0.5511
