In [1]:
import os
import time

import gensim
import kagglehub
import numpy as np
import pandas as pd
import spacy
from gensim.models import KeyedVectors
from nltk import download
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy.stats import loguniform
from sklearn.feature_extraction.text import CountVectorizer  # Import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2  # Import SelectKBest and chi2
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.svm import SVC


In [2]:
# Fetch the latest version of the GloVe dataset through kagglehub. The value
# returned by dataset_download is kept in `path` and is used further down to
# locate the vectors file.
path = kagglehub.dataset_download("thanakomsn/glove6b300dtxt")

print(f"Path to dataset files: {path}")

Path to dataset files: C:\Users\urbi1\.cache\kagglehub\datasets\thanakomsn\glove6b300dtxt\versions\1


In [3]:
# Raw training split. Note that `data` is rebound further down when the cached,
# already-preprocessed CSV is loaded from data/temp.
train_csv = '../Model Implementation/data/train_t5.csv'
data = pd.read_csv(train_csv)

In [4]:
# NLTK resources used by the preprocessing step:
#   punkt     -> tokenization
#   stopwords -> stop-word list
for resource in ('punkt', 'stopwords'):
    download(resource)

# SpaCy English pipeline (used for lemmatization)
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\urbi1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\urbi1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Lazily-built cache of the English stop-word set, so the NLTK corpus is read
# once instead of once per preprocessed row.
_STOP_WORDS = None


def _get_stop_words():
    """Return the cached English stop-word set, building it on first use."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalize one raw text into a space-separated string of lemmas.

    Steps: lowercase -> NLTK ``word_tokenize`` -> keep only alphabetic tokens
    that are not English stop words -> lemmatize with the SpaCy ``nlp``
    pipeline -> join the lemmas with single spaces.

    Parameters
    ----------
    text : str
        Raw input text. Any non-string value (e.g. ``None`` or the float
        ``NaN`` pandas uses for missing cells) is treated as empty text.

    Returns
    -------
    str
        The processed text; ``""`` when the input is not a string.
    """
    # Missing cells arrive as NaN/None when this is used with Series.apply;
    # they have no .lower(), so map them to an empty result instead of crashing.
    if not isinstance(text, str):
        return ""

    # lowercase
    text = text.lower()

    tokens = word_tokenize(text)

    # remove stopwords and anything that is not purely alphabetic
    stop_words = _get_stop_words()
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]

    # lemmatization (using SpaCy)
    doc = nlp(" ".join(tokens))
    lemmatized_tokens = [token.lemma_ for token in doc]

    # return to string
    return " ".join(lemmatized_tokens)

In [18]:
#data = pd.read_csv('../Model Implementation/data/train_t5.csv')
#data['processed_text'] = data['text'].apply(preprocess_text)

In [19]:
#test_data = pd.read_csv('../Model Implementation/data/test_t5.csv')
#test_data['processed_text'] = test_data['text'].apply(preprocess_text)

In [20]:
#data.to_csv('../Model Implementation/data/temp/train.csv')
#test_data.to_csv('../Model Implementation/data/temp/test.csv')

In [7]:
# Load the cached train/test frames from data/temp (the commented-out cells
# above show how these files were written after preprocessing).
data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Model Implementation/data/temp/train.csv',
        '../Model Implementation/data/temp/test.csv',
    )
)

# APPLYING GLOVE WORD-LEVEL EMBEDDINGS (parsed from the raw text file below; gensim KeyedVectors is only a commented-out alternative)

In [11]:
# Load GloVe embeddings
def load_glove_model(glove_file):
    """Parse a GloVe plain-text vector file into a ``{word: vector}`` dict.

    Each non-blank line is split on whitespace; the first field is the word
    and the remaining fields are converted to floats and stored as a NumPy
    array. Blank or whitespace-only lines are skipped.

    Parameters
    ----------
    glove_file : str or path-like
        Path to the text file, opened as UTF-8.

    Returns
    -------
    dict
        Mapping from word to its ``numpy.ndarray`` embedding.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    print("Loading GloVe model...")
    glove_model = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            split_line = line.split()
            if not split_line:
                # Blank line (e.g. trailing newline at end of file): there is
                # no word field, so indexing [0] would raise IndexError.
                continue
            word, *values = split_line
            glove_model[word] = np.array([float(val) for val in values])
    print("GloVe model loaded.")
    return glove_model

# Load GloVe embeddings from the raw GloVe text file inside the downloaded
# dataset directory (load_glove_model parses the plain "word v1 v2 ..." lines,
# so no Word2Vec conversion is involved).
# os.path.join replaces the string concatenation with '\glove...': '\g' is an
# invalid escape sequence and a hard-coded backslash only works on Windows.
glove_file = os.path.join(path, 'glove.6B.300d.txt')  # Adjust the file name to your GloVe file
glove_model = load_glove_model(glove_file)

# Alternatively, for Word2Vec pre-trained model
# word2vec_model = KeyedVectors.load_word2vec_format('path/to/GoogleNews-vectors-negative300.bin', binary=True)


Loading GloVe model...


GloVe model loaded.


In [23]:
#full_data.to_csv('../Model Implementation/data/temp/full.csv')

In [14]:
def _embedding_dim(glove_model, default=300):
    """Best-effort lookup of the vector length stored in ``glove_model``."""
    # Objects exposing `vector_size` (as gensim's KeyedVectors does) report it directly.
    size = getattr(glove_model, 'vector_size', None)
    if size is not None:
        return size
    try:
        # Plain dict: peek at the first stored vector.
        return len(next(iter(glove_model.values())))
    except (AttributeError, StopIteration, TypeError):
        # Not dict-like, or empty: fall back to the default.
        return default


def get_sentence_vector(sentence, glove_model, dim=None):
    """Embed a sentence as the mean of its known word vectors.

    The sentence is tokenized with NLTK ``word_tokenize``; tokens present in
    ``glove_model`` contribute their vector and the rest are ignored.

    Parameters
    ----------
    sentence : str
        Text to embed. Non-string values (e.g. the ``NaN`` that pandas yields
        for an empty CSV cell) are treated as containing no words.
    glove_model : mapping
        Supports ``word in glove_model`` and ``glove_model[word]``.
    dim : int, optional
        Length of the zero vector returned when no token is found. When
        omitted it is inferred from ``glove_model``, falling back to 300.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the matched vectors, or ``np.zeros(dim)`` when
        nothing matched.
    """
    if not isinstance(sentence, str):
        # Missing text: nothing to tokenize, return the zero vector.
        return np.zeros(_embedding_dim(glove_model) if dim is None else dim)

    # Collect vectors for in-vocabulary tokens only
    word_vectors = [
        glove_model[word]
        for word in word_tokenize(sentence)
        if word in glove_model
    ]

    if not word_vectors:  # Handle cases with no valid words
        return np.zeros(_embedding_dim(glove_model) if dim is None else dim)

    # Average the word vectors to create a fixed-length representation
    return np.mean(word_vectors, axis=0)

# Attach one averaged GloVe vector per row to both the train and test frames.
for frame in (data, test_data):
    frame['sentence_vector'] = frame['processed_text'].apply(
        get_sentence_vector, args=(glove_model,)
    )


In [16]:
# Turn the per-row sentence embeddings into NumPy feature arrays:
# X_train for model training, X_test for evaluation.
X_train, X_test = (
    np.array(frame['sentence_vector'].tolist()) for frame in (data, test_data)
)

# Targets (assuming the labels are in a column named 'label')
y_train, y_test = (frame['label'].values for frame in (data, test_data))


In [31]:
# Standardize features: statistics are fitted on the training matrix only and
# then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)


In [32]:
from sklearn.utils.class_weight import compute_class_weight

# Per-class weights in 'balanced' mode, keyed by class label
label_set = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=label_set, y=y_train)
class_weight_dict = {label: weight for label, weight in zip(label_set, class_weights)}

# RBF-kernel SVM trained with those class weights
model = SVC(kernel='rbf', class_weight=class_weight_dict)
model.fit(X_train, y_train)  # y_train holds the target labels

In [33]:
# Score the fitted model on the test split
predi = model.predict(X_test)
test_accuracy = accuracy_score(y_test, predi)
print(f"Accuracy: {test_accuracy}")

print("Classification Report:", classification_report(y_test, predi), sep="\n")

Accuracy: 0.643866960856419
Classification Report:
              precision    recall  f1-score   support

       anger       0.65      0.61      0.63      7417
        fear       0.45      0.61      0.52      3014
         joy       0.79      0.68      0.73     10865
        love       0.30      0.21      0.25       122
     sadness       0.51      0.67      0.58      2804
    surprise       0.36      0.42      0.38       252

    accuracy                           0.64     24474
   macro avg       0.51      0.53      0.52     24474
weighted avg       0.67      0.64      0.65     24474

