In [30]:
import pandas as pd

In [2]:
# Read the training and test CSV files into DataFrames.
data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('t5_lemized.csv', 't5_lemized_test.csv')
)

In [3]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [4]:
import gensim
import numpy as np

# Word vectors stored in word2vec *text* format (hence binary=False).
model_path = 'wiki-news-300d-1M.vec'
model = gensim.models.KeyedVectors.load_word2vec_format(
    model_path,
    binary=False,
)

In [5]:
def get_word_vector(word, model):
    """Look up the embedding of ``word`` in ``model``.

    Returns ``model[word]``, or ``None`` when the lookup raises ``KeyError``
    (i.e. the word is not present in the model).
    """
    try:
        embedding = model[word]
    except KeyError:
        # Unknown word: signal "no vector" instead of propagating the error.
        return None
    else:
        return embedding

In [6]:
def get_sentence_vector(sentence, model):
    """Average the word vectors of a whitespace-tokenised sentence.

    Parameters
    ----------
    sentence : str, None or NaN
        Text to embed. Missing values (``None`` or a float NaN, which is what
        ``pandas.read_csv`` yields for an empty cell) are treated as empty text.
    model
        Embedding lookup passed through to ``get_word_vector``; must expose
        ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Mean of the vectors of all words found in ``model``, or a zero vector
        of length ``model.vector_size`` when there is no text or no known word.
    """
    # Missing text would otherwise crash on `.split()`; handle it like a
    # sentence without any known word.
    if sentence is None or (isinstance(sentence, float) and np.isnan(sentence)):
        return np.zeros(model.vector_size)

    looked_up = (get_word_vector(token, model) for token in sentence.split())
    word_vectors = [vec for vec in looked_up if vec is not None]

    if not word_vectors:
        return np.zeros(model.vector_size)

    return np.mean(word_vectors, axis=0)

In [7]:
# Preview the first rows of the training frame.
data.head()

Unnamed: 0.1,Unnamed: 0,text,label,processed_text
0,0,Does anyone use their own computer question as...,joy,anyone use computer question remote software e...
1,1,WFH is getting to be...ehhhh. thats not even a...,joy,wfh get ehhhh that s even downside good part s...
2,2,everything is awesome …Are annual employee sat...,joy,everything awesome annual employee satisfactio...
3,3,Remote Workers Beware: US Entrepreneur Warns ...,fear,remote worker beware we entrepreneur warn work...
4,4,Teams & Slack Users: Please just ask the quest...,sadness,team slack user please ask question stop leave...


In [8]:
# Embed every processed post; `model` is forwarded to the helper as its second argument.
data['sentence_vector'] = data['processed_text'].apply(get_sentence_vector, args=(model,))
test_data['sentence_vector'] = test_data['processed_text'].apply(get_sentence_vector, args=(model,))

In [10]:
# Persist the frames with their new `sentence_vector` column.
# index=False: without it every write/read round trip adds another
# "Unnamed: 0"-style column (the input already carries two of them).
# NOTE: `sentence_vector` holds NumPy arrays, which CSV stores as their text
# representation; use a binary format if the vectors must be reloaded as arrays.
data.to_csv('fasttext_t5_train.csv', index=False)
test_data.to_csv('fasttext_t5_test.csv', index=False)

In [11]:
# Stack the per-post embeddings into feature matrices (one row per post).
X_train = np.array(list(data['sentence_vector']))
X_test = np.array(list(test_data['sentence_vector']))

# Matching label arrays for training and evaluation.
y_train = data['label'].values
y_test = test_data['label'].values

In [12]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics on the training set only, then apply the same
# transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [13]:
from sklearn.utils.class_weight import compute_class_weight
from sklearn.svm import SVC

# "balanced" weights: rarer labels receive proportionally larger weights.
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train,
)
# Map each label to its weight so it can be passed to the classifier.
class_weight_dict = {
    label: weight for label, weight in zip(np.unique(y_train), class_weights)
}

print(class_weight_dict)

{'anger': 0.5499713486365322, 'fear': 1.3535755765720923, 'joy': 0.3753997653176265, 'love': 33.503080082135526, 'sadness': 1.4545778728715342, 'surprise': 16.186507936507937}


## Nonlinear SVM (RBF kernel)

In [15]:
# RBF-kernel SVM using the per-class weights computed above.
svm_model = SVC(
    class_weight=class_weight_dict,
    kernel='rbf',
)
svm_model.fit(X=X_train, y=y_train)

In [16]:
# Predict labels for the scaled test features with the baseline SVM.
predi = svm_model.predict(X_test)

In [17]:
from sklearn.metrics import classification_report, accuracy_score
# Overall accuracy followed by the per-class precision / recall / F1 table.
print("Accuracy:", accuracy_score(y_test, predi))

print("Classification Report:", classification_report(y_test, predi), sep="\n")

Accuracy: 0.6663397891640108
Classification Report:
              precision    recall  f1-score   support

       anger       0.67      0.63      0.65      7417
        fear       0.47      0.64      0.54      3014
         joy       0.81      0.70      0.75     10865
        love       0.37      0.21      0.27       122
     sadness       0.53      0.69      0.60      2804
    surprise       0.39      0.45      0.42       252

    accuracy                           0.67     24474
   macro avg       0.54      0.56      0.54     24474
weighted avg       0.69      0.67      0.67     24474



## Hyperparameter tuning

In [25]:
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV

In [26]:
# Search space: C is sampled log-uniformly, gamma is one of sklearn's two heuristics.
param_grid = {
    'C': loguniform(0.01, 100),
    'gamma': ['scale', 'auto']
}

# Deliberately tiny budget (2 candidates x 2 folds = 4 fits).
# random_state pins the sampled candidates and the CV procedure so that
# re-running the notebook reproduces the same best parameters.
optr_search = RandomizedSearchCV(
    estimator=svm_model,
    param_distributions=param_grid,
    cv=2,
    n_iter=2,
    n_jobs=-1,
    verbose=True,
    random_state=42
)

optr_search.fit(X_train, y_train)

Fitting 2 folds for each of 2 candidates, totalling 4 fits


In [27]:
# Parameter setting that scored best during the randomized search.
optr_search.best_params_

{'C': 10.999382679501826, 'gamma': 'auto'}

In [33]:
# Keep a handle on the estimator selected by the search; it is pickled further below.
best_model_rbf = optr_search.best_estimator_

In [28]:
# Evaluate the tuned estimator on the held-out test features.
optr_predictions = optr_search.best_estimator_.predict(X_test)
print("Accuracy:", accuracy_score(y_test, optr_predictions))

print("Classification Report:", classification_report(y_test, optr_predictions), sep="\n")

Accuracy: 0.6921631118738253
Classification Report:
              precision    recall  f1-score   support

       anger       0.64      0.70      0.67      7417
        fear       0.57      0.50      0.53      3014
         joy       0.77      0.78      0.78     10865
        love       0.70      0.11      0.20       122
     sadness       0.65      0.58      0.62      2804
    surprise       0.67      0.34      0.45       252

    accuracy                           0.69     24474
   macro avg       0.67      0.50      0.54     24474
weighted avg       0.69      0.69      0.69     24474



# Save the model as a pickle file

In [32]:
import pickle

def save_model(model, filename):
    """Pickle ``model`` to ``filename`` without risking an existing file.

    The object is first written to ``<filename>.tmp`` and only moved onto
    ``filename`` once pickling has succeeded, so a failed dump never truncates
    or corrupts a previously saved model.

    Parameters
    ----------
    model
        Any picklable object.
    filename
        Destination path of the pickle file.

    Returns
    -------
    None
        Success or failure is reported via ``print``; errors are not raised.
    """
    import os

    tmp_path = f"{filename}.tmp"
    try:
        with open(tmp_path, 'wb') as file:
            pickle.dump(model, file)
        # Replace the destination in one step once the dump is complete.
        os.replace(tmp_path, filename)
        print(f"save sucessfully '{filename}'")
    except Exception as e:
        print(f"Error: {e}")
        # Best-effort cleanup of the partial temp file.
        try:
            os.remove(tmp_path)
        except OSError:
            pass

In [35]:
# Persist the tuned RBF SVM to disk.
save_model(best_model_rbf, 'SVM_rbf_fasttext_weight_t5.pkl')

save sucessfully 'SVM_rbf_fasttext_weight_t5.pkl'
