In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('spacy_yangswei_85_lemmatized.csv')
test_data = pd.read_csv('spacy_yangswei_85_lemmatized_test.csv')

In [3]:
pip install gensim




In [4]:
import gensim
import numpy as np

model_path = 'wiki-news-300d-1M.vec'
model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=False)

In [5]:
def get_word_vector(word, model):
    try:
        vector = model[word]
        return vector
    except KeyError:
        return None 

In [6]:
def get_sentence_vector(sentence, model):
    words = sentence.split()  
    word_vectors = []
    
    for word in words:
        vector = get_word_vector(word, model)
        if vector is not None:
            word_vectors.append(vector)
    
    if len(word_vectors) == 0:
        return np.zeros(model.vector_size)  

    return np.mean(word_vectors, axis=0)

In [7]:
data['sentence_vector'] = data['processed_text'].apply(lambda x: get_sentence_vector(x, model))
test_data['sentence_vector'] = test_data['processed_text'].apply(lambda x: get_sentence_vector(x, model))

In [10]:
data.to_csv('fasttext_yangswei85_train.csv')
test_data.to_csv('fasttext_yangswei85_test.csv')

In [12]:
# convert the post embeddings to a numPy array for model training
X_train = np.array(data['sentence_vector'].tolist())
y_train = data['label'].values  

# convert the post embeddings to a numPy array for model training
X_test = np.array(test_data['sentence_vector'].tolist())
y_test = test_data['label'].values  

In [13]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
from sklearn.utils.class_weight import compute_class_weight
from sklearn.svm import SVC
import numpy as np

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(zip(np.unique(y_train), class_weights))

print(class_weight_dict)

{'anger': 1.1271081443272901, 'fear': 3.2154656279304623, 'joy': 0.27820529623596524, 'love': 22.650406504065042, 'sadness': 0.9267552339965488, 'surprise': 11.886933333333333}


## SVM nonlinear rbf, without hyperparameters tuning

In [15]:
svm_model = SVC(kernel='rbf', class_weight=class_weight_dict)  
svm_model.fit(X_train, y_train)

In [16]:
predi = svm_model.predict(X_test)

NameError: name 'accuracy_score' is not defined

In [17]:
from sklearn.metrics import classification_report, accuracy_score
print(f"Accuracy: {accuracy_score(y_test, predi)}")

print("Classification Report:")
print(classification_report(y_test, predi))

Accuracy: 0.6526537754049082
Classification Report:
              precision    recall  f1-score   support

       anger       0.47      0.61      0.53      3296
        fear       0.33      0.53      0.41      1155
         joy       0.85      0.69      0.76     13353
        love       0.23      0.20      0.21       164
     sadness       0.53      0.62      0.57      4008
    surprise       0.32      0.47      0.38       313

    accuracy                           0.65     22289
   macro avg       0.46      0.52      0.48     22289
weighted avg       0.70      0.65      0.67     22289



## Non linear SVM, kernel rbf, using hyperparameters tuning

In [30]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform

In [41]:
param_grid = {
    'C': loguniform(0.01, 100)}

optr_search_rbf = RandomizedSearchCV(estimator=svm_model, param_distributions=param_grid, 
                                 cv=2, n_iter=2, n_jobs=-1, verbose=True)

optr_search_rbf.fit(X_train, y_train)

Fitting 2 folds for each of 2 candidates, totalling 4 fits


In [42]:
optr_search_rbf.best_params_

{'C': 10.676019997389606}

In [43]:
best_model_rbf = optr_search_rbf.best_estimator_

In [45]:
y_pred_rbf = best_model_rbf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_rbf)}")

print("Classification Report:")
print(classification_report(y_test, y_pred_rbf))

Accuracy: 0.6998070797254251
Classification Report:
              precision    recall  f1-score   support

       anger       0.53      0.50      0.51      3296
        fear       0.49      0.36      0.41      1155
         joy       0.79      0.83      0.81     13353
        love       0.75      0.11      0.19       164
     sadness       0.56      0.57      0.57      4008
    surprise       0.61      0.32      0.42       313

    accuracy                           0.70     22289
   macro avg       0.62      0.45      0.49     22289
weighted avg       0.69      0.70      0.69     22289



## Save pickle file model

In [46]:
import pickle

def save_model(model, filename):
    try:
        with open(filename, 'wb') as file:
            pickle.dump(model, file)
        print(f"save sucessfully '{filename}'")
    except Exception as e:
        print(f"Error: {e}")

In [47]:
save_model(best_model_rbf, 'SVM_rbf_fasttext_weight_yangswei85.pkl')

save sucessfully 'SVM_rbf_fasttext_weight_yangswei85.pkl'
