<a href="https://colab.research.google.com/github/RafaelNovais/MasterAI/blob/master/WordEmbeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
import os

# Load GloVe embeddings (Assuming 'glove.6B.300d.txt' is in your directory)
def load_glove_embeddings(glove_file_path):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Every non-blank line is split on whitespace; the first field is taken as
    the token and the remaining fields as its vector components.

    Args:
        glove_file_path: Path to the UTF-8 encoded embeddings text file.

    Returns:
        dict mapping each token to a 1-D ``float32`` NumPy array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be converted to a float.
    """
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # Blank or whitespace-only lines (e.g. a trailing empty line) have
            # no token; indexing into them would raise IndexError.
            if not values:
                continue
            word, *components = values
            embeddings_index[word] = np.asarray(components, dtype='float32')
    return embeddings_index

# Convert each sentence to its corresponding sentence vector (average of word vectors)
def sentence_to_avg_vector(sentence, embeddings_index, embedding_dim=300):
    """Average the embeddings of the known words in a sentence.

    The sentence is split on whitespace and each token is looked up verbatim
    (no lower-casing or punctuation stripping) in ``embeddings_index``.

    Args:
        sentence: Text to embed.
        embeddings_index: Mapping from token to its embedding vector.
        embedding_dim: Length of the zero vector returned when no token of
            the sentence is present in ``embeddings_index``.

    Returns:
        The element-wise mean of the matched vectors, or
        ``np.zeros(embedding_dim)`` if nothing matched.
    """
    known_vectors = []
    for token in sentence.split():
        if token in embeddings_index:
            known_vectors.append(embeddings_index[token])

    # No token had an embedding: fall back to an all-zero sentence vector.
    if not known_vectors:
        return np.zeros(embedding_dim)

    return np.asarray(known_vectors).mean(axis=0)

# Feature extraction function using GloVe embeddings
def extract_features_glove(df, embeddings_index, embedding_dim=300, label_encoder=None):
    """Build the feature matrix and encoded labels for one dataset split.

    Args:
        df: DataFrame with a ``'text'`` column (missing values are treated as
            empty strings) and a ``'sentiment'`` label column. The frame is
            not modified.
        embeddings_index: Mapping from token to its embedding vector.
        embedding_dim: Passed through to ``sentence_to_avg_vector``.
        label_encoder: Optional already-fitted ``LabelEncoder``. When given,
            labels are encoded with it so that every split shares the same
            label-to-integer mapping. When ``None`` a new encoder is fitted
            on this split's labels.

    Returns:
        Tuple ``(X, y, label_encoder)``: the array of sentence vectors, the
        integer-encoded labels, and the encoder that produced them.

    Raises:
        ValueError: If ``label_encoder`` is given and ``df['sentiment']``
            contains a label it was not fitted on.
    """
    # Fill missing text on a derived Series rather than assigning back into
    # ``df``, so the caller's DataFrame is left untouched.
    texts = df['text'].fillna('')
    X = np.array([sentence_to_avg_vector(text, embeddings_index, embedding_dim) for text in texts])

    if label_encoder is None:
        label_encoder = LabelEncoder()
        y = label_encoder.fit_transform(df['sentiment'])
    else:
        # Reuse the supplied mapping instead of deriving a new one from this
        # split, which could assign different integers to the same label.
        y = label_encoder.transform(df['sentiment'])

    return X, y, label_encoder

# Train the SVM model
def get_model(train_x, train_y):
    """Fit a linear-kernel SVC (``C=1.0``, ``random_state=42``) and return it.

    Args:
        train_x: Training feature matrix.
        train_y: Training labels.

    Returns:
        The fitted ``SVC`` instance.
    """
    classifier = SVC(kernel='linear', C=1.0, random_state=42)
    # ``fit`` returns the estimator itself, so it can be returned directly.
    return classifier.fit(train_x, train_y)

# Make predictions on the test set
def make_predictions_on_test(test_x, model):
    """Return ``model.predict(test_x)``.

    Args:
        test_x: Feature matrix to predict on.
        model: Any object exposing a ``predict`` method.

    Returns:
        Whatever ``model.predict`` returns for ``test_x``.
    """
    return model.predict(test_x)

# Calculate precision, recall, and F1-score
def calc_pre_recall_f1(pred_y, test_y):
    """Compute weighted-average precision, recall and F1.

    Args:
        pred_y: Predicted labels.
        test_y: Ground-truth labels.

    Returns:
        Tuple ``(precision, recall, f1)``, each computed with
        ``average='weighted'``.
    """
    # Note the argument order: the scorers take the true labels first.
    scorers = (precision_score, recall_score, f1_score)
    return tuple(scorer(test_y, pred_y, average='weighted') for scorer in scorers)

# MAIN WORKFLOW
if __name__ == "__main__":
    # Load GloVe embeddings
    glove_file_path = 'path_to_glove/glove.6B.300d.txt'
    embeddings_index = load_glove_embeddings(glove_file_path)

    # Load datasets
    train_df = pd.read_csv('path_to_train.csv')
    dev_df = pd.read_csv('path_to_dev.csv')
    test_df = pd.read_csv('path_to_test.csv')

    # Extract features using GloVe embeddings
    embedding_dim = 300  # must match the dimensionality of the loaded vectors
    train_x, train_y, label_encoder = extract_features_glove(train_df, embeddings_index, embedding_dim)
    dev_x, _, _ = extract_features_glove(dev_df, embeddings_index, embedding_dim)
    test_x, _, _ = extract_features_glove(test_df, embeddings_index, embedding_dim)

    # Encode dev/test labels with the encoder fitted on the training split.
    # Labels encoded by an encoder fitted on dev or test alone can map the
    # same sentiment to a different integer (e.g. when a class is absent from
    # that split), which would silently corrupt the evaluation below.
    # ``transform`` raises ValueError on labels unseen during training.
    dev_y = label_encoder.transform(dev_df['sentiment'])
    test_y = label_encoder.transform(test_df['sentiment'])
    # NOTE: dev_x / dev_y are prepared but not used further in this script.

    # Train the SVM model
    model = get_model(train_x, train_y)

    # Make predictions on the test set
    predictions = make_predictions_on_test(test_x, model)

    # Calculate evaluation metrics
    precision, recall, f1 = calc_pre_recall_f1(predictions, test_y)

    # Output evaluation metrics
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)

    # Additional accuracy and classification report
    print("Test Accuracy:", accuracy_score(test_y, predictions))
    print("Test Classification Report:\n", classification_report(test_y, predictions))
