In [None]:
!pip install gensim
!pip install -upgrade numpy

import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
from collections import Counter
import math

# Load the preprocessed, sentiment-labelled news CSV.
df = pd.read_csv('labelled_news_sentiment.csv')

# Whitespace-tokenize the processed text.
# pandas reads empty cells as NaN (a float), and calling .split() on that
# raises AttributeError. Mapping missing cells to '' first turns such rows into
# an empty token list, which document_vector() turns into a zero vector.
df['tokens'] = df['processed_text'].fillna('').astype(str).str.split()

print("Sample tokens:", df['tokens'].head())

# Prepare the corpus: a list of token lists, one per document.
corpus = df['tokens'].tolist()

# Train the Word2Vec model on the corpus.
w2v_model = Word2Vec(sentences=corpus,
                     vector_size=100,    # Dimension of the word vectors
                     window=7,           # Context window size
                     min_count=1,        # Minimum frequency for a word to be considered
                     workers=10,         # Number of threads for training
                     epochs=30)          # Number of passes over the corpus

print("Word2Vec model trained.")

def document_vector(tokens, model):
    """Return one fixed-size vector for a document by averaging its word vectors.

    Args:
        tokens: Iterable of token strings making up the document.
        model: Trained embedding model exposing ``wv`` (supports ``in`` and
            list indexing) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens known to
        ``model.wv``, or a zero vector of length ``model.vector_size`` when
        none of the tokens is known.
    """
    keyed_vectors = model.wv

    # Out-of-vocabulary tokens are skipped so they cannot raise on lookup.
    known = [tok for tok in tokens if tok in keyed_vectors]

    # Nothing to average: fall back to a neutral all-zero embedding.
    if not known:
        return np.zeros(model.vector_size)

    # Indexing with a list yields one row per token; average across rows.
    return np.mean(keyed_vectors[known], axis=0)

# Regression target: the sentiment label of each news item.
y = df['news_sentiment'].values

# Feature matrix: one averaged Word2Vec embedding per document (row of df).
doc_vectors = np.array(
    list(map(lambda doc_tokens: document_vector(doc_tokens, w2v_model), df['tokens']))
)

print("Document vectors shape:", doc_vectors.shape)

X = doc_vectors

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import xgboost as xgb

# Split data into training and testing sets (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)

# Convert to DMatrix format for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set parameters for XGBoost regression
params = {
    'objective': 'reg:squarederror',  # Squared-error regression loss
    'max_depth': 5,                   # Maximum depth of each tree
    'eta': 0.1,                       # Learning rate
    'subsample': 0.8,                 # Fraction of rows sampled per tree
    'colsample_bytree': 0.8           # Fraction of features sampled per tree
}

# Train the model with early stopping. The last entry in `evals` ('test') is
# the one monitored for early stopping.
# NOTE(review): the same test split drives early stopping and the final
# metrics below, so the reported MAE is optimistically biased; consider
# carving a separate validation split out of the training data.
xgb_model = xgb.train(params, dtrain, num_boost_round=100,
                      evals=[(dtrain, 'train'), (dtest, 'test')],
                      early_stopping_rounds=10, verbose_eval=True)

# Use the trained model for predictions.
# The native Booster.predict() uses every boosted round, including the rounds
# added after the best one while waiting for early stopping to trigger.
# Restrict prediction to the best iteration explicitly.
y_pred = xgb_model.predict(dtest, iteration_range=(0, xgb_model.best_iteration + 1))

# Calculate Mean Absolute Error
mae = mean_absolute_error(y_test, y_pred)
print("\nFinal Evaluation Metrics:")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
# Mean absolute value of the true targets, printed as a scale reference for the MAE.
maa = np.mean(np.abs(y_test))
print(f"Mean Absolute Average (MAA): {maa:.4f}")