In [None]:
!pip install gensim
!pip install --upgrade numpy

import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
from collections import Counter
import math

# Load the preprocessed CSV file
df = pd.read_csv('labelled_news_sentiment.csv')

# Tokenize the processed text on whitespace.
# pandas reads empty cells as NaN (a float), which has no .split(); map those
# rows to an empty token list instead of failing with AttributeError.
df['tokens'] = df['processed_text'].fillna('').astype(str).apply(str.split)

print("Sample tokens:", df['tokens'].head())

# Prepare the corpus: a list of token lists, one per row of the DataFrame
corpus = df['tokens'].tolist()

# Train the Word2Vec model on the tokenized corpus
w2v_model = Word2Vec(sentences=corpus,
                     vector_size=500,    # Dimension of the word vectors
                     window=10,          # Context window size
                     min_count=1,        # Minimum frequency for a word to be considered
                     workers=10,         # Number of threads for training
                     epochs=30)          # Number of passes over the corpus

print("Word2Vec model trained.")

# Calculate IDF values for all words in the corpus
def calculate_idf(corpus):
    """
    Build an inverse-document-frequency table for a tokenized corpus.

    Parameters:
    - corpus: Sequence of documents, each an iterable of hashable tokens.

    Returns:
    - Dict mapping every token seen in the corpus to log(N / (1 + df)),
      where N is the number of documents and df is the number of documents
      containing the token. The value is <= 0 when df >= N - 1.
    """
    n_docs = len(corpus)

    # Document frequency: each document contributes at most one count per
    # token, hence the de-duplication through set().
    doc_freq = Counter()
    for doc in corpus:
        doc_freq.update(set(doc))

    idf_table = {}
    for term, df_count in doc_freq.items():
        idf_table[term] = math.log(n_docs / (1 + df_count))
    return idf_table

# Compute the IDF table once over the whole corpus; it is reused for every
# document when the weighted document vectors are built.
idf_values = calculate_idf(corpus)

def calculate_tfidf_weighted_doc_vector(tokens, idf_values, w2v_model):
    """
    Calculate TF-IDF weighted average of Word2Vec vectors for a document.

    Each distinct token contributes its word vector once, weighted by
    tf * idf, where tf is the token's count divided by the document length.
    Tokens that are missing from the Word2Vec vocabulary or from
    `idf_values` are ignored, as are tokens whose TF-IDF weight is not
    strictly positive.

    Parameters:
    - tokens: List of tokens in the document.
    - idf_values: Precomputed IDF values for all words.
    - w2v_model: Trained Word2Vec model (must expose `vector_size` and `wv`).

    Returns:
    - Weighted average vector representation of the document, or a zero
      vector of length `w2v_model.vector_size` when no token carries a
      positive weight (including the empty document).
    """
    vector_size = w2v_model.vector_size
    total_terms = len(tokens)
    if total_terms == 0:
        return np.zeros(vector_size)

    word_vectors = w2v_model.wv
    weighted_sum = np.zeros(vector_size)
    total_weight = 0.0

    # Iterate over distinct tokens: the TF factor already accounts for how
    # often a token occurs, so looping over every occurrence would count the
    # frequency twice (weight proportional to count ** 2).
    for token, count in Counter(tokens).items():
        if token not in word_vectors or token not in idf_values:
            continue

        tfidf_score = (count / total_terms) * idf_values[token]

        # A weighted average needs non-negative weights. IDF can be <= 0 for
        # words present in (almost) every document; letting such weights in
        # can drive total_weight towards zero and blow the result up, or
        # cancel the valid tokens out entirely.
        if tfidf_score <= 0:
            continue

        weighted_sum += word_vectors[token] * tfidf_score
        total_weight += tfidf_score

    if total_weight > 0:
        return weighted_sum / total_weight
    return np.zeros(vector_size)

# Build one TF-IDF weighted embedding per row and stack them into an array
doc_vectors = np.array([
    calculate_tfidf_weighted_doc_vector(row_tokens, idf_values, w2v_model)
    for row_tokens in df['tokens']
])
print("Document vectors shape:", doc_vectors.shape)

# Regression target and feature matrix for the downstream model
y = df['news_sentiment'].values
X = doc_vectors



from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import xgboost as xgb

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)

# Carve a validation set out of the training data for early stopping.
# Early stopping picks the number of boosting rounds that minimises the error
# on its evaluation set; doing that on the test set and then reporting the
# test error gives an optimistically biased metric.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Convert to DMatrix format for XGBoost
dtrain = xgb.DMatrix(X_fit, label=y_fit)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set parameters for XGBoost regression
params = {
    'objective': 'reg:squarederror',
    'max_depth': 5,
    'eta': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8
}

# Train the model with early stopping. XGBoost monitors the LAST entry of
# `evals` for early stopping, so the validation set must come last.
xgb_model = xgb.train(params, dtrain, num_boost_round=100,
                      evals=[(dtrain, 'train'), (dval, 'validation')],
                      early_stopping_rounds=10, verbose_eval=True)

# Use the trained model for predictions on the untouched test set.
# xgb.train returns the booster from the final round, not the best one, so
# restrict prediction to the best iteration when it was recorded.
best_iteration = getattr(xgb_model, 'best_iteration', None)
if best_iteration is not None:
    y_pred = xgb_model.predict(dtest, iteration_range=(0, best_iteration + 1))
else:
    y_pred = xgb_model.predict(dtest)

# Calculate Mean Absolute Error
mae = mean_absolute_error(y_test, y_pred)
print("\nFinal Evaluation Metrics:")
print(f"Mean Absolute Error (MAE): {mae:.4f}")

# Mean absolute value of the targets, printed as a scale reference for the MAE
maa = np.mean(np.abs(y_test))
print(f"Mean Absolute Average (MAA): {maa:.4f}")