In [None]:
import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
from collections import Counter
import math

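# Install required packages (IPython shell escapes, not plain Python).
# NOTE(review): these run after the imports above; an upgraded numpy is
# presumably only picked up after a kernel restart — consider moving the
# installs to the first cell.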
!pip install gensim
!pip install --upgrade numpy

# Load the preprocessed CSV file
df = pd.read_csv('labelled_news_sentiment.csv')

# Tokenize the processed text on whitespace.
# read_csv parses empty cells as NaN (a float), and calling .split() on a float
# raises AttributeError. Missing text is therefore treated as an empty document:
# the row keeps its position and simply receives an empty token list.
df['tokens'] = df['processed_text'].fillna('').apply(str.split)

print("Sample tokens:", df['tokens'].head())

# Doc2Vec consumes TaggedDocument objects: wrap each row's token list and tag
# it with the row's position (as a string) so its vector can be fetched later.
tagged_data = [
    TaggedDocument(words=doc_tokens, tags=[str(doc_id)])
    for doc_id, doc_tokens in enumerate(df['tokens'])
]

# Configure the Doc2Vec model (no corpus is passed yet, so nothing is trained here)
d2v_model = Doc2Vec(
    dm=1,             # 1 = distributed memory (PV-DM); 0 = distributed bag of words (PV-DBOW)
    vector_size=100,  # dimensionality of each document vector
    window=5,         # context window size
    min_count=1,      # minimum frequency for a word to be kept
    epochs=30,        # number of training epochs
    workers=10,       # number of training threads
)

# First pass over the corpus: build the vocabulary
d2v_model.build_vocab(tagged_data)

# Then train, reusing the corpus size and epoch count stored on the model
d2v_model.train(
    tagged_data,
    total_examples=d2v_model.corpus_count,
    epochs=d2v_model.epochs,
)

print("Doc2Vec model trained.")

# Collect the learned vector of every row, in row order, by looking it up
# under the string tag ("0", "1", ...) that was assigned before training.
doc_vectors = np.array(
    [d2v_model.dv[tag] for tag in map(str, range(len(df)))]
)

print("Document vectors shape:", doc_vectors.shape)

# Features are the document vectors; the target is the sentiment column
X, y = doc_vectors, df['news_sentiment'].values