In [None]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
import xgboost as xgb
from collections import Counter
import math
import pickle

# Load the new dataset
new_df = pd.read_csv('preprocessed_news_data.csv')

# Ensure the 'processed_text' column exists
if 'processed_text' not in new_df.columns:
    raise ValueError("The input CSV must contain a 'processed_text' column.")

# Tokenize the processed text on whitespace.
# read_csv parses empty cells as NaN (a float), and calling .split() on that
# raises AttributeError. Missing text is treated as an empty document so the
# row still gets a (zero) document vector further down instead of crashing.
new_df['tokens'] = new_df['processed_text'].apply(
    lambda text: [] if pd.isna(text) else str(text).split()
)

# Load the trained Word2Vec model
w2v_model = Word2Vec.load("word2vec_model.model")

# Calculate IDF values using only the new dataset
def calculate_idf(corpus):
    """Compute an inverse document frequency for every word in ``corpus``.

    Args:
        corpus: Sized collection of documents, each an iterable of hashable
            tokens.

    Returns:
        dict mapping each distinct word to ``log(N / (1 + df))``, where ``N``
        is the number of documents and ``df`` is the number of documents that
        contain the word.

    Note:
        With this formula the value is 0 when a word occurs in ``N - 1``
        documents and negative when it occurs in all ``N`` documents.
    """
    n_docs = len(corpus)
    # set(doc) removes repeats so each word is counted once per document.
    doc_freq = Counter(word for doc in corpus for word in set(doc))
    return {word: math.log(n_docs / (1 + df)) for word, df in doc_freq.items()}

# Build the corpus (one token list per article) and derive IDF weights from it.
# NOTE(review): these IDF values come from the dataset being scored, not from
# the corpus the XGBoost model was trained on - confirm this matches training.
new_corpus = list(new_df['tokens'])
idf_values = calculate_idf(new_corpus)

# Define function to calculate TF-IDF weighted document vectors
def calculate_tfidf_weighted_doc_vector(tokens, idf_values, w2v_model):
    """Average the word vectors of ``tokens`` using TF-IDF scores as weights.

    Args:
        tokens: Sequence of tokens for one document.
        idf_values: Mapping from word to its IDF value.
        w2v_model: Object exposing ``vector_size`` and a ``wv`` lookup that
            supports ``in`` and ``[]`` by token.

    Returns:
        A NumPy array of length ``w2v_model.vector_size``: the weighted sum of
        vectors divided by the sum of weights, or an all-zero vector when the
        accumulated weight is not strictly positive (for example no usable
        tokens).

    Note:
        The loop walks every occurrence in ``tokens``, so a word that appears
        k times contributes its TF-IDF weight k times.
    """
    dim = w2v_model.vector_size
    n_tokens = len(tokens)
    # Relative term frequency of each distinct token within this document.
    term_freq = {
        word: occurrences / n_tokens
        for word, occurrences in Counter(tokens).items()
    }
    embeddings = w2v_model.wv

    accumulator = np.zeros(dim)
    weight_total = 0.0
    for word in tokens:
        # Skip tokens that lack either an embedding or an IDF value.
        if word not in embeddings or word not in idf_values:
            continue
        weight = term_freq[word] * idf_values[word]
        accumulator += embeddings[word] * weight
        weight_total += weight

    # Only normalise when the total weight is strictly positive.
    if not (weight_total > 0):
        return np.zeros(dim)
    return accumulator / weight_total

# Build one TF-IDF weighted embedding per article and collect them in a single NumPy array
new_doc_vectors = np.array([
    calculate_tfidf_weighted_doc_vector(article_tokens, idf_values, w2v_model)
    for article_tokens in new_df['tokens']
])

# Restore the trained XGBoost model from its pickle file.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a model file that comes from a trusted source.
with open('xgboost_model.pkl', 'rb') as f:
    xgb_model = pickle.load(f)

# Wrap the feature matrix in a DMatrix and score every article.
# NOTE(review): presumably xgb_model is a native Booster, since predict() is
# handed a DMatrix - confirm against the training notebook.
dnew = xgb.DMatrix(new_doc_vectors)
new_df['sentiment_score'] = xgb_model.predict(dnew)

# Persist the DataFrame, now including the sentiment scores, to a new CSV file
new_df.to_csv('new_articles_with_sentiment.csv', index=False)

print("Sentiment scores have been added and saved to 'new_articles_with_sentiment.csv'.")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read back the articles together with their predicted sentiment scores
df = pd.read_csv('new_articles_with_sentiment.csv')

# Span values to compare; each one yields its own smoothed column
spans = [3, 5, 10, 15]

# Add an 'ewma_<span>' column holding the exponentially weighted mean of the
# sentiment scores, taken in the row order of the CSV
sentiment = df['sentiment_score']
for window in spans:
    df[f'ewma_{window}'] = sentiment.ewm(span=window).mean()

# Write the DataFrame, including the new EWMA columns, to disk
df.to_csv('new_articles_with_ewma.csv', index=False)
print("EWMA calculations added to the dataset and saved to 'new_articles_with_ewma.csv'")

In [None]:
# Plot the raw sentiment scores together with every EWMA series.
# Relies on `df`, `spans` and `plt` defined in the earlier cell.
plt.figure(figsize=(12, 6))

# Plot the original sentiment scores
plt.plot(df.index, df['sentiment_score'], label='Raw Sentiment Score', color='gray', alpha=0.6)

# Plot each EWMA line.
# The palette is indexed cyclically rather than zipped with `spans`: zip stops
# at the shorter sequence, which previously dropped the last span (15) from
# the chart because only three colours were listed for four spans.
colors = ['blue', 'red', 'green', 'orange']
for position, span in enumerate(spans):
    column_name = f'ewma_{span}'
    color = colors[position % len(colors)]
    plt.plot(df.index, df[column_name], label=f'EWMA (span={span})', color=color, linewidth=2)

# Add chart details
plt.title('Sentiment Scores with EWMA Smoothing', fontsize=16)
plt.xlabel('Article Index', fontsize=12)
plt.ylabel('Sentiment Score', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)

# Save the plot (must happen before plt.show(), which may clear the figure)
plt.tight_layout()
plt.savefig('sentiment_ewma_comparison.png', dpi=300)
plt.show()

print("Graph saved as 'sentiment_ewma_comparison.png'")