In [6]:
import pandas as pd
import numpy as np
from datetime import datetime
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

import nltk
from nltk.corpus import stopwords

from transformers import BertTokenizer, BertForSequenceClassification, pipeline


In [None]:
df = pd.read_csv("data/rss_articles.csv")

df = df[['timestamp', 'title', 'text_body','source',]]

df['Date'] = pd.to_datetime(df['timestamp']).dt.date
df = df.dropna(subset=['text_body'])
df.head()


Unnamed: 0,timestamp,title,text_body,source,Date
0,2025-11-01 16:07:45+05:30,Dalal Street Week Ahead: Technical charts sign...,"Indian markets traded rangebound last week, en...",ECONOMIC TIMES,2025-11-01
1,2025-11-01 16:05:09+05:30,F&amp;O Talk| Nifty logs 11 sessions of tight ...,Markets ended lower after a four-week rally du...,ECONOMIC TIMES,2025-11-01
2,2025-11-01 15:58:46+05:30,"CarTrade Tech, Chennai Petro among 10 smallcap...","Markets ended their four-week winning streak, ...",ECONOMIC TIMES,2025-11-01
3,2025-11-01 15:45:33+05:30,"CarTrade Tech, Chennai Petro among 10 smallcap...","Amid the volatility, the broader indices conti...",ECONOMIC TIMES,2025-11-01
4,2025-11-01 15:17:06+05:30,Sectoral and thematic mutual funds outperform ...,"Motilal Oswal Nasdaq 100 FOF, the topper in th...",ECONOMIC TIMES,2025-11-01


In [9]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clear_text(x):
    words = str(x).split()
    filtered = [w for w in words if w.lower() not in stop_words]
    return " ".join(filtered)[:512]

df['text_body'] = df['text_body'].apply(clear_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
tokenizer = BertTokenizer.from_pretrained("yiyanghkust/finbert-tone")
model = BertForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")
finbert = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

tqdm.pandas()
df['sent_finbert'] = df['text_body'].progress_apply(lambda x: finbert(x)[0])
df['sent_label']  = df['sent_finbert'].apply(lambda x: x['label'].lower())
df['sent_score']  = df['sent_finbert'].apply(lambda x: x['score'])

sent_map = {"neutral":0, "positive":1, "negative":-1}

df['sent_numeric'] = df['sent_label'].map(sent_map)
df['sent_weight']  = df['sent_numeric'] * df['sent_score']


Device set to use cpu
100%|██████████| 432/432 [01:54<00:00,  3.77it/s]


In [12]:
daily = (
    df.groupby('Date')['sent_weight']
    .mean()
    .reset_index()
    .rename(columns={'sent_weight':'daily_sent'})
)

daily.head()


Unnamed: 0,Date,daily_sent
0,2008-06-29,0.0
1,2008-07-02,0.999997
2,2008-07-28,0.0
3,2008-10-01,0.0
4,2008-12-05,0.0


In [21]:
import yfinance as yf

nifty = yf.download("^NSEI", start="2010-01-01")
nifty = nifty.reset_index()

nifty = nifty[[('Date', ''), ('Close', '^NSEI')]]
nifty.columns = ['Date', 'Close']
nifty['Date'] = nifty['Date'].dt.date
nifty.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0,Date,Close
0,2010-01-04,5232.200195
1,2010-01-05,5277.899902
2,2010-01-06,5281.799805
3,2010-01-07,5263.100098
4,2010-01-08,5244.75


In [23]:
df_merged = pd.merge(nifty, daily, on="Date", how="left")
df_merged['daily_sent'] = df_merged['daily_sent'].fillna(0)

df_merged.head()

Unnamed: 0,Date,Close,daily_sent
0,2010-01-04,5232.200195,0.000000
1,2010-01-05,5277.899902,0.000000
2,2010-01-06,5281.799805,0.000000
3,2010-01-07,5263.100098,0.000000
4,2010-01-08,5244.750000,0.000000
...,...,...,...
3887,2025-10-31,25722.099609,0.407869
3888,2025-11-03,25763.349609,0.000000
3889,2025-11-04,25597.650391,0.000000
3890,2025-11-06,25509.699219,0.000000


In [25]:
df_merged['Return_1d'] = df_merged['Close'].pct_change().shift(-1)

def label_signal(r):
    if r > 0:
        return 1
    elif r < 0:
        return -1
    else:
        return 0

df_merged['signal'] = df_merged['Return_1d'].apply(label_signal)

df_merged.dropna(inplace=True)
df_merged.head()


Unnamed: 0,Date,Close,daily_sent,Return_1d,signal
0,2010-01-04,5232.200195,0.0,0.008734,1
1,2010-01-05,5277.899902,0.0,0.000739,1
2,2010-01-06,5281.799805,0.0,-0.00354,-1
3,2010-01-07,5263.100098,0.0,-0.003487,-1
4,2010-01-08,5244.75,0.0,0.000887,1


In [26]:
features = ['daily_sent']
X = df_merged[features]
y = df_merged['signal']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)


### **Logistic Regression Model**

In [27]:
logr = LogisticRegression(max_iter=200)
logr.fit(X_train_scaled, y_train)

pred_lr = logr.predict(X_test_scaled)
print("Logistic Regression Accuracy:", accuracy_score(y_test, pred_lr))
print(classification_report(y_test, pred_lr))


Logistic Regression Accuracy: 0.5449871465295629
              precision    recall  f1-score   support

          -1       0.60      0.01      0.02       354
           0       0.00      0.00      0.00         1
           1       0.54      1.00      0.70       423

    accuracy                           0.54       778
   macro avg       0.38      0.33      0.24       778
weighted avg       0.57      0.54      0.39       778



### **Random Forest Classifier**

In [28]:
rf = RandomForestClassifier(n_estimators=300, max_depth=5)
rf.fit(X_train, y_train)

pred_rf = rf.predict(X_test)
print("RF Accuracy:", accuracy_score(y_test, pred_rf))
print(classification_report(y_test, pred_rf))


RF Accuracy: 0.5424164524421594
              precision    recall  f1-score   support

          -1       0.43      0.01      0.02       354
           0       0.00      0.00      0.00         1
           1       0.54      0.99      0.70       423

    accuracy                           0.54       778
   macro avg       0.32      0.33      0.24       778
weighted avg       0.49      0.54      0.39       778



### **XGBoost Classifier**

In [30]:
label_mapping = {-1: 0, 0: 1, 1: 2}
y_train_mapped = y_train.map(label_mapping)
y_test_mapped = y_test.map(label_mapping)

xgb_clf = xgb.XGBClassifier(
    objective="multi:softmax",
    num_class=3,
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05
)

xgb_clf.fit(X_train, y_train_mapped)
pred_xgb = xgb_clf.predict(X_test)

print("XGB Accuracy:", accuracy_score(y_test_mapped, pred_xgb))
print(classification_report(y_test_mapped, pred_xgb))

XGB Accuracy: 0.5437017994858612
              precision    recall  f1-score   support

           0       0.50      0.00      0.01       354
           1       0.00      0.00      0.00         1
           2       0.54      1.00      0.70       423

    accuracy                           0.54       778
   macro avg       0.35      0.33      0.24       778
weighted avg       0.52      0.54      0.39       778



### **NN - MLPClassifier**

In [31]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(64,32), max_iter=400)
mlp.fit(X_train_scaled, y_train)

pred_mlp = mlp.predict(X_test_scaled)
print("MLP Accuracy:", accuracy_score(y_test, pred_mlp))
print(classification_report(y_test, pred_mlp))


MLP Accuracy: 0.5449871465295629
              precision    recall  f1-score   support

          -1       0.57      0.01      0.02       354
           0       0.00      0.00      0.00         1
           1       0.54      0.99      0.70       423

    accuracy                           0.54       778
   macro avg       0.37      0.33      0.24       778
weighted avg       0.56      0.54      0.39       778



### **Gradient Boosting Classifier**

In [32]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)

pred_gbc = gbc.predict(X_test)
print("GBC Accuracy:", accuracy_score(y_test, pred_gbc))
print(classification_report(y_test, pred_gbc))


GBC Accuracy: 0.5424164524421594
              precision    recall  f1-score   support

          -1       0.43      0.01      0.02       354
           0       0.00      0.00      0.00         1
           1       0.54      0.99      0.70       423

    accuracy                           0.54       778
   macro avg       0.32      0.33      0.24       778
weighted avg       0.49      0.54      0.39       778



### **SVC**

In [33]:
from sklearn.svm import SVC

svm_clf = SVC(kernel="rbf", C=1.0)
svm_clf.fit(X_train_scaled, y_train)

pred_svm = svm_clf.predict(X_test_scaled)
print("SVM Accuracy:", accuracy_score(y_test, pred_svm))
print(classification_report(y_test, pred_svm))


SVM Accuracy: 0.5424164524421594
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       354
           0       0.00      0.00      0.00         1
           1       0.54      1.00      0.70       423

    accuracy                           0.54       778
   macro avg       0.18      0.33      0.23       778
weighted avg       0.30      0.54      0.38       778

