# Market Sentiment Prediction Model

In [None]:
!pip install praw openai pandas asyncpraw textblob pandas

In [None]:
!pip install transformers torch

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load FinBERT tokenizer and model
MODEL_NAME = "yiyanghkust/finbert-tone"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME)


In [None]:
import asyncpraw
import pandas as pd
from datetime import datetime, timedelta
import re
import asyncio
import nest_asyncio
from transformers import pipeline, AutoTokenizer

nest_asyncio.apply()

# Configurable time period for fetching posts (in days)
DAYS_BACK = 90  # Change this value to adjust the time range

# Minimum engagement thresholds
MIN_UPVOTES = 100
MIN_COMMENTS = 10
MIN_COMMENT_UPVOTES = 20
MIN_COMMENT_LENGTH = 30

# Initialize asyncpraw client
reddit = asyncpraw.Reddit(
    client_id="Enter_You_ID",
    client_secret="Enter_Your_Secret",
    user_agent="StockScraper"
)

# Relevant Stock Market Subreddits
subreddits = ["IndianStockMarket", "DalalStreetTalks", "StockMarketIndia", "IndianStreetBets", "NSEBets", "ShareMarketupdates"]

# Keywords to filter out low-effort posts
low_effort_keywords = ["meme", "joke", "funny", "shitpost", "lol", "haha", "troll"]

# Define time threshold
days_ago = datetime.utcnow() - timedelta(days=DAYS_BACK)

# Function to clean text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces and newlines
    return text

# Fetch and process Reddit posts
async def fetch_reddit_data():
    data = []
    for subreddit in subreddits:
        subreddit_obj = await reddit.subreddit(subreddit)
        async for post in subreddit_obj.top(time_filter='all', limit=500):  # Fetch more posts and filter manually

            # Extract Post Details
            post_id = post.id
            title = clean_text(post.title)
            body = clean_text(post.selftext)
            upvotes = post.score
            post_date_utc = datetime.utcfromtimestamp(post.created_utc)
            post_date = post_date_utc.strftime('%Y-%m-%d %H:%M:%S')

            # Skip posts older than the configured time period
            if post_date_utc < days_ago:
                continue

            # Apply filters to remove low-quality posts
            if any(word in title.lower() for word in low_effort_keywords):
                continue
            if len(title) < 30 and not body:  # Skip very short titles with no content
                continue
            if upvotes < MIN_UPVOTES:  # Skip low-engagement posts
                continue

            # Fetch & Store Top Comments
            comments_list = []
            try:
                submission = await reddit.submission(id=post.id)
                if submission.num_comments >= MIN_COMMENTS and not submission.locked:  # Ensure sufficient comments & post isn't locked
                    await submission.comments.replace_more(limit=0)  # Fetch all comments
                    all_comments = submission.comments.list()  # Get all comments

                    # Filter valid comments
                    valid_comments = [
                        c for c in all_comments
                        if hasattr(c, "body") and c.body not in ["[deleted]", "[removed]"]
                        and c.score >= MIN_COMMENT_UPVOTES and len(c.body) >= MIN_COMMENT_LENGTH
                    ]

                    if valid_comments:
                        comments_list = [{
                            "text": clean_text(c.body),
                            "upvotes": c.score
                        } for c in valid_comments]

            except Exception:
                pass

            # Skip if there are no valid comments
            if not comments_list:
                continue

            # Store Processed Data
            data.append({
                "Post ID": post_id,
                "Title": title,
                "Post Content": body,
                "Post Upvotes": upvotes,
                "Comments": comments_list,
                "Date": post_date,
                "Source Subreddit": subreddit
            })

    return data

# Run the function and clean data
async def main():
    data = await fetch_reddit_data()

    # Convert to DataFrame
    df = pd.DataFrame(data)

    # Save scraped data to JSON format
    df.to_json("reddit_stock_data_raw.json", orient="records", indent=4)
    print("✅ Scraped data saved successfully!")

# Execute async function
await main()

from google.colab import files
files.download("reddit_stock_data_raw.json")

ERROR:asyncio:Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7cbcd85b3f10>


✅ Scraped data saved successfully!


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Part 2: Sentiment Analysis
from transformers import AutoTokenizer, pipeline # Import AutoTokenizer
import pandas as pd # Import pandas and alias it as 'pd'

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
sentiment_analyzer = pipeline("text-classification", model="ProsusAI/finbert")

# Function to get sentiment scores
def analyze_sentiment(text):
    tokens = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")
    truncated_text = tokenizer.decode(tokens["input_ids"][0], skip_special_tokens=True)
    result = sentiment_analyzer(truncated_text, return_all_scores=True)[0]
    sentiment_scores = {sent["label"].lower(): sent["score"] for sent in result}
    overall_sentiment = max(sentiment_scores, key=sentiment_scores.get)
    confidence = sentiment_scores[overall_sentiment]

    return {
        "overall_sentiment": overall_sentiment,
        "confidence": confidence
    }

# Load scraped data
df = pd.read_json("reddit_stock_data_raw.json")

# Apply sentiment analysis
df["Post Sentiment"] = df.apply(lambda row: analyze_sentiment(row["Title"] + " " + row["Post Content"]), axis=1)
for index, row in df.iterrows():
    for comment in row["Comments"]:
        comment.update(analyze_sentiment(comment["text"]))

# Save processed data
df.to_json("reddit_stock_data_sentiments.json", orient="records", indent=4)
print("✅ Sentiment analysis completed and saved successfully!")

from google.colab import files
files.download("reddit_stock_data_sentiments.json")

Device set to use cuda:0


✅ Sentiment analysis completed and saved successfully!


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from transformers import pipeline

# Load Zero-Shot Classification Model
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define possible categories
CATEGORY_LABELS = [
    "Market News",
    "Stock-Specific Discussion",
    "Investment Idea",
    "Trading Strategy",
    "General Discussion",
    "Regulatory News",
    "Macroeconomy",
    "Technical Analysis",
    "Fundamental Analysis"
]

# Function to classify posts
def classify_post(title, content, comments):
    # Combine post title, content, and top comments for classification
    combined_text = title + " " + content + " " + " ".join([c["text"] for c in comments])

    # Run zero-shot classification
    result = classifier(combined_text, CATEGORY_LABELS, multi_label=False)

    # Get the highest confidence category
    category = result["labels"][0]

    return category

# Load processed data with sentiment
df = pd.read_json("reddit_stock_data_sentiments.json")

# Assign category to each post
df["Category"] = df.apply(lambda row: classify_post(row["Title"], row["Post Content"], row["Comments"]), axis=1)

# Save categorized data
df.to_json("reddit_stock_data_category.json", orient="records", indent=4)
print("✅ Categorization completed and saved successfully!")

from google.colab import files
files.download("reddit_stock_data_category.json")

Device set to use cuda:0


✅ Categorization completed and saved successfully!


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#Merging Nifty50 Data

import yfinance as yf
import pandas as pd

# Fetch Nifty 50 historical data
def fetch_nifty50_data(days=365):
    nifty = yf.Ticker("^NSEI")  # Nifty 50 index
    nifty_data = nifty.history(period=f"{days}d")  # Get daily data for given period
    nifty_data = nifty_data.reset_index()[["Date", "Open", "High", "Low", "Close", "Volume"]]
    nifty_data["Date"] = nifty_data["Date"].dt.strftime('%Y-%m-%d')  # Format date
    return nifty_data

# Load Reddit sentiment data
reddit_data = pd.read_json("reddit_stock_data_category.json")

# Convert Reddit Date column to string (if it's not already)
reddit_data["Date"] = pd.to_datetime(reddit_data["Date"]).dt.strftime('%Y-%m-%d')

# Fetch market data
nifty_data = fetch_nifty50_data(365)

# Merge sentiment with market data based on date
merged_data = pd.merge(reddit_data, nifty_data, on="Date", how="inner")

# Save merged dataset
merged_data.to_json("reddit_nifty_combined.json", orient="records", indent=4)

print("✅ Nifty 50 data merged with Reddit sentiment!")

from google.colab import files
files.download("reddit_nifty_combined.json")


✅ Nifty 50 data merged with Reddit sentiment!


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score

# Load the dataset (merged Reddit + Nifty data)
data = pd.read_json("reddit_nifty_combined.json")

# Convert Date to datetime
data["Date"] = pd.to_datetime(data["Date"])

# Calculate Nifty % return
data["Nifty Return"] = data["Close"].pct_change() * 100

# Drop first row (NaN due to pct_change)
data = data.dropna()

### 🎯 Step 1: Compute Total Positive, Negative & Neutral Sentiments 🎯 ###
def calculate_sentiment_totals(row):
    total_positive = sum(comment["upvotes"] for comment in row["Comments"] if comment["overall_sentiment"] == "positive")
    total_negative = sum(comment["upvotes"] for comment in row["Comments"] if comment["overall_sentiment"] == "negative")
    total_neutral = sum(comment["upvotes"] for comment in row["Comments"] if comment["overall_sentiment"] == "neutral")

    return pd.Series([total_positive, total_negative, total_neutral])

# Apply sentiment calculations
data[["Total Positive Sentiment", "Total Negative Sentiment", "Total Neutral Sentiment"]] = data.apply(calculate_sentiment_totals, axis=1)

# Aggregate daily sentiment scores
daily_sentiment = data.groupby("Date")[["Total Positive Sentiment", "Total Negative Sentiment", "Total Neutral Sentiment"]].sum().reset_index()

# Merge with Nifty data
final_data = pd.merge(daily_sentiment, data[["Date", "Nifty Return"]].drop_duplicates(), on="Date", how="inner")

# Shift Nifty Return to T+1 (to make it the prediction target)
final_data["Target Return"] = final_data["Nifty Return"].shift(-1)

# Drop last row (since it has NaN target)
final_data = final_data.dropna()

# Save prepared dataset
final_data.to_csv("nifty_sentiment_data.csv", index=False)
print("✅ Data prepared for ML model!")


### 📊 Step 2: Train XGBoost Model 📊 ###
# Load the processed dataset
df = pd.read_csv("nifty_sentiment_data.csv")

# Features & target
X = df[["Total Positive Sentiment", "Total Negative Sentiment", "Total Neutral Sentiment"]]
y = df["Target Return"]

# Split into train & test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train XGBoost model
model = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=100, learning_rate=0.1, max_depth=4)
model.fit(X_train_scaled, y_train)

# Predictions
y_pred = model.predict(X_test_scaled)

# Evaluate model
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"📉 Mean Absolute Error (MAE): {mae:.4f}")
print(f"📈 R² Score: {r2:.4f}")
print("✅ XGBoost Model trained & saved!")


# Reddit Discussion Summary

In [None]:
''''
from transformers import pipeline
import pandas as pd
import json
import textwrap

# Load summarization model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Load the scraped Reddit data
with open("reddit_data_raw.json", "r") as f:
    reddit_data = json.load(f)

# Combine all posts and comments into one large text block
all_text = " ".join(
    post["Title"] + " " + post["Post Content"] + " " +
    " ".join(comment["text"] for comment in post["Comments"])
    for post in reddit_data
)

# Break text into smaller chunks (each ~1000 characters to fit model limits)
text_chunks = textwrap.wrap(all_text, width=1000)

# Generate summaries for each chunk and combine
summaries = [summarizer(chunk, max_length=200, min_length=50, do_sample=False)[0]["summary_text"] for chunk in text_chunks]

# Final combined summary
overall_summary = " ".join(summaries)
print(overall_summary)


In [None]:
!pip install praw pandas asyncpraw textblob pandas

In [3]:
import asyncpraw
import pandas as pd
from datetime import datetime, timedelta
import re
import asyncio
import nest_asyncio
from transformers import pipeline, AutoTokenizer

nest_asyncio.apply()

# Configurable time period for fetching posts (in days)
DAYS_BACK = 120  # Change this value to adjust the time range

# Minimum engagement thresholds
Num_Posts = 20 #Number of Posts to be scraped - Lower number since using paid version
MIN_UPVOTES = 100
MIN_COMMENTS = 10
MIN_COMMENT_UPVOTES = 20
MIN_COMMENT_LENGTH = 30

# Initialize asyncpraw client
reddit = asyncpraw.Reddit(
    client_id="Enter_Your_Client_ID",
    client_secret="Enter_Your_Secret_Key",
    user_agent="StockScraper"
)

# Relevant Stock Market Subreddits
subreddits = ["IndianStockMarket", "DalalStreetTalks", "StockMarketIndia", "IndianStreetBets", "NSEBets", "ShareMarketupdates"]

# Keywords to filter out low-effort posts
low_effort_keywords = ["meme", "joke", "funny", "shitpost", "lol", "haha", "troll"]

# Define time threshold
days_ago = datetime.utcnow() - timedelta(days=DAYS_BACK)

# Function to clean text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces and newlines
    return text

# Fetch and process Reddit posts
async def fetch_reddit_data():
    data = []
    for subreddit in subreddits:
        subreddit_obj = await reddit.subreddit(subreddit)
        async for post in subreddit_obj.top(time_filter='all', limit= Num_Posts):  # Fetch more posts and filter manually

            # Extract Post Details
            post_id = post.id
            title = clean_text(post.title)
            body = clean_text(post.selftext)
            upvotes = post.score
            post_date_utc = datetime.utcfromtimestamp(post.created_utc)
            post_date = post_date_utc.strftime('%Y-%m-%d %H:%M:%S')

            # Skip posts older than the configured time period
            if post_date_utc < days_ago:
                continue

            # Apply filters to remove low-quality posts
            if any(word in title.lower() for word in low_effort_keywords):
                continue
            if len(title) < 30 and not body:  # Skip very short titles with no content
                continue
            if upvotes < MIN_UPVOTES:  # Skip low-engagement posts
                continue

            # Fetch & Store Top Comments
            comments_list = []
            try:
                submission = await reddit.submission(id=post.id)
                if submission.num_comments >= MIN_COMMENTS and not submission.locked:  # Ensure sufficient comments & post isn't locked
                    await submission.comments.replace_more(limit=0)  # Fetch all comments
                    all_comments = submission.comments.list()  # Get all comments

                    # Filter valid comments
                    valid_comments = [
                        c for c in all_comments
                        if hasattr(c, "body") and c.body not in ["[deleted]", "[removed]"]
                        and c.score >= MIN_COMMENT_UPVOTES and len(c.body) >= MIN_COMMENT_LENGTH
                    ]

                    if valid_comments:
                        comments_list = [{
                            "text": clean_text(c.body),
                            "upvotes": c.score
                        } for c in valid_comments]

            except Exception:
                pass

            # Skip if there are no valid comments
            if not comments_list:
                continue

            # Store Processed Data
            data.append({
                "Post ID": post_id,
                "Title": title,
                "Post Content": body,
                "Post Upvotes": upvotes,
                "Comments": comments_list,
                "Date": post_date,
                "Source Subreddit": subreddit
            })

    return data

# Run the function and clean data
async def main():
    data = await fetch_reddit_data()

    # Convert to DataFrame
    df = pd.DataFrame(data)

    # Save scraped data to JSON format
    df.to_json("reddit_data_raw.json", orient="records", indent=4)


# Execute async function
await main()

In [4]:
import json
import google.generativeai as genai

# Load the JSON file containing Reddit stock discussions
with open("reddit_data_raw.json", "r") as f:
    reddit_data = json.load(f)

# Extract relevant text from posts and comments
all_text = "\n".join(
    post["Title"] + " " + post["Post Content"] + " " +
    " ".join(comment["text"] for comment in post["Comments"])
    for post in reddit_data
)

# Configure Gemini API (Replace with your own API key)
genai.configure(api_key="Enter_You_API_Key")

# Define the improved prompt for Gemini Flash
prompt = f"""
Analyze the following stock market discussions from Reddit and generate a concise and insightful summary.

**Overall Sentiment:**
- What is the general mood of investors? (e.g., optimism, fear, uncertainty)
- Are there concerns about government policies, economic trends, or specific sectors?

**Key Themes:**
- Identify the top discussions based on engagement (high upvotes and comments).
- Highlight the most talked-about economic trends, policies, market corrections, or investor concerns.

**Hot Stocks:**
- List stocks that were frequently mentioned.
- Explain the context in which these stocks were discussed (bullish/bearish sentiment, earnings, news impact).

**Actionable Insights for Investors & Traders:**
- What opportunities or risks should traders and investors watch for?
- Any recommendations based on sentiment, stock mentions, and market themes?

**Reddit Stock Discussion Data:**
{all_text}
"""

# Call Gemini Flash (Free model)
model = genai.GenerativeModel("gemini-pro")
response = model.generate_content(prompt)

# Print the summary
print(response.text)

**Overall Sentiment:**

- **Optimism:**
    - **Mixed sentiment**, with users expressing concern about market volatility and correction.
    - **Some optimism** regarding the long-term growth potential of the Indian economy.
- **Fear:**
    - **Concerns** regarding government policies, particularly tax burdens on the middle class.
    - **Unease** about the impact of global economic trends on the Indian market.
- **Uncertainty:**
    - **Uncertain** about the future trajectory of the market and the impact of government initiatives.

**Key Themes:**

- **Tax Revolt Discussion:**
    - Extensive discussion on the Nationwide Taxpayers' Revolt, highlighting concerns about excessive taxation.
    - Suggestions for alternative revenue generation measures and curbs on government spending.
- **Impact of Trump's Speech:**
    - Mixed reactions to Trump's speech, with some users expressing concern about tariffs on Indian exports.
    - Others highlighting the potential benefits of reduced compet