In [240]:
pip install praw

Note: you may need to restart the kernel to use updated packages.


In [241]:
pip install yfinance

Note: you may need to restart the kernel to use updated packages.


In [242]:
import praw
import yfinance as yf
import pandas as pd
import numpy as np
import re
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import joblib
from datetime import datetime, timedelta, timezone

### Scraping Data from Reddit

In [244]:
def scrape_reddit(subreddit_name, limit=100):
    reddit = praw.Reddit(client_id="t6itdwpgPsWhvgV2t6rbLA",
                        client_secret = "KkvnosUw6WRaMC242BcqCki1-O2DjA",
                        user_agent="StockSentimentAnalysisScript/0.1 by PrizeCustard2561")
    subreddit = reddit.subreddit(subreddit_name)
    posts=[]
    for submission in subreddit.new(limit=limit):
        posts.append({
            "title": submission.title,
            "content": submission.selftext,
            "upvotes":submission.score,
            "comments":submission.num_comments,
            "created_utc":submission.created_utc
        })
    return pd.DataFrame(posts)

### Fetch stock Data

In [246]:
def fetch_stock_data(ticker, start_date, end_date):
    # Download stock data
    stock_data = yf.download(ticker, start=start_date, end=end_date)
    
    # Reset the index to flatten the MultiIndex
    stock_data.reset_index(inplace=True)
    stock_data.columns = stock_data.columns.get_level_values(0)  # Flatten MultiIndex

    # Add derived columns
    stock_data['Price_Change'] = stock_data['Close'].diff().shift(-1)  # Price change for next day
    stock_data['Movement'] = (stock_data['Price_Change'] > 0).astype(int)  # Binary movement
    
    # Ensure 'Date' column matches format
    stock_data['Date'] = pd.to_datetime(stock_data['Date']).dt.date
    
    # Select only relevant columns
    return stock_data[['Date', 'Close', 'Movement']]


### Align Reddit Posts with stock data

In [248]:

def align_data_with_stock(reddit_data, stock_data):
    reddit_data['Date'] = pd.to_datetime(reddit_data['created_utc'], unit='s').dt.date

    # Ensure column types match for merging
    stock_data['Date'] = pd.to_datetime(stock_data['Date']).dt.date
    reddit_data['Date'] = pd.to_datetime(reddit_data['Date']).dt.date

    print("Reddit Data Columns:", reddit_data.columns)
    print("Stock Data Columns:", stock_data.columns)

    # Merge on 'Date'
    merged_data = pd.merge(reddit_data, stock_data, on='Date', how='inner')
    return merged_data


### Clean and prepocess data

In [250]:

def preprocess_data_with_stock(df, ticker_list):
    df['content'] = df['content'].fillna("")  # Handle missing content
    df['text'] = df['title'] + " " + df['content']  # Combine title and content
    # df['text'] = df['text'].apply(clean_text)  # Clean text
    df['sentiment'] = df['text'].apply(lambda x: TextBlob(x).sentiment.polarity)  # Calculate sentiment
    df['stock_mentions'] = df['text'].apply(lambda x: len([ticker for ticker in ticker_list if ticker in x])) #Extract stock mentions
    df['hour'] = pd.to_datetime(df['created_utc'], unit='s').dt.hour
    # df['stock_movement'] = (df['sentiment'] > 0).astype(int)  # Mock target: Positive sentiment -> Increase (1)
    df['is_trading_hours'] = df['hour'].between(9, 16).astype(int)
    # Final DataFrame with features and real stock movement
    return df[['text', 'sentiment', 'upvotes', 'comments', 'stock_mentions', 'is_trading_hours', 'Movement']]

### Train Machine learning Model

In [252]:
def train_model(df):
    features = ['sentiment', 'upvotes', 'comments', 'stock_mentions', 'is_trading_hours']
    X=df[features] #feature: sentiment
    y=df['Movement'] # Target: Stock movement (0 or 1)

    # spliting in train and test sets
    X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    #Train Random Forest Classifier
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # make Predictions
    y_pred = model.predict(X_test)

    #Evaluate Model
    print("\nModel Evaluation Metrics:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    # Handle Undefined Metric Warning with zero_division=0
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)  
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print("\nClassification Report: ")
    print(classification_report(y_test, y_pred, zero_division=0))

    return model

### Save the Model

In [254]:
def save_model(model, filename="stock_movement_model_1.pkl"):
    joblib.dump(model,filename)

### Exection

In [278]:
if __name__== "__main__":
    # Define stock ticker list (example)
    ticker_list = ["AAPL", "TSLA", "AMZN", "GOOGL", "MSFT"]
    #Scrape data from Reddit
    print("Scraping data...")
    reddit_data = scrape_reddit("investing", limit=100)

    #Fetch stock data
    print("Fetching stock data...")
    start_date = (datetime.now(timezone.utc) - timedelta(days=30)).strftime('%Y-%m-%d')
    end_date = datetime.now(timezone.utc).strftime('%Y-%m-%d')
    stock_data = fetch_stock_data("TSLA", start_date, end_date)
    print(stock_data.head())
    print(stock_data.columns)

    #Align Reddit Data with stock data
    print('Aligning data...')
    aligned_data = align_data_with_stock(reddit_data, stock_data)
    
    # Check aligned data sample
    print("Aligned data sample:")
    print(aligned_data.head())

    # Preprocess data
    print("Preprocessing data...")
    processed_data = preprocess_data_with_stock(aligned_data, ticker_list)

    #Train the model
    print("Trainig model...")
    model = train_model(processed_data)

    #save the model
    print("Saving model...")
    save_model(model)

    print("Process completed successfully!")

Scraping data...


[*********************100%***********************]  1 of 1 completed

Fetching stock data...
Price        Date       Close  Movement
0      2024-11-06  288.529999         1
1      2024-11-07  296.910004         1
2      2024-11-08  321.220001         1
3      2024-11-11  350.000000         0
4      2024-11-12  328.489990         1
Index(['Date', 'Close', 'Movement'], dtype='object', name='Price')
Aligning data...
Reddit Data Columns: Index(['title', 'content', 'upvotes', 'comments', 'created_utc', 'Date'], dtype='object')
Stock Data Columns: Index(['Date', 'Close', 'Movement'], dtype='object', name='Price')
Aligned data sample:
                                               title  \
0  Looking for opinions regarding investment into...   
1  Does My Allocation Make Sense? Or Too Much Red...   
2  What is your thought on my portfolio and strat...   
3  Why is saving for retirement as a business own...   
4                    Financial adviser being a dick    

                                             content  upvotes  comments  \
0  With GTA6 on the wa





Model Evaluation Metrics:
Accuracy: 0.71
Precision: 0.00
Recall: 0.00
F1 Score: 0.00

Classification Report: 
              precision    recall  f1-score   support

           0       0.71      1.00      0.83        10
           1       0.00      0.00      0.00         4

    accuracy                           0.71        14
   macro avg       0.36      0.50      0.42        14
weighted avg       0.51      0.71      0.60        14

Saving model...
Process completed successfully!
