In [191]:
# import useful libraries
import tweepy
import pandas as pd
import re
import nltk
import requests
import praw
import pytz
import asyncio
import nest_asyncio
import yfinance as yf
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from collections import Counter
from telethon.sync import TelegramClient
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score

In [5]:
# Reddit API credentials
reddit = praw.Reddit(client_id='xxxxxxxxxxxx',
                     client_secret='xxxxxxxx',
                     user_agent='xxxxx')

In [123]:
## function to scrape posts from reddit
def scrape_reddit_data(subreddits, max_posts=10000, batch_size=100):
    posts = []
    for subreddit in subreddits:
        try:
            subreddit_obj = reddit.subreddit(subreddit)
            after = None  # Used for pagination
            total_fetched = 0
            while total_fetched < max_posts:
                batch_posts = subreddit_obj.new(limit=batch_size, params={"after": after})
                fetched = 0  # Count posts in this batch
                for submission in batch_posts:
                    posts.append({
                        'date': pd.to_datetime(submission.created_utc, unit='s'),
                        'title': submission.title,
                        'text': submission.selftext,
                        'subreddit': subreddit
                    })
                    after = submission.name  # Get the 'fullname' of the last submission
                    fetched += 1
                    total_fetched += 1
                    if total_fetched >= max_posts:
                        break
                if fetched == 0:  # Break if no new posts were fetched (end of subreddit)
                    break
        except Exception as e:
            print(f"Error with subreddit '{subreddit}': {e}")
    return pd.DataFrame(posts)


subreddits = ["stocks", "investing", "WallStreetBets", "finance", "personalfinance", "Apple"]
reddit_data = scrape_reddit_data(subreddits, max_posts=10000, batch_size=100)

In [124]:
reddit_data

Unnamed: 0,date,title,text,subreddit
0,2024-12-08 06:15:37,Any Top Software Names (ex Mag7)?,Looking for good software names in the AI trad...,stocks
1,2024-12-08 02:57:26,Best strategy for investing in SPY:,I decided to explore this by testing three dif...,stocks
2,2024-12-08 01:41:23,Help understanding JBT merger,I recently inherited a retirement brokerage ac...,stocks
3,2024-12-08 00:51:36,SMCI Recovery and loose DD,SMCI golden age and beautiful chart. \n\nAfter...,stocks
4,2024-12-07 23:54:31,Is it true most people lost money when they st...,"So I am 27, and I have a good job, make 200k i...",stocks
...,...,...,...,...
4347,2024-09-12 01:05:26,The Mouse iPhone 16 line up!,,Apple
4348,2024-09-11 23:45:42,iPhone 16 Pro offers up to 26% faster 5G downl...,,Apple
4349,2024-09-11 23:35:34,Apple's Polishing Cloth now supports new iPhon...,,Apple
4350,2024-09-11 22:22:49,AirPods 4 Have Hidden Capacitive Button for Pa...,,Apple


In [125]:
## convert reddit_data into a csv file
reddit_data.to_csv("reddit_file.csv", index=False)

In [126]:
# Apply nest_asyncio to allow asyncio.run()
nest_asyncio.apply()

In [127]:
# Telegram API credentials
api_id = 'xxxxx'
api_hash = 'xxxxxxxxx'

client = TelegramClient('My.session', api_id, api_hash)

In [128]:
# Define  timezone
tz = pytz.UTC

# Function to scrape data from a single channel
async def scrape_single_channel(client, channel, max_messages=None):
    messages = []
    count = 0

    async for message in client.iter_messages(channel, reverse=True):
        # Ensure the message has a date and text content
        if message and message.message:
            messages.append({
                'date': message.date.astimezone(tz),  # Convert to timezone-aware
                'text': message.message,
                'channel': channel
            })
            count += 1
            if max_messages and count >= max_messages:
                break  # Stop if max_messages limit is reached
    
    return messages

# Function to scrape data from multiple channels
async def scrape_telegram_data(channels, max_messages_per_channel=None):
    # Ensure client is connected
    async with TelegramClient('session_name', api_id, api_hash) as client:
        all_messages = []
        for channel in channels:
            try:
                print(f"Scraping channel: {channel}")
                channel_messages = await scrape_single_channel(client, channel, max_messages=max_messages_per_channel)
                all_messages.extend(channel_messages)
                await asyncio.sleep(1) 
                
            except Exception as e:
                print(f"Error scraping channel {channel}: {e}")
        return pd.DataFrame(all_messages)


In [130]:
## famous telegram channels to extract stocks data
channels = [
    '@Share_Trading_Tips_Stock_Market',
    '@Stock_market_free_tips_channel',
    '@iopkng',
    '@Stock_market_free_tipss_channel',
    '@StockMarket_ShareMarket_Channel'
]
max_messages_per_channel = 2000  # Limit messages per channel

# Run the coroutine
telegram_data = asyncio.run(scrape_telegram_data(channels, max_messages_per_channel))

Please enter your phone (or bot token):  +917599447440
Please enter the code you received:  89816


Signed in successfully as Suhail Malik; remember to not break the ToS or you will risk an account ban!
Scraping channel: @Share_Trading_Tips_Stock_Market
Scraping channel: @Stock_market_free_tips_channel
Scraping channel: @iopkng
Scraping channel: @Stock_market_free_tipss_channel
Scraping channel: @StockMarket_ShareMarket_Channel


In [131]:
## print telegram_data
telegram_data

Unnamed: 0,date,text,channel
0,2024-08-25 08:57:19+00:00,## A Beginner's Guide to the Stock Market\n\nT...,@Share_Trading_Tips_Stock_Market
1,2024-08-25 08:57:20+00:00,The Stock Market: A Deeper Dive\n\nWhile the b...,@Share_Trading_Tips_Stock_Market
2,2024-08-25 08:57:20+00:00,## A Beginner's Guide to the Stock Market\n\nT...,@Share_Trading_Tips_Stock_Market
3,2024-08-25 08:57:21+00:00,The Stock Market: A Deeper Dive\n\nWhile the b...,@Share_Trading_Tips_Stock_Market
4,2024-08-25 08:57:22+00:00,The Stock Market: A Deeper Dive\n\nWhile the b...,@Share_Trading_Tips_Stock_Market
...,...,...,...
3835,2024-07-31 09:28:21+00:00,Done for the day,@StockMarket_ShareMarket_Channel
3836,2024-08-03 10:38:53+00:00,Open intrest Kya He...!!,@StockMarket_ShareMarket_Channel
3837,2024-08-03 11:57:56+00:00,Dosto Monday Asian Pant Me najar Rakhe... Brea...,@StockMarket_ShareMarket_Channel
3838,2024-09-17 06:46:21+00:00,Long term ke le liya he,@StockMarket_ShareMarket_Channel


In [132]:
## conversion of telegram data into a csv file
telegram_data.to_csv("telegram_file.csv", index=False)

In [29]:
# Twitter API Bearer Token
BEARER_TOKEN = "XXXXXXXXXXXXXXXXXX"

def create_headers(token):
    """
    Create headers for API authentication.
    """
    return {"Authorization": f"Bearer {token}"}

def fetch_tweets(query, max_results):
    """
    Fetch recent tweets using the Basic Plan.
    """
    url = "https://api.twitter.com/2/tweets/search/recent"
    headers = create_headers(BEARER_TOKEN)
    params = {
        "query": query,            #  search term
        "max_results": max_results,  # Max tweets 
        "tweet.fields": "created_at,text,author_id"
    }

    response = requests.get(url, headers=headers, params=params)

    if response.status_code == 200:
        return response.json()
    elif response.status_code == 403:
        print("Access forbidden. Check your plan or endpoint permissions.")
    else:
        print(f"Error: {response.status_code}, {response.text}")
        return None
   

In [30]:
search_term = "stock market OR #stocks OR #investing"
tweets = fetch_tweets(search_term, max_results=50)
for tweet in tweets["data"]:
    print(f"{tweet['created_at']}: {tweet['text']}")

2024-12-06T10:18:38.000Z: RT @supplyshocks: Only 1/6 to 1/11 of $MSTR income comes from the direction of #Bitcoin.

The market as a whole has it all wrong (for now)…
2024-12-06T10:18:30.000Z: 🤣🤣🤣#Crypto #cryptocurrency #Blockchain #web3 #web3news #web3crypto #exchange #BTC📷📷📷#XRP #doge #sol #ETF #NFT #news #bitcoin📷📷📷📷#Tether #exchange #investingtips #tradervibe #trader #investing #cryptoworld https://t.co/SIQTgyJg0R
2024-12-06T10:18:25.000Z: Jobs Report Today: November Payrolls Data Due https://t.co/Zvu3y2iunS
2024-12-06T10:18:18.000Z: Ma Shaa Allah 👏🏻 Proud of You Nasir Bro my Member....Winners act, while thinkers take a thousand years to decide😁🌟👍🏻#earn #earnmoney #earnmoneyfromhome #earnit #earnings #earnfromhome #business #money #investment #workfromhome #investing #onlinebusiness #affiliatemarketing https://t.co/XpL3trEYn0
2024-12-06T10:18:15.000Z: RT @Maaachaaa69: CEO of RARE Ent. ( Rakesh Jhunjhunwala firm ) &amp; a veteran of Indian stock market who is able to tell us about ne

In [33]:
tweets = [{'date': item['created_at'], 'text': item['text']} for item in tweets['data']]
# Convert to DataFrame
twitter_data = pd.DataFrame(tweets)

In [137]:
twitter_data

Unnamed: 0,date,text,sentiment
0,2024-12-06,rt mstr income come direction market whole wrong,-0.15
1,2024-12-06,,0.0
2,2024-12-06,job report today november payroll data due,-0.125
3,2024-12-06,shaa allah proud nasir bro memberwinners act t...,0.8
4,2024-12-06,rt ceo rare ent rakesh jhunjhunwala firm amp v...,0.12
5,2024-12-06,rt stock market go time time far straight line...,0.183333
6,2024-12-06,rt spx need hurry flip stock market,0.0
7,2024-12-06,rt quote,0.0
8,2024-12-06,doge ev,0.0
9,2024-12-06,rt earn accumulate however register free grass...,0.45


In [34]:
twitter_data.to_csv("twitter_file.csv", index=False)

In [35]:
# Download stopwords
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\SUHAIL
[nltk_data]     MALIK\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\SUHAIL
[nltk_data]     MALIK\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\SUHAIL
[nltk_data]     MALIK\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [36]:
stop_words = set(stopwords.words('english'))
lemmatizer = nltk.WordNetLemmatizer()

In [50]:
## function for cleaning the text
def clean_text(text):
    """
    Preprocess and clean a single tweet.
    """
    
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    # Remove mentions and hashtags
    text = re.sub(r"@\w+|#\w+", '', text)
    # Remove special characters and numbers
    text = re.sub(r"[^A-Za-z\s]", '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize and remove stopwords
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize words
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Reconstruct cleaned text
    cleaned_text = ' '.join(tokens)
    
    return cleaned_text

In [51]:
## function for preprocessing the text
def preprocess_data(data):
    """
    Preprocess a list of tweets and return cleaned texts.
    """  
    data = data.loc[data['text'].notnull() & (data['text'] != '')]
    data = data.reset_index(drop=True)
      # Apply cleaning function row-wise
    data['text'] = data['text'].apply(clean_text)
     # Parse the date column to datetime
    data['date'] = pd.to_datetime(data['date'], errors='coerce')  # Handle invalid formats gracefully
    # Extract only the date part
    data['date'] = data['date'].dt.date 
    return data

In [138]:
## convert back the files into dataframe for further use 
telegram_data=pd.read_csv("telegram_file.csv")
twitter_data=pd.read_csv("twitter_file.csv")
reddit_data=pd.read_csv("reddit_file.csv")

In [139]:
reddit_data['text'] = reddit_data['title']+reddit_data['text']

In [140]:
reddit_data=reddit_data[["date","text"]]

In [141]:
telegram_data=telegram_data[["date","text"]]

In [142]:
# Apply cleaning to all datasets
telegram_data= preprocess_data(telegram_data)
twitter_data = preprocess_data(twitter_data)
reddit_data= preprocess_data(reddit_data)

In [146]:
## function to apply sentiments on data

def get_sentiment(text):
    return TextBlob(text).sentiment.polarity

# Add sentiment scores
twitter_data['sentiment'] = twitter_data['text'].apply(get_sentiment)
reddit_data['sentiment'] = reddit_data['text'].apply(get_sentiment)
telegram_data['sentiment'] = telegram_data['text'].apply(get_sentiment)


In [149]:
# Aggregate sentiment by date
twitter_agg = twitter_data.groupby(twitter_data['date'])['sentiment'].mean().reset_index()
reddit_agg = reddit_data.groupby(reddit_data['date'])['sentiment'].mean().reset_index()
telegram_agg = telegram_data.groupby(telegram_data['date'])['sentiment'].mean().reset_index()

In [150]:
# Ensure 'date' is the index for all DataFrames for alignment
twitter_agg.set_index('date', inplace=True)
reddit_agg.set_index('date', inplace=True)
telegram_agg.set_index('date', inplace=True)

In [151]:
# Perform a full outer join to combine all sentiment data
combined_sentiments = twitter_agg.add(reddit_agg, fill_value=0).add(telegram_agg, fill_value=0)

In [152]:
combined_sentiments = combined_sentiments.reset_index()

In [155]:
combined_sentiments['date'] = pd.to_datetime(combined_sentiments['date'], errors='coerce')

In [160]:
combined_sentiments

Unnamed: 0,date,sentiment
0,2021-11-13,0.125000
1,2021-11-14,0.138889
2,2021-11-15,0.064286
3,2021-11-26,0.233333
4,2021-11-30,0.000000
...,...,...
586,2024-12-04,0.355064
587,2024-12-05,0.490599
588,2024-12-06,0.190429
589,2024-12-07,0.097893


In [161]:
# Define multiple stock tickers
tickers = ["AAPL", "MSFT", "GOOGL", "AMZN", "META", "TSLA"]

# Fetch daily data for each ticker and concatenate
dataframes = []
for ticker in tickers:
    df = yf.download(ticker, start="2021-01-01", end="2024-12-01")
    df.reset_index(inplace=True)
    df['ticker'] = ticker  # Add a column for ticker
    dataframes.append(df)

# Combine all data
combined_data = pd.concat(dataframes, ignore_index=True)

# Merge based on dates
combined_data['date'] = pd.to_datetime(combined_data['Date']).dt.date


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [163]:
# Keep only relevant columns
combined_data = combined_data[['Date', 'ticker', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']]

# Handle missing data by pivoting
pivoted_data = combined_data.pivot_table(
    index='Date',
    columns='ticker',
    values=['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
)

# Flatten the MultiIndex columns
pivoted_data.columns = [f"{col[1]}_{col[0]}" for col in pivoted_data.columns]

# Reset the index for a clean DataFrame
pivoted_data.reset_index(inplace=True)

# Fill missing values
pivoted_data.fillna(method='ffill', inplace=True)  # Forward fill
pivoted_data.fillna(method='bfill', inplace=True)  # Backward fill

# For overlapping dates, calculate averages 
pivoted_data = pivoted_data.groupby('Date').mean().reset_index()

  pivoted_data = combined_data.pivot_table(
  pivoted_data.fillna(method='ffill', inplace=True)  # Forward fill
  pivoted_data.fillna(method='bfill', inplace=True)  # Backward fill


In [164]:
pivoted_data

Unnamed: 0,Date,AAPL_Adj Close,AMZN_Adj Close,GOOGL_Adj Close,META_Adj Close,MSFT_Adj Close,TSLA_Adj Close,AAPL_Close,AMZN_Close,GOOGL_Close,...,GOOGL_Open,META_Open,MSFT_Open,TSLA_Open,AAPL_Volume,AMZN_Volume,GOOGL_Volume,META_Volume,MSFT_Volume,TSLA_Volume
0,2021-01-04,126.544212,159.331497,86.093323,268.132690,210.423111,243.256668,129.410004,159.331497,86.306503,...,88.000000,274.779999,222.529999,239.820007,143301900.0,88228000.0,37324000.0,15106100.0,37130100.0,145914600.0
1,2021-01-05,128.108780,160.925507,86.787598,270.156586,210.626068,245.036667,131.009995,160.925507,87.002502,...,86.254501,268.290009,217.259995,241.220001,97664900.0,53110000.0,20360000.0,9871600.0,23823000.0,96735600.0
2,2021-01-06,123.796448,156.919006,85.931213,262.519592,205.164688,251.993332,126.599998,156.919006,86.143997,...,85.013000,262.000000,212.169998,252.830002,155088000.0,87896000.0,46588000.0,24354100.0,35930700.0,134100000.0
3,2021-01-07,128.020767,158.108002,88.497871,267.933289,211.003067,272.013336,130.919998,158.108002,88.717003,...,86.337997,265.899994,214.039993,259.209991,109578200.0,70290000.0,41936000.0,15789800.0,27694500.0,154496700.0
4,2021-01-08,129.125763,159.134995,89.669464,266.766815,212.288696,293.339996,132.050003,159.134995,89.891502,...,88.858002,268.309998,218.679993,285.333344,105158200.0,70754000.0,35484000.0,18528300.0,22956200.0,225166500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
979,2024-11-22,229.869995,197.119995,164.759995,559.140015,417.000000,352.559998,229.869995,197.119995,164.759995,...,165.850006,563.549988,411.369995,341.089996,38168300.0,31530800.0,38604600.0,9164000.0,24814600.0,89140700.0
980,2024-11-25,232.869995,201.449997,167.649994,565.109985,418.790009,338.589996,232.869995,201.449997,167.649994,...,166.089996,562.099976,418.380005,360.140015,90152800.0,40685700.0,33135300.0,13599800.0,27691100.0,95890900.0
981,2024-11-26,235.059998,207.860001,169.119995,573.539978,427.989990,338.230011,235.059998,207.860001,169.119995,...,167.630005,566.000000,419.589996,341.000000,45986200.0,41673700.0,20486700.0,10356600.0,23458900.0,62295900.0
982,2024-11-27,234.929993,205.740005,169.229996,569.200012,422.989990,332.890015,234.929993,205.740005,169.229996,...,169.000000,574.890015,425.109985,341.799988,33498400.0,28061600.0,19266500.0,7200200.0,18332400.0,57896400.0


In [167]:
combined_sentiments = combined_sentiments.rename(columns={'date': 'Date'})
# Merge the DataFrames on 'date'
merged_data = pivoted_data.merge(combined_sentiments, on='Date', how='inner')


In [168]:
merged_data

Unnamed: 0,Date,AAPL_Adj Close,AMZN_Adj Close,GOOGL_Adj Close,META_Adj Close,MSFT_Adj Close,TSLA_Adj Close,AAPL_Close,AMZN_Close,GOOGL_Close,...,META_Open,MSFT_Open,TSLA_Open,AAPL_Volume,AMZN_Volume,GOOGL_Volume,META_Volume,MSFT_Volume,TSLA_Volume,sentiment
0,2021-11-15,147.582474,177.283997,148.085312,346.516693,326.976105,337.796661,150.000000,177.283997,148.451996,...,344.339996,337.540009,339.209991,59222800.0,58594000.0,23508000.0,25076600.0,16723000.0,104326800.0,0.064286
1,2021-11-26,154.282730,175.227997,141.831802,332.120026,321.345825,360.640015,156.809998,175.227997,142.182999,...,335.799988,334.350006,366.489990,76959800.0,59826000.0,30470000.0,14750700.0,24217200.0,35042700.0,0.233333
2,2021-11-30,162.635895,175.353500,141.547012,323.486023,322.232849,381.586670,165.300003,175.353500,141.897507,...,335.000000,335.320007,381.456665,174048100.0,80022000.0,42068000.0,25390000.0,42885600.0,81276000.0,0.000000
3,2021-12-01,162.114471,172.186005,140.703094,309.667664,321.735748,365.000000,164.770004,172.186005,141.051498,...,330.290009,335.130005,386.899994,152052500.0,74916000.0,34020000.0,30329600.0,33337600.0,68450400.0,0.500000
4,2021-12-03,159.231659,169.489502,141.650757,305.918915,314.844543,338.323334,161.839996,169.489502,142.001495,...,313.730011,331.989990,361.596680,118023100.0,80712000.0,41230000.0,27471000.0,41779300.0,92322000.0,0.200000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
414,2024-11-22,229.869995,197.119995,164.759995,559.140015,417.000000,352.559998,229.869995,197.119995,164.759995,...,563.549988,411.369995,341.089996,38168300.0,31530800.0,38604600.0,9164000.0,24814600.0,89140700.0,0.094088
415,2024-11-25,232.869995,201.449997,167.649994,565.109985,418.790009,338.589996,232.869995,201.449997,167.649994,...,562.099976,418.380005,360.140015,90152800.0,40685700.0,33135300.0,13599800.0,27691100.0,95890900.0,0.093677
416,2024-11-26,235.059998,207.860001,169.119995,573.539978,427.989990,338.230011,235.059998,207.860001,169.119995,...,566.000000,419.589996,341.000000,45986200.0,41673700.0,20486700.0,10356600.0,23458900.0,62295900.0,0.502253
417,2024-11-27,234.929993,205.740005,169.229996,569.200012,422.989990,332.890015,234.929993,205.740005,169.229996,...,574.890015,425.109985,341.799988,33498400.0,28061600.0,19266500.0,7200200.0,18332400.0,57896400.0,0.084842


In [169]:
merged_data.columns

Index(['Date', 'AAPL_Adj Close', 'AMZN_Adj Close', 'GOOGL_Adj Close',
       'META_Adj Close', 'MSFT_Adj Close', 'TSLA_Adj Close', 'AAPL_Close',
       'AMZN_Close', 'GOOGL_Close', 'META_Close', 'MSFT_Close', 'TSLA_Close',
       'AAPL_High', 'AMZN_High', 'GOOGL_High', 'META_High', 'MSFT_High',
       'TSLA_High', 'AAPL_Low', 'AMZN_Low', 'GOOGL_Low', 'META_Low',
       'MSFT_Low', 'TSLA_Low', 'AAPL_Open', 'AMZN_Open', 'GOOGL_Open',
       'META_Open', 'MSFT_Open', 'TSLA_Open', 'AAPL_Volume', 'AMZN_Volume',
       'GOOGL_Volume', 'META_Volume', 'MSFT_Volume', 'TSLA_Volume',
       'sentiment'],
      dtype='object')

In [171]:
##  Average price movement as a target
merged_data['price_movement_mean'] = merged_data[[f'{ticker}_Close' for ticker in tickers]].mean(axis=1) - \
                                     merged_data[[f'{ticker}_Open' for ticker in tickers]].mean(axis=1)

# Conversion to binary classification (1 for uptrend, 0 for downtrend)
merged_data['price_movement'] = (merged_data['price_movement_mean'] > 0).astype(int)


In [172]:
merged_data['price_change_mean'] = merged_data[[f'{ticker}_Close' for ticker in tickers]].mean(axis=1) - \
                                   merged_data[[f'{ticker}_Open' for ticker in tickers]].mean(axis=1)


In [173]:
merged_data['volume_mean'] = merged_data[[f'{ticker}_Volume' for ticker in tickers]].mean(axis=1)

In [174]:
merged_data['price_change_max'] = merged_data[[f'{ticker}_Close' for ticker in tickers]].max(axis=1) - \
                                   merged_data[[f'{ticker}_Open' for ticker in tickers]].max(axis=1)

In [178]:
merged_data=merged_data[['price_change_mean', 'volume_mean', 'price_change_max', 'sentiment','price_movement']]

In [179]:
merged_data

Unnamed: 0,price_change_mean,volume_mean,price_change_max,sentiment,price_movement
0,-0.007973,4.790853e+07,3.220001,0.064286,0
1,-3.833748,4.021107e+07,-5.849976,0.233333,0
2,-2.627085,7.428162e+07,0.130005,0.000000,0
3,-9.560417,6.551768e+07,-21.899994,0.500000,0
4,-7.710559,6.692290e+07,-23.273346,0.200000,0
...,...,...,...,...,...
414,2.046669,3.857050e+07,-4.409973,0.094088,1
415,-2.165003,5.019260e+07,3.010010,0.093677,0
416,3.724996,3.404300e+07,7.539978,0.502253,1
417,-2.878329,2.737592e+07,-5.690002,0.084842,0


In [175]:
# Define features and target
features = ['price_change_mean', 'volume_mean', 'price_change_max', 'sentiment']
X = merged_data[features]
y = merged_data['price_movement']

# Train-test split (time-based)
train_size = int(0.8 * len(merged_data))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

In [182]:
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define the suitable model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Apply K-Folds cross-validation
cv_scores = cross_val_score(model, X, y, cv=5)  # 5-fold cross-validation

# Print cross-validation scores
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

Cross-validation scores: [1.         1.         0.98809524 1.         1.        ]
Mean cross-validation score: 0.9976190476190476


In [187]:
# Define the model
model = RandomForestClassifier(random_state=42)

# Set hyperparameter grid
param_grid = {
    'n_estimators': [50],
    'max_depth': [None],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'max_features': ['sqrt']
}

# Grid search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_search.fit(X, y)

# Print best parameters and score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")

Best parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best cross-validation score: 0.9976190476190476


In [189]:
## fitting the training  data into model
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        48
           1       1.00      1.00      1.00        36

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84



In [192]:
## print accuracy of model
accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy score
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 1.0000
