# Creating and updating the daily data CSV s

In [56]:
import yfinance as yf
import pandas as pd
import datetime
import os
import schedule
import time

def fetch_daily_data(ticker, start_date, end_date):
    df = yf.download(ticker, start=start_date, end=end_date, interval='1d')
    return df

def update_daily_data_for_ticker(ticker):
    # Determine the CSV file name
    csv_file_name = ticker + '_Daily_Data.csv'

    if not os.path.exists(csv_file_name):
        # Fetch data from '2013-01-01' as initial data
        end_date = datetime.datetime.now().date()
        start_date = '2013-01-01'
        daily_data = fetch_daily_data(ticker, start_date, end_date)
        
        # Reset index and extract date as a separate column
        daily_data.reset_index(inplace=True)
        daily_data['Date'] = daily_data['Date'].dt.date
        
        # Save the DataFrame to a CSV file if the file doesn't exist
        daily_data.to_csv(csv_file_name, index=False)
        print(f"{len(daily_data)} new records added to a new {csv_file_name} file.")
    else:
        # Load existing data from CSV
        existing_df = pd.read_csv(csv_file_name)
        
        # Get the last date from the existing data
        latest_date = pd.to_datetime(existing_df['Date']).max()
        
        # Convert the latest_date to a Pandas Timestamp
        latest_date = pd.Timestamp(latest_date)
        
        # Define date range for fetching new data
        end_date = datetime.datetime.now().date()
        start_date = (latest_date + datetime.timedelta(days=1)).date()
        
        if start_date < end_date:
            # Fetch daily data
            daily_data = fetch_daily_data(ticker, start_date, end_date)
            
            # Reset index and extract date as a separate column
            daily_data.reset_index(inplace=True)
            daily_data['Date'] = daily_data['Date'].dt.date
            
            # Append new data to the CSV file
            daily_data.to_csv(csv_file_name, mode='a', index=False, header=False)
            print(f"{len(daily_data)} new records added to the {csv_file_name} file.")
        else:
            print(f"No new records to add for {ticker}.")

# List of stock symbols
symbols = ["AAPL", "ORCL", "MSFT"]  # Add more symbols as needed

# Function to perform the initial data update for all symbols
def initial_data_update():
    for symbol in symbols:
        update_daily_data_for_ticker(symbol)

# Schedule the automatic update daily
for symbol in symbols:
    schedule.every().day.at("00:10").do(update_daily_data_for_ticker, symbol)

# Perform the initial data update
initial_data_update()

# Run the scheduled tasks
while True:
    schedule.run_pending()
    time.sleep(1)  # Sleep for 24 hours

No new records to add for AAPL.
No new records to add for ORCL.
No new records to add for MSFT.


AttributeError: 'datetime.datetime' object has no attribute 'datetime'

In [22]:
import pandas as pd

ticker = "MSFT"
csv_file = f"{ticker}_Daily_Data.csv"
df = pd.read_csv(csv_file)

# # Number of rows to delete
# num_rows_to_delete = 3

# # Delete the last N rows
# df = df.iloc[:-num_rows_to_delete]

# # Save the modified DataFrame back to the CSV file
# df.to_csv(csv_file, index=False)

# print("Last", num_rows_to_delete, "rows deleted from the "+ csv_file +" CSV.")

df.tail(10)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
2674,2023-08-17,320.540009,321.869995,316.209991,316.880005,316.880005,21257200
2675,2023-08-18,314.48999,318.380005,311.549988,316.480011,316.480011,24744800
2676,2023-08-21,317.929993,322.769989,317.040009,321.880005,321.880005,24040000
2677,2023-08-22,325.5,326.079987,321.459991,322.459991,322.459991,16102000
2678,2023-08-23,323.820007,329.200012,323.459991,327.0,327.0,21166400
2679,2023-08-24,332.850006,332.980011,319.959991,319.970001,319.970001,23281400
2680,2023-08-25,321.470001,325.359985,318.799988,322.980011,322.980011,21671400
2681,2023-08-28,325.660004,326.149994,321.721985,323.820007,323.820007,10588855
2682,2023-08-29,321.880005,328.98349,321.940002,327.5,327.5,10393681
2683,2023-08-30,328.670013,329.809998,326.450012,328.790009,328.790009,15205500


# Adding a price change column to the dataframe 

In [289]:
import pandas as pd

ticker = "ORCL"
df_price = pd.read_csv(f"{ticker}_Daily_Data.csv")

# Calculate the price change compared to the previous date
df_price['Price Change'] = df_price['Close'].diff()

# Add a new column to indicate if the price increased (1) or decreased (0)
df_price['Price Increase'] = (df_price['Price Change'] > 0).astype(int)

df_price.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Price Change,Price Increase
2678,2023-08-23,117.099998,120.449997,112.110001,117.839996,117.839996,16517300,1.299995,1
2679,2023-08-24,119.68,119.730003,112.410004,112.910004,112.910004,11995900,-4.929993,0
2680,2023-08-25,112.910004,116.150002,112.739998,116.059998,116.059998,8757400,3.149994,1
2681,2023-08-28,116.370003,117.709999,115.699997,116.860001,116.860001,3922549,0.800003,1
2682,2023-08-29,120.949997,121.07,118.529999,120.599998,120.599998,8105576,3.739998,1


# Reading News data file

In [290]:
pd.read_csv(f"{ticker}_News_Content.csv").tail()

Unnamed: 0,title,time_published,summary,overall_sentiment_score,overall_sentiment_label,ticker,relevance_score,ticker_sentiment_score,ticker_sentiment_label
581,Oracle ( ORCL ) Stock Moves -0.04%: What You...,20230822T214515,Oracle (ORCL) closed at $116.54 in the latest ...,0.175957,Somewhat-Bullish,ORCL,0.631282,0.308694,Somewhat-Bullish
582,How Is The Market Feeling About Oracle? - Orac...,20230825T143018,Oracle's ORCL short percent of float has falle...,0.241618,Somewhat-Bullish,ORCL,0.224903,0.085754,Neutral
583,Open Source Services Market is Anticipated to ...,20230828T120936,"New York, USA, Aug. 28, 2023 ( GLOBE NEWSWIRE ...",0.218068,Somewhat-Bullish,ORCL,0.033609,0.152613,Somewhat-Bullish
584,Oracle ( ORCL ) Outpaces Stock Market Gains:...,20230828T214512,Oracle (ORCL) closed at $116.84 in the latest ...,0.233317,Somewhat-Bullish,ORCL,0.617188,0.375819,Bullish
585,Why Oracle Stock Trounced the Market Today,20230829T211946,"The veteran tech company's stock is now a buy,...",0.324043,Somewhat-Bullish,ORCL,0.722944,0.586185,Bullish


# Appending same day news data together

In [28]:
df = pd.read_csv(f"{ticker}_News_Content.csv")

# Filter rows with relevance_score > 0.5
df = df[df['relevance_score'] > 0.5]

# Extract date and time components
df['Date'] = pd.to_datetime(df['time_published'], format='%Y%m%dT%H%M%S').dt.date
df['Time'] = pd.to_datetime(df['time_published'], format='%Y%m%dT%H%M%S').dt.time

# Convert date to datetime format
df['Date'] = pd.to_datetime(df['Date'])

# Group by date and concatenate title and summary
df_news = df.groupby('Date').agg({'title': ' '.join, 'summary': ' '.join}).reset_index()

df_news.tail()

Unnamed: 0,Date,title,summary
441,2023-08-27,A Bull Market Is Coming: 2 Artificial Intellig...,Wall Street billionaire Ken Griffin is exceedi...
442,2023-08-28,"Not possible to slow tech pace, answer is in s...",AI - which will increasingly have a positive i...
443,2023-08-29,Microsoft Memo Leak Reveals Guidance For Manag...,Microsoft Corp. MSFT has made significant chan...
444,2023-08-30,Microsoft ( MSFT ) Gains But Lags Market: Wh...,"In the latest trading session, Microsoft (MSFT..."
445,2023-08-31,Microsoft to unbundle Teams software in Europe...,European Union regulators had in July opened a...


# Appending a price change column

In [45]:
df_price_change = pd.read_csv(f"{ticker}_Daily_Data.csv")

# Calculate the price change compared to the previous date
df_price_change['Price Change'] = df_price_change['Close'].diff()

# Add a new column to indicate if the price increased (1) or decreased (0)
df_price_change['Price Increase'] = (df_price_change['Price Change'] > 0).astype(int)

# Convert 'Date' column to datetime type in both dataframes
df_price_change['Date'] = pd.to_datetime(df_price_change['Date'])
df_news['Date'] = pd.to_datetime(df_news['Date'])

# Perform a left outer join based on the 'Date' column
merged_df = pd.merge(df_price_change, df_news, on='Date')

# Display the merged dataframe
merged_df.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Price Change,Price Increase,title,summary
355,2023-08-24,332.850006,332.980011,319.959991,319.970001,319.970001,23281400,-7.029999,0,Verizon Business Wins Best Microsoft Teams Sol...,Verizon Mobile for Microsoft Teams seen as bes...
356,2023-08-25,321.470001,325.359985,318.799988,322.980011,322.980011,21671400,3.01001,1,China-based 'Flax Typhoon' Hackers Targeting T...,China-based 'Flax Typhoon' Hackers Targeting T...
357,2023-08-28,325.660004,326.149994,321.721985,323.820007,323.820007,10588855,0.839996,1,"Not possible to slow tech pace, answer is in s...",AI - which will increasingly have a positive i...
358,2023-08-29,321.880005,328.98349,321.940002,327.5,327.5,10393681,3.679993,1,Microsoft Memo Leak Reveals Guidance For Manag...,Microsoft Corp. MSFT has made significant chan...
359,2023-08-30,328.670013,329.809998,326.450012,328.790009,328.790009,15205500,1.290009,1,Microsoft ( MSFT ) Gains But Lags Market: Wh...,"In the latest trading session, Microsoft (MSFT..."


In [46]:
# Shift the column by one row
shifted_column = merged_df['Price Increase'].shift(-1)

# Fill NaN values with 0
shifted_column = shifted_column.fillna(0)

# Convert to integers
shifted_column = shifted_column.astype(int)

# Add the shifted column back to the DataFrame
merged_df['shifted_Price_Increase'] = shifted_column

merged_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Price Change,Price Increase,title,summary,shifted_Price_Increase
0,2022-03-07,288.529999,289.690002,278.529999,278.910004,275.015656,43157200,-10.949982,0,Microsoft announces setting up of its fourth d...,"Situated in Hyderabad, this will be the Micros...",0
1,2022-03-08,277.799988,283.959991,270.0,275.850006,271.998413,48159500,-3.059998,0,Google to buy cybersecurity firm Mandiant for ...,Google's offer of $23 per share is at a premiu...,1
2,2022-03-09,283.440002,289.600006,280.779999,288.5,284.471741,35204500,12.649994,1,Ralliton the First Company in Cyprus to Receiv...,"Ralliton, the award-winning managed cloud serv...",0
3,2022-03-10,283.019989,286.600006,280.579987,285.589996,281.602356,30628000,-2.910004,0,AvePoint Unveils Entrust to Help Organizations...,"JERSEY CITY, N.J., March 10, 2022 ( GLOBE N...",0
4,2022-03-11,287.959991,289.51001,279.429993,280.070007,276.159454,27209300,-5.519989,0,Artificial Intelligence in Supply Chain Market...,The market is driven by factors such as the ri...,0


# Sentiment analysis of collected news data

In [54]:
train_size = int(len(merged_df) * 0.8)
train, test = merged_df[0:train_size], merged_df[train_size:]

train.tail(10)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Price Change,Price Increase,title,summary,shifted_Price_Increase
278,2023-04-26,296.700012,299.570007,292.730011,295.369995,294.103027,64599200,19.949982,1,"Microsoft reports 9% boost in profits, revenue...",Microsoft on Tuesday reported a 9% increase in...,1
279,2023-04-27,295.970001,305.200012,295.25,304.829987,303.52243,46462600,9.459991,1,Ray Dalio Perplexed By Bitcoin's Popularity Vs...,Billionaire investor Ray Dalio has spoken out ...,1
280,2023-04-28,304.01001,308.929993,303.309998,307.26001,305.942047,36446700,2.430023,1,Microsoft is betting its future on cloud gamin...,U.K. regulators fears that Microsoft could in ...,0
281,2023-05-01,306.970001,308.600006,305.149994,305.559998,304.249329,21294100,-1.700012,0,Microsoft Corporation ( MSFT ) is Attracting...,Microsoft (MSFT) has received quite a bit of a...,0
282,2023-05-02,307.76001,309.179993,303.910004,305.410004,304.099945,26404400,-0.149994,0,Apple Vs Microsoft: Safari Overtakes Edge As M...,Apple Inc.'s AAPL Safari overtook Microsoft Co...,0
283,2023-05-03,306.619995,308.609985,304.089996,304.399994,303.094299,22360800,-1.01001,0,"CEOs of Microsoft, Alphabet called to AI meeti...",Vice President Kamala Harris will reportedly h...,1
284,2023-05-04,306.23999,307.76001,303.399994,305.410004,304.099945,22519900,1.01001,1,"AI-powered Bing ditches waitlist, adds images ...",A screenshot of the AI-powered Bing delivering...,1
285,2023-05-05,305.720001,311.970001,304.269989,310.649994,309.317505,28181200,5.23999,1,Check Out What Whales Are Doing With MSFT - Mi...,A whale with a lot of money to spend has taken...,0
286,2023-05-08,310.130005,310.200012,306.089996,308.649994,307.32605,21318600,-2.0,0,Quisitive Sets First Quarter 2023 Earnings Cal...,"TORONTO, May 08, 2023 ( GLOBE NEWSWIRE ) -- Qu...",0
287,2023-05-09,308.0,310.040009,306.309998,307.0,305.683136,21340800,-1.649994,0,Google Cloud makes AI 'land grab' with wave of...,Alphabet Inc.'s Google Cloud on Tuesday made i...,1


In [48]:
#preparing the train dataset

#Removing punctuations of title and summary columns
df_train = train.iloc[:, :11]
df_train = df_train.replace("[^a-zA-Z]", " ", regex=True)

# Convert 'title' column to lowercase
df_train['title'] = df_train['title'].str.lower()
# Convert 'summary' column to lowercase
df_train['summary'] = df_train['summary'].str.lower()

train_summary_list = df_train['summary'].tolist()

print(train_summary_list[0])

situated in hyderabad  this will be the microsoft s largest data centre in the country shares of mandiant  a      billion cybersecurity firm  closed up     after a report said google is interested in acquiring the company  google    nasdaq  googl    is in talks to purchase cybersecurity behemoth mandiant    nasdaq  mndt     the information reported monday  the cybersecurity stock took off late in the regular trading session after the report was published  and mandiant ended the trading day        higher at          th


In [49]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

#implementing bag-of-words
countVector = CountVectorizer(ngram_range=(3,3))
trainDataset = countVector.fit_transform(train_summary_list)

In [50]:
#implementing random forest classifier
randomClassifier = RandomForestClassifier(n_estimators=200, criterion='entropy')
randomClassifier.fit(trainDataset, train['shifted_Price_Increase'])

RandomForestClassifier(criterion='entropy', n_estimators=200)

In [51]:
#predict for the test dataset

#Removing punctuations of title and summary columns
df_test = test.iloc[:, :11]
df_test = df_test.replace("[^a-zA-Z]", " ", regex=True)

# Convert 'title' column to lowercase
df_test['title'] = df_test['title'].str.lower()
# Convert 'summary' column to lowercase
df_test['summary'] = df_test['summary'].str.lower()

test_summary_list = df_test['summary'].tolist()

print(test_summary_list[0])

twitter gets new dm feature  voice and video chat to be added soon     business standard     the deal signifies a further bid by microsoft to ramp up its efforts in ai  which has become a key focus for the company  indian company helps clients make apps without any coding experience     in a deal believed to be the first commercial agreement for fusion power  the tech giant has agreed to purchase electricity from helion energy a startup backed by openai founder sam altman within about five years  microsoft corp msft shares are trading higher wednesday after it agreed to purchase electricity from startup helion energy in       marking the first commercial deal for fusion power  constellation energy corp ceg will be the power marketer and manage project transmission  microsoft is quickly becoming the best stock to invest in for exposure to artificial intelligence  now that it s in cost saving mode  microsoft is foregoing salary increases this year after giving raises to some employees in

In [52]:
testDataset = countVector.transform(test_summary_list)
predictions = randomClassifier.predict(testDataset)
print(predictions)

[0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [53]:
#import library to check the accuracy
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

matrix = confusion_matrix(test['shifted_Price_Increase'], predictions)
print(matrix)
score = accuracy_score(test['shifted_Price_Increase'], predictions)
print(score)
report = classification_report(test['shifted_Price_Increase'], predictions)
print(report)

[[31  0]
 [40  1]]
0.4444444444444444
              precision    recall  f1-score   support

           0       0.44      1.00      0.61        31
           1       1.00      0.02      0.05        41

    accuracy                           0.44        72
   macro avg       0.72      0.51      0.33        72
weighted avg       0.76      0.44      0.29        72



# Creating a new dataframe with prices and news data concatenated

In [119]:
# import pandas as pd

# # Assuming df1 and df2 are your two dataframes
# # You can replace these with your actual dataframes
# df1 = df_price
# df2 = df_news

# # Convert 'Date' column to datetime type in both dataframes
# df_price['Date'] = pd.to_datetime(df_price['Date'])
# df_news['Date'] = pd.to_datetime(df_news['Date'])

# # Perform a left outer join based on the 'Date' column
# merged_df = pd.merge(df_price, df_news, on='Date', how='left')

# # Display the merged dataframe
# merged_df.tail()