In [60]:
import pandas as pd
from datetime import datetime, timedelta

'''getting the reddit data'''

# Read CSV into DataFrame 
df = pd.read_csv('merged.csv')

# Convert timestamp from UTC to Central Time (UTC-6)
df['Converted_Timestamp'] = df['timestamp'].apply(lambda x: datetime.utcfromtimestamp(x) - timedelta(hours=6))

# Save DataFrame back to CSV
df.to_csv('converted_file.csv', index=False)



In [61]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,id,title,score,comms_num,body,timestamp,Converted_Timestamp
0,0,m5vgai,Don't use Twitter much and just saw this,1,0,,1615848418,2021-03-15 16:46:58
1,1,m5vfwf,I still like the stock.. 💎🖐,22,79,,1615848129,2021-03-15 16:42:09
2,2,m5vdrb,Translation: What magic did you do at first si...,1,20,,1615846781,2021-03-15 16:19:41
3,3,m5vagz,tendies when?,1,0,,1615844627,2021-03-15 15:43:47
4,4,m5va2r,When dips don’t scare you anymore and you’re j...,1,0,,1615844381,2021-03-15 15:39:41


In [62]:
'''Data Cleaning'''

# Sorting the Dataframe in ascending based on converted_timestamp
filtered_df = df.sort_values('Converted_Timestamp', ascending= True)
filtered_df.head(5)


Unnamed: 0.1,Unnamed: 0,id,title,score,comms_num,body,timestamp,Converted_Timestamp
57361,57361,j2vrt3,wheres the guy who was gonna post positions fo...,1,7,he said he had a lot of success rhen hust neve...,1601503412,2020-09-30 16:03:32
57360,57360,j2vsqb,Made 600 Euros in one day playing crash on App...,1,0,,1601503494,2020-09-30 16:04:54
57359,57359,j2vvjb,$WKHS was added to GSA. Interesting Development.,1,0,[removed],1601503747,2020-09-30 16:09:07
57358,57358,j2w07x,"Careful, 🌈🌈🌈🐻🐻 are in the area.",1,0,,1601504177,2020-09-30 16:16:17
57357,57357,j2w0et,All in on $SSSS,1,0,,1601504193,2020-09-30 16:16:33


In [63]:
'''Using NLP to add columns '''

# Data Processing
from tqdm import tqdm
tqdm.pandas()
import spacy


# Initialize spaCy
nlp = spacy.load("en_core_web_sm")
def detect_entities(text):
    doc = nlp(text)
    org_entities = [ent.text for ent in doc.ents if ent.label_ == 'ORG']
    return org_entities

# Applying NLP to detect Named Entitiy Classification for ORG in title
filtered_df['NER_ORG'] = filtered_df['title'].progress_apply(detect_entities)

print(filtered_df.head(10))

100%|██████████| 382774/382774 [24:55<00:00, 256.00it/s]


       Unnamed: 0      id                                              title  \
57361       57361  j2vrt3  wheres the guy who was gonna post positions fo...   
57360       57360  j2vsqb  Made 600 Euros in one day playing crash on App...   
57359       57359  j2vvjb   $WKHS was added to GSA. Interesting Development.   
57358       57358  j2w07x                    Careful, 🌈🌈🌈🐻🐻 are in the area.   
57357       57357  j2w0et                                    All in on $SSSS   
57356       57356  j2w3l1  Another up day tomorrow Democrat’s hold off on...   
57355       57355  j2w3ws       WSB only wants one thing and it’s disgusting   
57354       57354  j2w4r5                           TMR green day confirmed!   
57353       57353  j2w51y                        ELON MUSK GAY TESLA STOCKER   
57352       57352  j2w5go                       Buying long term puts on AAL   

       score  comms_num                                               body  \
57361      1          7  he said he had a

In [70]:
'''
Adding columns to determine AMC, TSLA, GME in NER_ORG column
'''

import pandas as pd

tesla_entities = ["tsla", "tesla", "TSLA"]
gme_entities = ["gme", "gamestop", "GME"]
amc_entities = ["amc", "AMC"]

def contains_specific_ticker(tickers, specific_ticker):
    if tickers is None or not isinstance(tickers, list):
        return False
    return any(ticker in specific_ticker for ticker in tickers)

# Add a new column for each ticker
filtered_df['TSLA'] = filtered_df['NER_ORG'].apply(lambda x: contains_specific_ticker(x, tesla_entities))
filtered_df['GME'] = filtered_df['NER_ORG'].apply(lambda x: contains_specific_ticker(x, gme_entities))
filtered_df['AMC'] = filtered_df['NER_ORG'].apply(lambda x: contains_specific_ticker(x, amc_entities))

# Display rows where at least one of the conditions is True
matching_rows = filtered_df[(filtered_df['TSLA'] == True) | (filtered_df['GME'] == True) | (filtered_df['AMC'] == True)]

# Save the matching rows to a CSV file
matching_rows.to_csv("matching_rows.csv", index=False)

# To display the DataFrame in the notebook
display(matching_rows)




Unnamed: 0.1,Unnamed: 0,id,title,score,comms_num,body,timestamp,Converted_Timestamp,NER_ORG,TSLA,GME,AMC
57250,57250,j30yoj,GME More information on Ryan Cohen,1,30,\n\nCredit for this information goes to Just...,1601522978,2020-09-30 21:29:38,[GME],False,True,False
57426,57426,j42447,What the fuck is going on with GME?,1,132,Can we get an actual discussion regarding what...,1601669797,2020-10-02 14:16:37,[GME],False,True,False
59261,59261,j5do1u,The Bullish Sentiment of GME,1,25,I've spent the past several days reading artic...,1601874636,2020-10-04 23:10:36,"[The Bullish Sentiment, GME]",False,True,False
58994,58994,j5pj8x,Im starting to doubt GME,1,24,Will gme ever break out? All i see is an endle...,1601924457,2020-10-05 13:00:57,[GME],False,True,False
60482,60482,j6sejq,"AMC, to good to be true?",1,4,Amc options are currently trading for break ev...,1602082831,2020-10-07 09:00:31,[AMC],False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...
380929,380929,n5qzm0,Real Price of GME is 99k on IEX -aka without h...,1,2,,1620249806,2021-05-05 15:23:26,"[GME, IEX]",False,True,False
380925,380925,n5r09n,$GME old CFO has resurfaced at Backcountry.,1,12,,1620249858,2021-05-05 15:24:18,"[GME, CFO, Backcountry]",False,True,False
380879,380879,n5rg70,Theory on GME,1,1,we win.,1620251044,2021-05-05 15:44:04,[GME],False,True,False
380863,380863,n5rmse,The real price of GME is 99k on IEX -aka witho...,1,3,,1620251548,2021-05-05 15:52:28,"[GME, IEX]",False,True,False


In [1]:
'''
Getting Yahoo Finance Data for daily stock price and trading volume for GME, TSLA and AMC
during 2020 - 2021. 
'''

import yfinance as yf
import pandas as pd

# List of tickers you're interested in
tickers = ['AMC', 'TSLA', 'GME']

# Initialize an empty DataFrame to store all data combined
combined_df = pd.DataFrame()

# Loop through each ticker and fetch the data
for ticker in tickers:
    stock_info = yf.Ticker(ticker)
    try:
        # Get historical data
        historical_data = stock_info.history(start="2020-01-01", end="2021-12-31")
        
        # Add a 'Ticker' column to the historical data
        historical_data['Ticker'] = ticker

        # Append to the combined DataFrame
        combined_df = combined_df.append(historical_data)
        
    except Exception as e:
        print(f"Could not fetch data for {ticker}: {e}")

# Save all data combined into a single CSV
combined_df.to_csv("combined_historical_data.csv")


  combined_df = combined_df.append(historical_data)
  combined_df = combined_df.append(historical_data)
  combined_df = combined_df.append(historical_data)


In [2]:
display(combined_df)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-01-02 00:00:00-05:00,72.955489,75.553902,72.255919,74.554512,454590,0.0,0.0,AMC
2020-01-03 00:00:00-05:00,74.454575,75.953661,73.055427,73.155365,221800,0.0,0.0,AMC
2020-01-06 00:00:00-05:00,72.455796,72.955491,71.056648,71.156586,290340,0.0,0.0,AMC
2020-01-07 00:00:00-05:00,71.056648,71.956101,69.957320,71.256531,320530,0.0,0.0,AMC
2020-01-08 00:00:00-05:00,71.056643,71.456401,66.059692,66.159630,655700,0.0,0.0,AMC
...,...,...,...,...,...,...,...,...
2021-12-23 00:00:00-05:00,38.500000,38.750000,36.505001,38.035000,4222000,0.0,0.0,GME
2021-12-27 00:00:00-05:00,38.000000,38.154999,35.000000,37.077499,6454400,0.0,0.0,GME
2021-12-28 00:00:00-05:00,36.875000,39.352501,36.602501,36.615002,5324400,0.0,0.0,GME
2021-12-29 00:00:00-05:00,36.962502,38.872501,35.535000,38.482498,8149600,0.0,0.0,GME
