# Set Up


In [84]:
#Refresh the Python runtime (restart the kernel)
#get_ipython().kernel.do_shutdown(restart=True)

#Check GPUs
##!nvidia-smi

In [85]:
# Install import-ipynb (lets one notebook import another) if not already installed
# !pip install import-ipynb
# import import_ipynb

#load cudf to use GPUs for analysis
%load_ext cudf.pandas

# import pandas
import pandas as pd

The cudf.pandas extension is already loaded. To reload it, use:
  %reload_ext cudf.pandas


# Get Stock Data

## Download Stock Data

In [86]:
#Download the stock data
##!if [ ! -f "usa_stocks_30m.parquet" ]; then curl https://storage.googleapis.com/rapidsai/colab-data/usa_stocks_30m.parquet -o usa_stocks_30m.parquet; else echo "usa_stocks_30m.parquet found"; fi

In [87]:
#move the stock data to my Drive
##!mv usa_stocks_30m.parquet "/content/drive/MyDrive/Colab Notebooks"

In [88]:
# # Add my drive to the sys.path
# # Replace 'My Drive/Colab Notebooks' with the actual path to the directory
# # within your Google Drive where Correlation_Coefficient.ipynb is located.
# import sys
# sys.path.append('/content/drive/My Drive/Colab Notebooks')

# # Verify that the directory is in sys.path
# print("sys.path after appending:")
# print(sys.path)

# # List the contents of the directory to check if the file exists
# print("\nContents of the directory:")
# !ls "/content/drive/My Drive/Colab Notebooks"

# # Check if the notebook is recognized as a module
# print("\nChecking for Correlation_Coefficient in sys.modules:")
# print('Correlation_Coefficient' in sys.modules)

## Pull data from My Drive

In [89]:
# Load the 30-minute stock bars into `nasdaq_stocks`.
# The parquet file is read from an absolute Google Drive path.
# NOTE(review): assumes Drive is already mounted at /content/drive; no mount
# step is visible in this notebook -- confirm before a fresh "Run all".
stock_parquet_path = "/content/drive/MyDrive/Colab Notebooks/usa_stocks_30m.parquet"
nasdaq_stocks = pd.read_parquet(stock_parquet_path)

In [90]:
#Keep only the first 18M rows because of RAM limits on the
#free tier of Colab
#df = df.iloc[:18000000]

## Add columns and filter the last 5 years

In [91]:
# Add calendar year, month and day-of-month columns derived from `datetime`.
# These become the grouping keys for the daily aggregation further down.
# ISO-calendar alternative (year / week / day), kept for reference:
#df[["year", "week", "day"]] = df.datetime.dt.isocalendar()
for date_part in ("year", "month", "day"):
    # Same as nasdaq_stocks.datetime.dt.<date_part>; note this adds the
    # columns to nasdaq_stocks in place.
    nasdaq_stocks[date_part] = getattr(nasdaq_stocks["datetime"].dt, date_part)

In [92]:
# Keep only rows whose 'year' is 2020 or later.
# The cutoff is a fixed literal, so "last 5 years" is relative to when the
# notebook was written rather than to the current date.
is_2020_or_later = nasdaq_stocks["year"] >= 2020
nasdaq_stocks_last_5y = nasdaq_stocks.loc[is_2020_or_later]

# Summarise the filtered frame (row count, dtypes, memory usage).
nasdaq_stocks_last_5y.info()

<class 'cudf.core.dataframe.DataFrame'>
Index: 7212849 entries, 65555 to 36919
Data columns (total 10 columns):
 #   Column    Dtype
---  ------    -----
 0   datetime  datetime64[ms]
 1   open      float64
 2   high      float64
 3   low       float64
 4   close     float64
 5   volume    int64
 6   ticker    object
 7   year      int16
 8   month     int16
 9   day       int16
dtypes: datetime64[ms](1), float64(4), int16(3), int64(1), object(1)
memory usage: 475.8+ MB


In [93]:
nasdaq_stocks_last_5y.head()

Unnamed: 0,datetime,open,high,low,close,volume,ticker,year,month,day
65555,2020-01-02 16:30:00,85.9,86.35,85.2,85.99,163797,A,2020,1,2
65556,2020-01-02 17:00:00,86.01,86.01,85.36,85.54,49377,A,2020,1,2
65557,2020-01-02 17:30:00,85.54,85.54,85.31,85.44,46574,A,2020,1,2
65558,2020-01-02 18:00:00,85.43,85.61,85.39,85.45,47512,A,2020,1,2
65559,2020-01-02 18:30:00,85.47,85.47,85.24,85.43,58244,A,2020,1,2


## Filter for daily close values

In [94]:
# Collapse the intraday bars to one closing price per ticker per calendar day.
# Grouping is by (ticker, year, month, day); the value kept is the "last"
# close in each group.
# NOTE(review): "last" follows the current row order, so this presumably relies
# on the bars being chronological within each ticker -- confirm.
daily_group_keys = ["ticker", "year", "month", "day"]
nasdaq_stocks_last_5y_aggregated_close = (
    nasdaq_stocks_last_5y
    .groupby(daily_group_keys)
    .agg({"close": "last"})
)

nasdaq_stocks_last_5y_aggregated_close.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,close
ticker,year,month,day,Unnamed: 4_level_1
A,2020,1,2,85.91
A,2020,1,3,84.64
A,2020,1,6,84.8
A,2020,1,7,85.09
A,2020,1,8,85.9


# Pull GOOG and MSFT

In [95]:
# Select the daily-close rows for GOOG and for MSFT.
# 'ticker' is a level of the MultiIndex (not a column), so its values are read
# once from the index and compared against each symbol to build a row mask.
ticker_of_row = nasdaq_stocks_last_5y_aggregated_close.index.get_level_values("ticker")
GOOGDailyClose = nasdaq_stocks_last_5y_aggregated_close.loc[ticker_of_row == "GOOG"]
MSFTDailyClose = nasdaq_stocks_last_5y_aggregated_close.loc[ticker_of_row == "MSFT"]

# Summarise the GOOG slice (index range, row count, dtype).
GOOGDailyClose.info()

<class 'cudf.core.dataframe.DataFrame'>
MultiIndex: 1104 entries, ('GOOG', np.int16(2020), np.int16(1), np.int16(2)) to ('GOOG', np.int16(2024), np.int16(3), np.int16(13))
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   close   1104 non-null   float64
dtypes: float64(1)
memory usage: 62.0+ KB


In [96]:
GOOGDailyClose.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,close
ticker,year,month,day,Unnamed: 4_level_1
GOOG,2020,1,2,68.27
GOOG,2020,1,3,68.05
GOOG,2020,1,6,69.7
GOOG,2020,1,7,69.71
GOOG,2020,1,8,70.19


# Correlation_Coefficient Function

In [97]:
# Correlation_Coefficient Function
# Using the Pearson correlation coefficient to determine correlation between two stocks

def Correlation_Coefficient(StockA, StockB):
    """Return the Pearson correlation between two stocks' closing prices.

    Parameters
    ----------
    StockA, StockB : DataFrame
        Frames holding a ``close`` column. Their index identifies the
        observation (in this notebook: ticker, year, month, day). A
        ``ticker`` index level is optional.

    Returns
    -------
    float
        Result of ``Series.corr`` on the two ``close`` series once the
        ``ticker`` level, if any, has been removed.

    Raises
    ------
    KeyError
        If either frame has no ``close`` column.
    """

    def _close_without_ticker(stock):
        # The ticker value differs between the two stocks, so that level has
        # to go before the series can be matched up on the remaining levels.
        close = stock['close']
        # Only drop the level when it exists: frames already indexed by date
        # alone previously failed here with a KeyError.
        if 'ticker' in close.index.names:
            close = close.droplevel('ticker')
        return close

    return _close_without_ticker(StockA).corr(_close_without_ticker(StockB))


# Correlate the GOOG and MSFT daily closes and report the coefficient.
correlation_value = Correlation_Coefficient(
    StockA=GOOGDailyClose,
    StockB=MSFTDailyClose,
)
print(f"The Pearson correlation coefficient between GOOG and MSFT is: {correlation_value}")

The Pearson correlation coefficient between GOOG and MSFT is: 0.8976408107388537


## Example: how to look up values in DataFrames and in Series

In [98]:
# GOOGDailyClose_Series = GOOGDailyClose['close']

# x = GOOGDailyClose.loc[('GOOG', 2020, 1, 2), 'close']
# print(x)

# y = GOOGDailyClose_Series.loc[('GOOG', 2020, 1, 2)]
# print(y)

## Example graph

In [99]:
# # from matplotlib import pyplot as plt
# # GOOGDailyClose['close'].plot(kind='line', figsize=(8, 4), title='close')
# # plt.gca().spines[['top', 'right']].set_visible(False)

# from matplotlib import pyplot as plt

# # Create the plot for GOOGDailyClose
# ax = GOOGDailyClose['close'].plot(kind='line', figsize=(8, 4), title='Closing Stock Prices')

# # Add the plot for MSFTDailyClose to the same axes
# MSFTDailyClose['close'].plot(kind='line', ax=ax)

# # Hide the top and right spines
# plt.gca().spines[['top', 'right']].set_visible(False)

# # Add a legend to distinguish the lines
# plt.legend(['GOOG', 'MSFT'])

# Example Call Function from another file
Example: call the Correlation_Coefficient function from another notebook in My Drive

In [100]:
# %run "/content/drive/My Drive/Colab Notebooks/Correlation_Coefficient.ipynb"

# Correlation_Coefficient(GOOGDailyClose, MSFTDailyClose)
