In [None]:
#Important library imports...
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn import preprocessing
from numpy import set_printoptions
%matplotlib inline


# USD-quoted ticker symbols for the cryptocurrencies of interest.
# Every symbol is listed exactly once (first-appearance order) so that any
# per-symbol processing handles each coin a single time.
crypto_symbols = [
    'BTC-USD', 'ETH-USD', 'XRP-USD', 'LTC-USD', 'BCH-USD', 'ADA-USD', 'XLM-USD', 'XEM-USD', 'DOGE-USD',
    'EOS-USD', 'XMR-USD', 'TRX-USD', 'MIOTA-USD', 'DASH-USD', 'BNB-USD', 'NEO-USD', 'ETC-USD',
    'XTZ-USD', 'ZEC-USD', 'VET-USD', 'OMG-USD', 'ONT-USD', 'BAT-USD', 'QTUM-USD', 'ZRX-USD', 'ICX-USD',
    'LSK-USD', 'BTG-USD', 'NANO-USD', 'DCR-USD', 'XVG-USD', 'BCD-USD', 'DGB-USD', 'STEEM-USD', 'STRAT-USD',
    'SC-USD', 'ARDR-USD', 'LRC-USD', 'NEXO-USD', 'BTS-USD', 'REP-USD', 'WAVES-USD', 'QASH-USD', 'CENNZ-USD',
    'ETP-USD', 'BCN-USD', 'THETA-USD', 'RDD-USD', 'KNC-USD', 'WAXP-USD', 'GNT-USD', 'ANT-USD', 'XIN-USD',
    'POWR-USD', 'META-USD', 'ENG-USD', 'ELF-USD', 'DENT-USD', 'SNT-USD', 'RDN-USD', 'SYS-USD', 'AE-USD',
    'PIVX-USD', 'GAS-USD', 'CMT-USD', 'SRN-USD', 'GTO-USD', 'MLN-USD', 'FUN-USD', 'MTL-USD', 'ARK-USD',
    'STORM-USD', 'CVC-USD', 'PART-USD', 'BNT-USD', 'GAME-USD', 'GRS-USD', 'NXS-USD', 'FCT-USD',
    'SALT-USD', 'POE-USD', 'ADX-USD', 'STX-USD', 'NAV-USD', 'POLY-USD', 'HPB-USD', 'PPC-USD',
    'RLC-USD', 'NEBL-USD', 'STORJ-USD', 'SKY-USD', 'LBC-USD', 'BURST-USD', 'BLOCK-USD',
]


In [None]:
# Load the raw dataset from a CSV file located next to the notebook.
# The path is relative, so the notebook must be run from the directory that
# contains 'complete_dataset.csv'.
crypto_data = pd.read_csv('./complete_dataset.csv')
# Preview the first rows of the raw frame.
crypto_data.head()

In [None]:
# Order the rows by coin first and, within each coin, by timestamp.
# sort_values returns a new frame, so the raw `crypto_data` is left untouched.
# NOTE(review): 'DateTime' has not been parsed yet at this point; if it is text,
# the ordering is lexical — confirm the format sorts chronologically.
df = crypto_data.sort_values(['Symbol', 'DateTime'])
df.head()

In [None]:
# Count the missing values in every column of the raw data.
crypto_data.isna().sum()

PREPROCESSING

In [None]:
# Parse 'DateTime' into real timestamps and use them, under the name 'DT', as the
# row index. The original 'DateTime' column itself is kept in the frame.
parsed_dt = pd.to_datetime(df['DateTime']).rename('DT')
df = df.set_index(parsed_dt)
del parsed_dt  # temporary only; do not leave an extra name in the notebook namespace



In [None]:
# Remove the columns that are no longer needed: the three 'Unnamed: *' columns and
# the original 'DateTime' column (its parsed values are the index by now).
# Rebinding `df` with errors='ignore' (instead of an in-place drop) lets this cell
# be re-run without a KeyError once the columns are already gone.
df = df.drop(
    columns=['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'DateTime'],
    errors='ignore',
)


In [None]:
# Review the date range covered by the index: first stamp, last stamp and the span.
earliest, latest = df.index.min(), df.index.max()
print('Earliest date is {} and Latest Date is {}'.format(earliest, latest))
time_span = latest - earliest
print('The time span of the dataset is {}'.format(time_span))
del earliest, latest  # helpers only; `time_span` is the name this cell defines

In [None]:
'''This represents 20 months of historic data, ensuring recency of the data while avoiding the impact of COVID-19, which
affected the cryptocurrency market and other financial markets in a haphazard manner, keeping in mind that we may never be able
to model an event like a PANDEMIC into our predictive model. In other words, the data points of 2020 and early 2021 are
considered outliers.'''
#To limit the data to a strict 20-month period, the data is restricted to 15th Dec 2021 through 15th July 2023.


In [None]:
# Keep only the rows dated 2021-12-15 through 2023-07-15 (both days inclusive).
# `df` is ordered by Symbol before time, so its DatetimeIndex is not monotonic.
# String label slicing on such an index raises KeyError on pandas >= 2.0 whenever a
# bound is not an exact index value, so the window is selected with a boolean mask.
# The exclusive upper bound '2023-07-16' keeps every timestamp within 2023-07-15.
in_window = (df.index >= '2021-12-15') & (df.index < '2023-07-16')
# .copy() makes df_trim independent of df, so later column assignments are safe.
df_trim = df.loc[in_window].copy()
del in_window  # temporary mask; not needed by later cells
df_trim

In [None]:
# Show the dtype of every column in the trimmed frame.
df_trim.dtypes

## EXPLORATORY DATA  ANALYSIS

In [None]:
#Let's take a look at the dataset as a whole
# NumPy print options only control how NumPy arrays are printed; the call is kept
# for any later array output, but it does not format the DataFrame shown below.
np.set_printoptions(precision=3, threshold=75)
# Round the summary itself so the table shows the intended 3 decimal places.
df_trim.describe().round(3)

# Univariate Analysis

### Univariate Analysis using the BTC subset

In [None]:
# Isolate the rows whose Symbol is 'BTC-USD' from the trimmed frame.
is_btc = df_trim['Symbol'].eq('BTC-USD')
df_BTC = df_trim[is_btc]
del is_btc  # temporary mask only
df_BTC.head()

In [None]:
# Resample the BTC data to month-end frequency and average each numeric column.
# Non-numeric columns (the text 'Symbol') are removed first: on pandas >= 2.0 a
# mean over a text column raises TypeError instead of silently skipping it.
df_BTC_monthly = df_BTC.select_dtypes(include='number').resample('M').mean()

# Month labels such as '12-21', computed once and used as the x values.
month_labels = df_BTC_monthly.index.strftime('%m-%y')

# One explicit figure/axes pair instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(11, 6))

# Line plot of the monthly mean adjusted close.
sns.lineplot(data=df_BTC_monthly,
             x=month_labels,
             y='Adj Close',
             ax=ax)
# Rotation must be a number; the string '40' is rejected by current matplotlib.
ax.tick_params(axis='x', labelrotation=40)
ax.set_title('BTC-USD monthly mean Adj Close')
ax.set_xlabel('Month (mm-yy)')
ax.set_ylabel('Adj Close')

plt.show()

In [None]:
# (rows, columns) of the BTC subset.
df_BTC.shape

In [None]:
# Count the missing values per column in the BTC subset.
df_BTC.isna().sum()

In [None]:
# Column dtypes of the BTC subset.
df_BTC.dtypes

In [None]:
# Create the two target columns for the BTC subset:
#   close_next - the 'Adj Close' of the following row (NaN on the last row, which
#                has no successor). It was previously taken from 'Open', which
#                contradicted the column's name and stated purpose.
#   diff       - same-row gain: 'Adj Close' minus 'Open'.
# NOTE(review): "following row" equals "next day" only if the rows are daily and
# time-ordered — confirm against the data.
# 'Symbol' is dropped afterwards.
df_BTC_2 = (
    df_BTC
    .assign(
        close_next=df_BTC['Adj Close'].shift(-1),
        diff=df_BTC['Adj Close'] - df_BTC['Open'],
    )
    .drop(columns='Symbol')
)
df_BTC_2
#This will create 2 output features >>>>>Close_next >> & >>>>>Diff

In [None]:
# Draw every column of df_BTC_2 in its own stacked subplot to eyeball trends.
df_BTC_2.plot(figsize=(10, 14), subplots=True)

In [None]:
# Pairwise correlations between the df_BTC_2 columns (features and targets),
# rounded to 3 decimals and rendered as an annotated heatmap.
plt.figure(figsize=(10, 6))
df_BTC_corr = df_BTC_2.corr().round(3)
sns.heatmap(data=df_BTC_corr, cmap='crest', annot=True)

In [None]:
# One histogram (30 bins, with a KDE overlay) per column of df_BTC_2.
for column in df_BTC_2.columns:
    # A dedicated figure per column keeps the plots from drawing over each other.
    fig, ax = plt.subplots()
    sns.histplot(df_BTC_2[column], kde=True, bins=30, ax=ax)
    ax.set_title(f'Histogram of {column} amount')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()
    # Release the figure so the loop does not accumulate open figures.
    plt.close(fig)
# fig,((ax0,ax1), (ax2,ax3), (ax4,ax5),(ax6,ax7)) = plt.subplots(nrows=4, ncols=2)

# fig,axes = plt.subplots(nrows=4, ncols=2, figsize=(10,12))

# for i, columns in enumerate(df_BTC_2.columns):
#     row = i // 2
#     col = i % 2
    
#     ax = axes[row,col]
#     ax.hist(df_BTC_2[column], bins=50)
#     ax.set_title(f'Histogram of {column} amount')  # Add a title for each histogram
#     ax.set_ylabel('Frequency')  # Add y-axis label
    
# plt.tight_layout()
# plt.show()  # Display the histogram

In [None]:
#We need to find a way to generalize these characteristics to the larger dataset,
# that is, to assess the possibility that all the cryptocurrencies in the larger dataset follow these characteristics.


In [None]:
#In order to explore the data properly each cryptocurrency needs to be explored independently.

In [None]:
#Separate the dataset into training and testing datasets.

In [None]:
# if (df['Symbol']==df['Symbol'].shift()).all():
#     df['previous_close'] = df.shift(1)['Adj Close'] 
# df.head()

In [None]:
# for i, row in crypto_data:
#     if crypto_data(i)['Symbol'] == crypto_data.shift(i)['Symbol']:
#         crypto_data['Diff_Close'] = crypto_data.shift(i)['Adj Close'] - crypto_data(i)['Adj Close']

# crypto_data['Diff_Close'] = crypto_data.groupby('Symbol')['Adj Close'].diff().shift(-1)

In [None]:
# df['GainOrLoss'] = df['Adj Close'] - df['previous_close']
# df[500:700:10]

In [None]:
# df.shape

In [None]:
# The pre-processing of the data will involve extensive data munging and feature engineering.
# Hence data splitting will be done early to avoid leaking future data into the past.


In [None]:
# BTC_df = (df[df['Symbol']=='BTC-USD']).sort_values(by='DateTime', ascending=False )

# plt.figure(figsize=(12,10))
    
# ax1= sns.lineplot(x=BTC_df.index,
#     y='Adj Close',
#     data=BTC_df,        
#     linewidth= 3,
#     sort=True)
# plt.show()

In [None]:
# !pip install sktime
# from sktime import plot_series