<a href="https://colab.research.google.com/github/NjokiM/UON_CS/blob/main/Crypto_Linear_Regression_Assignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**For the Crypto dataset, the task is to predict the price.**

🔑 ***Task steps***
1. Importing libraries
2. Data upload
3. EDA: Exploratory Data Analysis – feature modification, engineering, and encoding
4. Model creation: 
*   Model definition & compiling
*   Predicting
*   Evaluation


---



In [None]:
#getting libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler


In [None]:
#connecting to google drive 
from google.colab import drive
drive.mount('/content/gdrive')
%cd 'gdrive/MyDrive'



Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive


In [None]:
#setting paths and creating a working folder
file_loc = "cp_raw/"
# preparing to download
if not os.path.exists(file_loc):
    try:
        os.mkdir(file_loc)
    except OSError:
        print("Creation of the directory %s failed" % file_loc)
    else:
        print("Successfully created the directory %s " % file_loc)
!pwd 
%cd cp_raw     

/content/gdrive/MyDrive
/content/gdrive/MyDrive/cp_raw


In [None]:
#data download (disabled: every line in this cell is commented out; the data is
# instead read from Drive with pd.read_excel in a later cell)
# NOTE(review): source_url is a Drive "view" link that ends in a query string, so
# appending "/" + cp_file would presumably not yield a downloadable URL — confirm
# before re-enabling.
# source_url = 'https://drive.google.com/file/d/11ZN1iNEQ0QrfE7VJdj9y5B36LPaFjfoa/view?usp=drive_web&authuser=3'
# cp_file = 'consolidated_cryptocoin_data.csv'
# cp_url = source_url +"/"+ cp_file

# !wget -N {cp_url} -P {file_loc}
# !pwd



In [None]:
# Load the consolidated cryptocurrency workbook from the mounted Drive folder
# into a DataFrame and display it.
cp_data = pd.read_excel(
    '/content/gdrive/MyDrive/cp_raw/consolidated_cryptocoin_data.xlsx'
)
cp_data

Unnamed: 0,Currency,Date,Open,High,Low,Close,Volume,Market Cap
0,ripple,2019-04-24,0.321114,0.321282,0.296982,0.302318,1517791002.0,12698877293.0
1,ripple,2019-04-23,0.323844,0.328396,0.320919,0.321222,1077333990.0,13492933875.0
2,ripple,2019-04-22,0.322277,0.329350,0.320237,0.323934,1131094080.0,13606823301.0
3,ripple,2019-04-21,0.328678,0.329627,0.318746,0.322449,1005803846.0,13533407430.0
4,ripple,2019-04-20,0.331871,0.333213,0.324969,0.328476,931570799.0,13786384592.0
...,...,...,...,...,...,...,...,...
13571,cardano,2017-10-05,0.021951,0.022154,0.020859,0.021489,5562510.0,557139041.0
13572,cardano,2017-10-04,0.020864,0.022806,0.020864,0.021931,9000050.0,568619548.0
13573,cardano,2017-10-03,0.025757,0.027425,0.020690,0.020816,16997800.0,539692715.0
13574,cardano,2017-10-02,0.024607,0.030088,0.019969,0.025932,57641300.0,628899052.0


📓**Exploratory Data Analysis**

Understanding and modifying the various data features.
Modification is needed to transform the features into numerical form so that they can be used by the machine learning model.

In [None]:
# Column names of the raw data as a plain Python list
cp_data_clms = cp_data.columns.tolist()
cp_data_clms

['Currency', 'Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Market Cap']

In [None]:
# Work on an independent (deep) copy so the raw frame `cp_data` stays untouched
cp_data_wk = cp_data.copy(deep=True)


✅**Editing the date feature**

The date feature is split into year, month and day, because in its current form the date is difficult to work with.
The split also makes it possible to evaluate whether any of these components impacts the label.

In [None]:
# Inspect the Date column before transforming it
cp_data_wk.loc[:, 'Date']

0       2019-04-24
1       2019-04-23
2       2019-04-22
3       2019-04-21
4       2019-04-20
           ...    
13571   2017-10-05
13572   2017-10-04
13573   2017-10-03
13574   2017-10-02
13575   2017-10-01
Name: Date, Length: 13576, dtype: datetime64[ns]

In [None]:
# Split the Date feature into Year / Month / Day columns.
# The previous explicit format ('%Y-%m-%dT%H:%M:%SZ') does not match the values
# shown above (e.g. 2019-04-24). It only appeared to work because the column is
# already datetime64; with string dates every value would have been coerced to
# NaT by errors='coerce'. Letting pandas infer the format handles both cases.
cp_data_wk['Date'] = pd.to_datetime(cp_data_wk['Date'], errors='coerce')
cp_data_wk['Year'] = cp_data_wk['Date'].dt.year
cp_data_wk['Month'] = cp_data_wk['Date'].dt.month
cp_data_wk['Day'] = cp_data_wk['Date'].dt.day

In [None]:
#viewing the data as modified
cp_data_wk

Unnamed: 0,Currency,Date,Open,High,Low,Close,Volume,Market Cap,Year,Month,Day
0,ripple,2019-04-24,0.321114,0.321282,0.296982,0.302318,1517791002.0,12698877293.0,2019,4,24
1,ripple,2019-04-23,0.323844,0.328396,0.320919,0.321222,1077333990.0,13492933875.0,2019,4,23
2,ripple,2019-04-22,0.322277,0.329350,0.320237,0.323934,1131094080.0,13606823301.0,2019,4,22
3,ripple,2019-04-21,0.328678,0.329627,0.318746,0.322449,1005803846.0,13533407430.0,2019,4,21
4,ripple,2019-04-20,0.331871,0.333213,0.324969,0.328476,931570799.0,13786384592.0,2019,4,20
...,...,...,...,...,...,...,...,...,...,...,...
13571,cardano,2017-10-05,0.021951,0.022154,0.020859,0.021489,5562510.0,557139041.0,2017,10,5
13572,cardano,2017-10-04,0.020864,0.022806,0.020864,0.021931,9000050.0,568619548.0,2017,10,4
13573,cardano,2017-10-03,0.025757,0.027425,0.020690,0.020816,16997800.0,539692715.0,2017,10,3
13574,cardano,2017-10-02,0.024607,0.030088,0.019969,0.025932,57641300.0,628899052.0,2017,10,2


✅**Encoding the Currency feature**

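The Currency field holds the currency names as strings, so each type must be converted into a numerical representation.

The notebook treats `bitcoin`, `bitcoin-cash` and `binance-coin` as three variants of Bitcoin and merges them into a single `bitcoin` category. (Note: these are distinct coins, so the merge is a modelling simplification.)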

In [None]:
# Distinct currency names, in order of first appearance
types = pd.unique(cp_data_wk['Currency'])
types

array(['ripple', 'binance-coin', 'eos', 'bitcoin', 'tether',
       'bitcoin-cash', 'stellar', 'litecoin', 'ethereum', 'cardano'],
      dtype=object)

In [None]:
# Merge the coins this notebook groups under Bitcoin into one category.
# The replacement is limited to the Currency column so no other column is
# scanned or modified.
cp_data_wk['Currency'] = cp_data_wk['Currency'].replace(
    {'binance-coin': 'bitcoin', 'bitcoin-cash': 'bitcoin'}
)

# `column1` maps the column to encode to its (pre-merge) category names; it is
# printed for reference only.
column1 = ({'Currency': types})
print(column1)
print("*" * 30)

# Select the column(s) by name with an explicit list. Indexing the frame with
# the dict itself only worked by accident, because iterating a dict yields its keys.
column_en = pd.get_dummies(cp_data_wk[list(column1)])

# Drop the raw columns that have now been replaced by engineered features.
cp_data_wk = cp_data_wk.drop(columns=['Currency', 'Date'])

print("The transform data using get_dummies")
# Show the one-hot encoded columns that the heading refers to.
print(column_en.head())

{'Currency': array(['ripple', 'binance-coin', 'eos', 'bitcoin', 'tether',
       'bitcoin-cash', 'stellar', 'litecoin', 'ethereum', 'cardano'],
      dtype=object)}
******************************
The transform data using get_dummies
       Open      High       Low     Close        Volume     Market Cap  Year  \
0  0.321114  0.321282  0.296982  0.302318  1517791002.0  12698877293.0  2019   
1  0.323844  0.328396  0.320919  0.321222  1077333990.0  13492933875.0  2019   
2  0.322277  0.329350  0.320237  0.323934  1131094080.0  13606823301.0  2019   
3  0.328678  0.329627  0.318746  0.322449  1005803846.0  13533407430.0  2019   
4  0.331871  0.333213  0.324969  0.328476   931570799.0  13786384592.0  2019   

   Month  Day  
0      4   24  
1      4   23  
2      4   22  
3      4   21  
4      4   20  


In [None]:
# Assemble the final dataset: numeric features plus one-hot currency columns.
# '-' entries are replaced by 0 and the two volume-like columns are cast to float.
cp_data_en = pd.concat([cp_data_wk, column_en], axis=1).replace('-', 0)
cp_data_en = cp_data_en.astype({'Volume': float, 'Market Cap': float})

cp_data_en

Unnamed: 0,Open,High,Low,Close,Volume,Market Cap,Year,Month,Day,Currency_bitcoin,Currency_cardano,Currency_eos,Currency_ethereum,Currency_litecoin,Currency_ripple,Currency_stellar,Currency_tether
0,0.321114,0.321282,0.296982,0.302318,1.517791e+09,1.269888e+10,2019,4,24,0,0,0,0,0,1,0,0
1,0.323844,0.328396,0.320919,0.321222,1.077334e+09,1.349293e+10,2019,4,23,0,0,0,0,0,1,0,0
2,0.322277,0.329350,0.320237,0.323934,1.131094e+09,1.360682e+10,2019,4,22,0,0,0,0,0,1,0,0
3,0.328678,0.329627,0.318746,0.322449,1.005804e+09,1.353341e+10,2019,4,21,0,0,0,0,0,1,0,0
4,0.331871,0.333213,0.324969,0.328476,9.315708e+08,1.378638e+10,2019,4,20,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13571,0.021951,0.022154,0.020859,0.021489,5.562510e+06,5.571390e+08,2017,10,5,0,1,0,0,0,0,0,0
13572,0.020864,0.022806,0.020864,0.021931,9.000050e+06,5.686195e+08,2017,10,4,0,1,0,0,0,0,0,0
13573,0.025757,0.027425,0.020690,0.020816,1.699780e+07,5.396927e+08,2017,10,3,0,1,0,0,0,0,0,0
13574,0.024607,0.030088,0.019969,0.025932,5.764130e+07,6.288991e+08,2017,10,2,0,1,0,0,0,0,0,0


In [None]:
# Rescale the two large-magnitude columns with a fresh MinMaxScaler each
# (MinMaxScaler is imported with the other libraries in the first code cell).
# NOTE(review): the scalers are fitted on the complete dataset, i.e. before the
# train/test split performed in a later cell, so held-out rows influence the
# scaling. Consider fitting on the training rows only.
for scaled_col in ('Volume', 'Market Cap'):
    # fit_transform expects 2-D input; ravel() turns the (n, 1) result back
    # into a 1-D column.
    cp_data_en[scaled_col] = MinMaxScaler().fit_transform(cp_data_en[[scaled_col]]).ravel()

In [None]:
# Separate the target column name from the candidate feature names.
# The label is selected by name rather than by position (previously `pop(3)`),
# so a change in column order cannot silently pick a different target.
label = 'Close'
assert label in cp_data_en.columns, "expected a 'Close' column to use as the label"
cp_data_en_clmns = [col for col in cp_data_en.columns if col != label]
print(label, cp_data_en_clmns)


Close ['Open', 'High', 'Low', 'Volume', 'Market Cap', 'Year', 'Month', 'Day', 'Currency_bitcoin', 'Currency_cardano', 'Currency_eos', 'Currency_ethereum', 'Currency_litecoin', 'Currency_ripple', 'Currency_stellar', 'Currency_tether']


In [None]:
# Train/test split: a random fraction `p` of the rows (fixed seed) is used for
# training; every row whose index label was not sampled goes to the test set.
p = 0.8
train_data = cp_data_en.sample(frac=p, random_state=0)
in_train = cp_data_en.index.isin(train_data.index)
test_data = cp_data_en.loc[~in_train]


In [None]:
# Features to use (training data).
# The one-hot currency columns are collected from the frame itself instead of
# being hard-coded. The previous list named 'Currency_binance-coin' and
# 'Currency_bitcoin-cash', which do not exist after those coins were merged
# into 'bitcoin', and therefore raised a KeyError.
base_feats = ['Open', 'High', 'Low', 'Volume', 'Market Cap', 'Year']
currency_feats = [col for col in train_data.columns if str(col).startswith('Currency_')]
feat_1 = base_feats + currency_feats
X_tr = train_data[feat_1]
y_tr = train_data[label]
print(X_tr.head(), y_tr.head())

KeyError: ignored

In [None]:
# Test data: same feature columns and label as the training set
X_ts = test_data[feat_1]
y_ts = test_data[label]
# Preview the test frames (the earlier version printed the training frames again)
print(X_ts.head(), y_ts.head())

In [None]:
# Carve a validation set out of the training data; the fixed seed keeps the
# split reproducible.
split_parts = train_test_split(X_tr, y_tr, random_state=0)
X_train, X_val, y_train, y_val = split_parts

In [None]:
# Standardise the features. The scaler is fitted on the training split only and
# then applied unchanged to the validation and test features.
ss = StandardScaler()
# Only X_train is wrapped back into a DataFrame (with the original column names);
# X_val and X_test stay as the arrays returned by transform().
X_train = pd.DataFrame(ss.fit_transform(X_train), columns=X_tr.columns)
X_val = ss.transform(X_val)
X_test = ss.transform(test_data[feat_1])


In [None]:
# Earlier TensorFlow draft, kept for reference only (disabled):
# #set random seed
# tf.random.set_seed(42)

# # model definition
# model = tf.keras.Sequential([
#       tf.keras.layers.Dense(1)
#     ])

# #compile the model
# model.compile(loss = tf.keras.losses.mae,
#               optimizer = tf.keras.optimizers.SGD(),
#               metrics = ["mae"])

# The model actually used is an ordinary least-squares linear regression
model = LinearRegression()

# Estimate general performance with cross-validation on the training split
# (mean of the per-fold scores returned by cross_val_score)
cv_r2 = cross_val_score(model, X_train, y_train).mean()
print(f'Cross validation R2 score: {round(cv_r2, 4)}')


In [None]:
# Fit on the training features as a plain array (X_val is also an array),
# then score the fitted model on the validation split.
model.fit(X_train.to_numpy(), y_train)

val_score = model.score(X_val, y_val)
print(f'Linear regression R2 on validation data: {round(val_score, 4)}')

In [None]:
# Evaluate the model on the validation split
preds = model.predict(X_val)

# RMSE and MAE are expressed in the units of the label, which helps explainability
mse = mean_squared_error(y_val, preds)
rmse = mse ** 0.5
mae = mean_absolute_error(y_val, preds)

print(f'Root Mean Squared Error: {round(rmse, 2)}')
print(f'Mean Absolute Error: {round(mae, 2)}')