
*   Computing Platforms: Set up the Workspace for Machine Learning Projects.  https://ms.pubpub.org/pub/computing
*  Machine Learning for Predictions. https://ms.pubpub.org/pub/ml-prediction
* Machine Learning Packages: https://scikit-learn.org/stable/


# Part I: Import and Inspect Data

In [19]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

In [20]:
df = pd.read_csv('https://raw.githubusercontent.com/Rising-Stars-by-Sunshine/stats201-tutorial-prediction/main/data/Queried_Data/queried_data.csv',index_col="Unnamed: 0")
df.head()

Unnamed: 0,number,timestamp,gas_used,gas_limit
100,14650515,2022-04-25 00:00:04,0,30000000
101,14650516,2022-04-25 00:00:07,3067277,29970705
102,14650517,2022-04-25 00:00:09,29927116,29941438
103,14650518,2022-04-25 00:00:35,29951281,29970676
104,14650519,2022-04-25 00:00:38,15598681,29999943


# Part II: Prepare the Y varible for Regression

## 2.1. Write functions to calculte the Y variable for Regression 

*(skip the step if the Y variable already exists)*

In [21]:
df['theta'] = df['gas_used']/df['gas_limit']
df.head()

Unnamed: 0,number,timestamp,gas_used,gas_limit,theta
100,14650515,2022-04-25 00:00:04,0,30000000,0.0
101,14650516,2022-04-25 00:00:07,3067277,29970705,0.102343
102,14650517,2022-04-25 00:00:09,29927116,29941438,0.999522
103,14650518,2022-04-25 00:00:35,29951281,29970676,0.999353
104,14650519,2022-04-25 00:00:38,15598681,29999943,0.519957


## 2.2. Make Sure that the Data Type of Y is "numeric"

In [22]:
df.dtypes

number         int64
timestamp     object
gas_used       int64
gas_limit      int64
theta        float64
dtype: object

In [23]:
df['theta'] = pd.to_numeric(df['theta'])
df.dtypes

number         int64
timestamp     object
gas_used       int64
gas_limit      int64
theta        float64
dtype: object

# Part III: Prepare the Y variable for Classification

reference:

https://datatofish.com/if-condition-in-pandas-dataframe/ *italicized text*

In [24]:
#@title Define the Congestion Threshold
cut = 0.95 #@param {type:"number"}


In [25]:
df['congested'] = df['theta'] >= cut
df.head()

Unnamed: 0,number,timestamp,gas_used,gas_limit,theta,congested
100,14650515,2022-04-25 00:00:04,0,30000000,0.0,False
101,14650516,2022-04-25 00:00:07,3067277,29970705,0.102343,False
102,14650517,2022-04-25 00:00:09,29927116,29941438,0.999522,True
103,14650518,2022-04-25 00:00:35,29951281,29970676,0.999353,True
104,14650519,2022-04-25 00:00:38,15598681,29999943,0.519957,False


In [26]:
df.loc[(df['theta'] >= cut), 'congested'] = 1
df.loc[(df['theta'] <cut), 'congested'] = 0
df.head()

Unnamed: 0,number,timestamp,gas_used,gas_limit,theta,congested
100,14650515,2022-04-25 00:00:04,0,30000000,0.0,0
101,14650516,2022-04-25 00:00:07,3067277,29970705,0.102343,0
102,14650517,2022-04-25 00:00:09,29927116,29941438,0.999522,1
103,14650518,2022-04-25 00:00:35,29951281,29970676,0.999353,1
104,14650519,2022-04-25 00:00:38,15598681,29999943,0.519957,0


# 3.3. Method 3: Lambda function

In [27]:
df['congested'] = df['theta'].apply(lambda x: 1 if x>= cut else 0)
df.head()

Unnamed: 0,number,timestamp,gas_used,gas_limit,theta,congested
100,14650515,2022-04-25 00:00:04,0,30000000,0.0,0
101,14650516,2022-04-25 00:00:07,3067277,29970705,0.102343,0
102,14650517,2022-04-25 00:00:09,29927116,29941438,0.999522,1
103,14650518,2022-04-25 00:00:35,29951281,29970676,0.999353,1
104,14650519,2022-04-25 00:00:38,15598681,29999943,0.519957,0


## 3.2. Method 2: Cut function

reference: 

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.cut.html

In [28]:
df.head()

Unnamed: 0,number,timestamp,gas_used,gas_limit,theta,congested
100,14650515,2022-04-25 00:00:04,0,30000000,0.0,0
101,14650516,2022-04-25 00:00:07,3067277,29970705,0.102343,0
102,14650517,2022-04-25 00:00:09,29927116,29941438,0.999522,1
103,14650518,2022-04-25 00:00:35,29951281,29970676,0.999353,1
104,14650519,2022-04-25 00:00:38,15598681,29999943,0.519957,0


In [32]:
import numpy as np
 
congested = pd.cut(df['theta'], bins=[0,0.95,1], labels=[0,1]) #might have problems at boundaries
df.insert(3, 'congested2',congested)
df.head()

Unnamed: 0,number,timestamp,gas_used,congested2,gas_limit,theta,congested
100,14650515,2022-04-25 00:00:04,0,,30000000,0.0,0
101,14650516,2022-04-25 00:00:07,3067277,0.0,29970705,0.102343,0
102,14650517,2022-04-25 00:00:09,29927116,1.0,29941438,0.999522,1
103,14650518,2022-04-25 00:00:35,29951281,1.0,29970676,0.999353,1
104,14650519,2022-04-25 00:00:38,15598681,0.0,29999943,0.519957,0


# Part III: Create the X variables

## 3.1. Shift the Y to get past values

reference:
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.shift.html

In [34]:
df['theta_past'] =df['theta'].shift(1)
df.head()

Unnamed: 0,number,timestamp,gas_used,congested2,gas_limit,theta,congested,theta_past
100,14650515,2022-04-25 00:00:04,0,,30000000,0.0,0,
101,14650516,2022-04-25 00:00:07,3067277,0.0,29970705,0.102343,0,0.0
102,14650517,2022-04-25 00:00:09,29927116,1.0,29941438,0.999522,1,0.102343
103,14650518,2022-04-25 00:00:35,29951281,1.0,29970676,0.999353,1,0.999522
104,14650519,2022-04-25 00:00:38,15598681,0.0,29999943,0.519957,0,0.999353


## 3.2. Calculate the Moving Averages

references: 
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rolling.html

In [36]:
#@title Define the Window
window = 10 #@param {type:"number"}


In [37]:
df['theta_past_ma10']=df['theta_past'].rolling(window=window).mean()
df.head(20)

Unnamed: 0,number,timestamp,gas_used,congested2,gas_limit,theta,congested,theta_past,theta_past_ma10
100,14650515,2022-04-25 00:00:04,0,,30000000,0.0,0,,
101,14650516,2022-04-25 00:00:07,3067277,0.0,29970705,0.102343,0,0.0,
102,14650517,2022-04-25 00:00:09,29927116,1.0,29941438,0.999522,1,0.102343,
103,14650518,2022-04-25 00:00:35,29951281,1.0,29970676,0.999353,1,0.999522,
104,14650519,2022-04-25 00:00:38,15598681,0.0,29999943,0.519957,0,0.999353,
105,14650520,2022-04-25 00:00:47,10844553,0.0,30000000,0.361485,0,0.519957,
106,14650521,2022-04-25 00:00:52,7476517,0.0,30000000,0.249217,0,0.361485,
107,14650522,2022-04-25 00:00:56,0,,30000000,0.0,0,0.249217,
108,14650523,2022-04-25 00:00:57,18525539,0.0,30000000,0.617518,0,0.0,
109,14650524,2022-04-25 00:01:01,10934632,0.0,30000000,0.364488,0,0.617518,


# Part IV Train and Test Split

In [40]:
from sklearn.model_selection import TimeSeriesSplit

In [41]:
tss = TimeSeriesSplit()
for train_idx, test_idx in tss.split(df):
    pass

train_df = df.filter(items=train_idx, axis=0)
test_df =  df.filter(items=test_idx, axis=0)

In [42]:
train_df.head()

Unnamed: 0,number,timestamp,gas_used,congested2,gas_limit,theta,congested,theta_past,theta_past_ma10
0,14650615,2022-04-25 00:22:41,3788742,0,29970705,0.126415,0,0.991364,0.557929
1,14650616,2022-04-25 00:22:43,29979945,1,29999972,0.999332,1,0.126415,0.562116
2,14650617,2022-04-25 00:23:35,29962455,1,29970677,0.999726,1,0.999332,0.56206
3,14650618,2022-04-25 00:23:42,29979756,1,29999944,0.999327,1,0.999726,0.597107
4,14650619,2022-04-25 00:23:56,23281823,0,30000000,0.776061,0,0.999327,0.675015


In [43]:
test_df.head()

Unnamed: 0,number,timestamp,gas_used,congested2,gas_limit,theta,congested,theta_past,theta_past_ma10
52875,14703290,2022-05-03 07:21:51,29913769,1.0,29970676,0.998101,1,0.999458,0.579359
52876,14703291,2022-05-03 07:22:47,5183307,0.0,29999943,0.172777,0,0.998101,0.625504
52877,14703292,2022-05-03 07:22:59,29986301,1.0,30000000,0.999543,1,0.172777,0.633738
52878,14703293,2022-05-03 07:23:03,0,,30000000,0.0,0,0.999543,0.70803
52879,14703294,2022-05-03 07:23:04,29965217,1.0,29970705,0.999817,1,0.0,0.60809


# Part IV Prepare the Train and Test Data for Classification and Regression

In [39]:
df_C =df[['congested','theta_past_ma10']]
df_R =df[['theta','theta_past_ma10']]

In [None]:
# please complete the code on your own
df_C_train=
df_C_test = 

In [None]:
# please complete the code on your own
df_R_train=
df_R_test = 

In [None]:
#export and save the data to the processed data folder, please complete the code on your own