In [2]:
import pandas as pd 
import numpy as np
import seaborn as sns
import csv
from sklearn.linear_model import LinearRegression
import statsmodels.regression.linear_model as sm
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

In [9]:
# approach with lagged variables

# Load data 24h
all_sent = pd.read_csv("../augmento_BTC_24h.csv").sort_values("Date").reset_index(drop=True)
all_sent = all_sent.drop(columns="Date")


# Replace zeroes
all_sent = all_sent.replace(0, 0.00001)


# Make lagged variables
lag1 = all_sent.shift(1)
lag1 = np.log(lag1 / all_sent)

lag2 = all_sent.shift(2)
lag2 = np.log(lag2 / all_sent)
lag2.columns = lag2.columns + "_lag"


# Merge, remove lines with NA's
data = pd.merge(lag1, lag2, left_index=True, right_index=True)
data = data[2:]


# Define X and Y
Ydiff = data.BTC_Price
Xdiff = data.copy().drop(columns=["BTC_Price", "BTC_Price_lag"])


# Overview
print("The dataset has", len(data.axes[0]), "rows and", len(data.axes[1]), "columns\n")
print(data[["BTC_Price", "BTC_Price_lag", "Hacks", "Hacks_lag"]].head(), "\n\nVariables:\n")
print(Xdiff.columns)

The dataset has 1062 rows and 190 columns

   BTC_Price  BTC_Price_lag     Hacks  Hacks_lag
2   0.072965       0.057951  0.074108  -0.693147
3  -0.020878       0.052087 -0.479573  -0.405465
4  -0.000270      -0.021148  0.904456   0.424883
5  -0.011449      -0.011720  0.753772   1.658228
6   0.011862       0.000412 -0.318454   0.435318 

Variables:

Index(['BTC_Volume', 'Hacks', 'Pessimistic/Doubtful', 'Banks', 'Selling',
       'Market_manipulation', '(De-)centralisation', 'Angry', 'ETF',
       'Leverage',
       ...
       'Use_case/Applications_lag', 'Rumor_lag', 'Scam/Fraud_lag',
       'Airdrop_lag', 'Optimistic_lag', 'Negative_lag'],
      dtype='object', length=188)


In [16]:
model = LinearRegression().fit(Xdiff, Ydiff)

In [17]:
model.score(Xdiff, Ydiff)

0.4014918647075655

In [18]:
model.coef_

array([ 1.68110452e-02,  5.27695577e-03, -7.08589344e-03,  8.61100247e-03,
       -1.51031887e-02, -6.02770069e-04, -5.16919408e-03, -3.58767927e-03,
       -1.84732856e-03, -5.33101316e-04, -8.67210167e-03,  7.27000458e-03,
        1.81115445e-02, -1.94662210e-04, -1.22199586e-04, -3.09004710e-03,
       -1.02478615e-03, -1.13469953e-02, -2.38290763e-03, -3.58998502e-03,
       -2.90480098e-02, -3.57522640e-04, -1.08347887e-03, -2.25300120e-02,
        2.91390320e-03, -1.31131663e-03, -1.67903417e-03, -2.94286834e-02,
        1.07135206e-02, -1.16908511e-03,  9.88390212e-03, -1.41320263e-02,
       -3.89233693e-06, -2.82277579e-03,  8.20219935e-03,  1.15153389e-02,
        5.00106838e-03, -4.60073324e-03,  9.39681571e-04,  3.94666173e-02,
       -1.34630264e-02, -7.43294408e-04, -1.18016392e-03, -2.44553717e-02,
       -2.05566508e-03,  4.31735704e-03, -4.27278959e-04,  1.61905134e-02,
       -3.40337350e-03,  5.01915037e-03, -5.87027289e-03,  2.53991063e-02,
        5.34152580e-05, -

In [19]:
model.intercept_

-0.002224436348109358

In [20]:
mod = sm.OLS(Ydiff, Xdiff)
print(mod.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:              BTC_Price   R-squared (uncentered):                   0.401
Model:                            OLS   Adj. R-squared (uncentered):              0.272
Method:                 Least Squares   F-statistic:                              3.106
Date:                Sun, 02 Feb 2020   Prob (F-statistic):                    4.61e-29
Time:                        20:43:29   Log-Likelihood:                          2101.7
No. Observations:                1062   AIC:                                     -3827.
Df Residuals:                     874   BIC:                                     -2893.
Df Model:                         188                                                  
Covariance Type:            nonrobust                                                  
                                coef    std err          t      P>|t|      [0.025      0.975]
--------------------------

In [21]:
lasso = Lasso(alpha=0.0001, max_iter=100000000).fit(Xdiff, Ydiff)

In [22]:
lasso.score(Xdiff, Ydiff)

0.3636139173764933

In [29]:
lasso.coef_

array([ 9.79912933e-03,  4.83600690e-03, -0.00000000e+00,  1.16280352e-03,
       -6.80004949e-03, -3.14738534e-03, -5.26596413e-04, -0.00000000e+00,
       -1.73443874e-03, -0.00000000e+00, -9.37180414e-03,  4.97126862e-03,
        1.39357292e-02,  0.00000000e+00,  0.00000000e+00, -2.99396203e-03,
       -0.00000000e+00, -9.97926603e-03, -4.10428070e-04, -9.42398502e-04,
       -2.57202094e-02, -1.64648845e-04,  2.09956997e-03, -0.00000000e+00,
        0.00000000e+00, -9.18047898e-04, -0.00000000e+00, -3.92512038e-02,
        9.62093043e-03, -0.00000000e+00,  0.00000000e+00, -6.15239834e-03,
       -2.54700109e-05, -0.00000000e+00,  1.94148638e-03,  0.00000000e+00,
        5.59995061e-04,  0.00000000e+00,  2.93433682e-04,  4.37903192e-02,
       -4.56909523e-03, -0.00000000e+00, -0.00000000e+00, -1.05740469e-02,
       -9.51953792e-04,  3.45241855e-03,  1.83474954e-04,  8.03147688e-03,
       -1.47780914e-03, -0.00000000e+00, -0.00000000e+00,  1.42907985e-02,
        7.07529799e-05, -

In [30]:
lasso_cv = LassoCV(cv=10, random_state=0, max_iter=100000000).fit(Xdiff, Ydiff)

In [31]:
lasso_c = lasso_cv.coef_
tmp = pd.DataFrame(lasso_c)
tmp["var"] = Xdiff.columns
tmp.to_csv('lasso_coeff.csv')

In [32]:
lasso_c

array([ 0.00000000e+00,  3.11028116e-03,  0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00, -1.03042546e-03, -0.00000000e+00, -0.00000000e+00,
       -1.89368911e-03,  3.19917872e-04, -9.52960627e-03,  3.12261790e-03,
        1.37585679e-02,  0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
        0.00000000e+00, -8.40008931e-03, -0.00000000e+00, -0.00000000e+00,
       -6.93174575e-03, -2.19771593e-06,  0.00000000e+00, -0.00000000e+00,
        0.00000000e+00, -6.48223865e-04,  0.00000000e+00, -3.87990816e-02,
        6.79460734e-03,  0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
       -1.43253461e-05, -0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  2.32804419e-05,  3.84873944e-02,
        0.00000000e+00,  0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00,  4.34325426e-04,  0.00000000e+00,  3.62651910e-03,
       -0.00000000e+00, -0.00000000e+00,  0.00000000e+00,  5.85735636e-03,
        0.00000000e+00, -