In [3]:
import ta
import pandas as pd
import sklearn
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [12]:
# Helper functions

def check_direction(row):
    if row['5D_Price'] > row['Close']:
        val = 1 #go up
    else:
        val = 0 #stay / go down
    return val

def check_yesterday(row):
    try:
        if row['Close']/row['Day_Before_Price'] > 1.01 :
            return 1
        elif row['Close']/row['Day_Before_Price'] < 0.99:
            return -1
        else:
            return 0
    except:
        return 0
    
def check_bollinger(row):
    if row['Close'] > row['bb_bbm']:
        a = row['Close'] - row['bb_bbm']
        b = row['bb_bbh'] - row['Close']
        
        if b > a:
            return 1
        else:
            return 2
    else:
        a = row['bb_bbm'] - row['Close']
        b = row['Close'] - row['bb_bbl']
        
        if b > a:
            return 1
        else:
            return 0
        
def check_consecutive(df):
    close = list(df['Close'])
    count = 0
    count_list = [0,0,0,0,0]
    for i in range(5,len(close)):
        if close[i] > close[i-1]:
            if count >= 0:
                
                count += 1
                count_list.append(count)
            else:
                count = 1
                count_list.append(count)
        else:
            if count <= 0:
                
                count -= 1
                count_list.append(count)
            else:
                count = -1
                count_list.append(count)
    return count_list

def manipulate_testdf(test_df, clf, xgboost = False):
    df_copy = test_df.copy()
    
    df_copy = df_copy.drop(columns=['5D_Price','Volume', 'bb_bbm','Date','Day_Before_Price','Adj Close', 'bb_bbh','bb_bbl','bb_bbw','Open','High','Low','Close'])
    print(df_copy)
    X = df_copy.loc[:, df.columns != 'Direction']
    num_col = len(X.columns)
    print(X)
#    if xgboost:
#        X.columns = [f'f{i}' for i in range(num_col)]
    
    y_pred = clf.predict(X)

    test_df['Predictions'] = list(y_pred)
    
    return test_df 

def check_rsi(rsi_list):
    '''
    append 1 if rsi increases from prev day
    append 0 if rsi drops from prev day
    '''
    return_list = []
    for i in range(2, len(rsi_list)):
        today = rsi_list[i-1]
        yesterday = rsi_list[i-2]
        
        if abs(today - yesterday) > 10:
            if rsi_list[i-1] > rsi_list[i-2]:
                return_list.append(1)
            else:
                return_list.append(-1)
        else:
            return_list.append(0)
            
    return return_list

def manipulate_df(df, drop = True):
# Initialize Bollinger Bands Indicator

    df['5D_Price'] = df['Close'].shift(-7) #see 5 days later price
    df['Day_Before_Price'] = df['Close'].shift(1) #see 1 day before price
    df['Direction'] = df.apply(check_direction, axis = 1)
    df['Yesterday'] = df.apply(check_yesterday, axis = 1)
    df['count'] = list(check_consecutive(df))
    
    indicator_bb = ta.volatility.BollingerBands(close=df["Close"], n=20, ndev=2)
    indicator_rsi = ta.momentum.RSIIndicator(close = df['Close'])
    indicator_adx = ta.trend.ADXIndicator(high = df['High'], low = df['Low'], close = df['Close'])
    indicator_ai = ta.trend.aroon_down(close = df['Close'])
    
    # Add Bollinger Bands features
    df['bb_bbm'] = indicator_bb.bollinger_mavg()
    df['bb_bbh'] = indicator_bb.bollinger_hband()
    df['bb_bbl'] = indicator_bb.bollinger_lband()

    rsi_temp = list(indicator_rsi.rsi())
    
    df['rsi'] = [0, 0] + check_rsi(rsi_temp)

    ## Add Bollinger Band high indicator
    df['bb_bbhi'] = indicator_bb.bollinger_hband_indicator()
    #
    ## Add Bollinger Band low indicator
    df['bb_bbli'] = indicator_bb.bollinger_lband_indicator()
    
    # Add width size Bollinger Bands
    df['bb_bbw'] = indicator_bb.bollinger_wband()
    #df['Bollinger_strength'] = df.apply(check_bollinger, axis = 1)
    if drop:
        df = df.drop(columns=['5D_Price','Volume', 'bb_bbm','Date','Day_Before_Price','Adj Close', 'bb_bbh','bb_bbl','bb_bbw','Open','High','Low','Close'])
    df = df.dropna()

    return df

# Note

To run the following, comment out the respective parts in clf_dict to choose the model to run. 

clf_dict = {
            # LGB
            'lgb' : [lgb.LGBMClassifier(), {'objective':['regression', 'binary'],
                     'num_leaves' : [2,100,1000]}], 

            #Random Forest
           'RandomForest': [RandomForestClassifier(), {'n_estimators' : [50,100]}],

            #Logistic Regression
           'LogisticRegression': [LogisticRegression(), {'penalty' : ['l1', 'l2']}],

            #XGBoost
           'xgboost' : [XGBClassifier(), {
                   'min_child_weight': [1, 5, 10],
                   'subsample': [0.8, 1.0],
                   'max_depth': [3, 5],
                   }
               ]
            }

In [13]:
df = pd.read_csv('nasdaq2.csv')
sentiments = pd.read_csv('ML_sentiments.csv')
sentiments_groupby = sentiments.groupby(['pub_date'], as_index = False)['polarity'].agg(sum)
sentiments_groupby.columns = ['Date', 'polarity']
df = pd.merge(df, sentiments_groupby, how = 'left', on = ['Date'])

num_rows = len(df)
test_df = df.iloc[5*num_rows//7 :]
df = df.iloc[:5*num_rows//7]



df = manipulate_df(df)
test_df = manipulate_df(test_df, drop = False)
X = df.loc[:, df.columns != 'Direction']
y = df['Direction']
    
profit_count = 0
for i in range(10):
    clf_dict = {
                # LGB
                'lgb' : [lgb.LGBMClassifier(), {'objective':['regression', 'binary'],
                         'num_leaves' : [2,100,1000]}], 
        
                #Random Forest
#                'RandomForest': [RandomForestClassifier(), {'n_estimators' : [50,100]}],
        
                #Logistic Regression
#                'LogisticRegression': [LogisticRegression(), {'penalty' : ['l1', 'l2']}],
        
                #XGBoost
#                'xgboost' : [XGBClassifier(), {
#                        'min_child_weight': [1, 5, 10],
#                        'subsample': [0.8, 1.0],
#                        'max_depth': [3, 5],
#                        }
#                    ]
                }
    print_list = []
    y_list = []
    best_clf = 0
    best_acc = 0
    avg_list = []
    for key, clf in clf_dict.items():
        acc_list = []
        
        
        for i in range(10):
            
            if key == 'xgboost':
                xgb = XGBClassifier( n_estimators=600, objective='binary:logistic',
                        silent=True, nthread=1)
                
                params = {
            'min_child_weight': [1, 5, 10],
            'gamma': [0.5, 1, 1.5, 2, 5],
            'subsample': [0.6, 0.8, 1.0],
            'colsample_bytree': [0.6, 0.8, 1.0],
            'max_depth': [3, 4, 5], 
            'learning_rate':[0.02,0.1,1]
            }
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    #            X_train.columns = [f'f{i}' for i in range(len(X_train))]
    #            X_test.columns = [f'f{i}' for i in range(len(X_test))]
                folds = 3
                param_comb = 5
                
                skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)
                
                random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='accuracy', n_jobs=4, cv=skf.split(X,y), verbose=3, random_state=99 )
                
                # start_time = timer(None) # timing starts from this point for "start_time" variable
                random_search.fit(X, y)
                
                xgbnew = random_search.best_estimator_
                xgbnew.fit(X_train, y_train)
                # make predictions for test data
                y_pred = xgbnew.predict(X_test)
                predictions = [round(value) for value in y_pred]
                # evaluate predictions
                acc = accuracy_score(y_test, predictions)
                print("Accuracy: %.2f%%" % (acc * 100.0))
                if 0 in list(y_pred): #only take in classifiers that predicts at least a 0
                    if acc > best_acc:
                        best_acc = acc
                        best_clf = xgbnew
            
            else:
            #running it 20 times for better accuracy

                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
        
                clf2 = GridSearchCV(clf[0], clf[1], scoring = 'accuracy')
                clf2.fit(X_train, y_train)
   
                y_pred = clf2.predict(X_test)
                y_list.append(y_pred)
                acc = accuracy_score(y_test, y_pred)
                if 0 in list(y_pred): #only take in classifiers that predicts at least a 0
                    if acc > best_acc:
                        best_acc = acc
                        best_clf = clf2
            
            acc_list.append(acc)
        score = np.mean(acc_list)
        avg_list.append(score)
        
        print_list.append(f'{key} got a score of {score}')
    
    
    for i in print_list: #Print accuracy scores
        print(i)



    try:
        if key == 'xgboost':
            test_df = manipulate_testdf(test_df, best_clf, xgboost = True)
        else:
            test_df = manipulate_testdf(test_df, best_clf)
        a = test_df[test_df['Predictions'] == 0]
        b = test_df[test_df['Predictions'] == 1]
        a['Profits'] = a['Adj Close'] - a['5D_Price']
        b['Profits'] = b['5D_Price'] - b['Adj Close']
        profits = [i for i in list(a.Profits)]
        profitsb = [i for i in list(b.Profits)]
        print('Total profit will be:', sum(profits)) #final profit
        profit_count += sum(profits)
        profit_count += sum(profitsb)
        
    except:
        profit_count += 0
        
    
print(f'Average Profit made is {profit_count/10}')

lgb got a score of 0.6011673151750972
      polarity  Direction  Yesterday  count  rsi  bb_bbhi  bb_bbli
1219  0.418019          0          0      0    0      0.0      0.0
1220  1.113030          0          1      0   -1      0.0      0.0
1221  0.209091          0          0      0    1      0.0      0.0
1222  2.262256          0          0      0   -1      0.0      0.0
1223  4.687037          0          0      1   -1      0.0      0.0
...        ...        ...        ...    ...  ...      ...      ...
1694  1.072619          1          0      3    0      1.0      0.0
1695  0.896526          1          0      4    0      1.0      0.0
1696  3.471402          1          0      5    0      1.0      0.0
1697  1.879167          1          0      6    0      0.0      0.0
1698  3.654113          1          0      7    0      1.0      0.0

[480 rows x 7 columns]
      polarity  Yesterday  count  rsi  bb_bbhi  bb_bbli
1219  0.418019          0      0    0      0.0      0.0
1220  1.113030        

lgb got a score of 0.6408560311284047
      polarity  Direction  Yesterday  count  rsi  bb_bbhi  bb_bbli  \
1219  0.418019          0          0      0    0      0.0      0.0   
1220  1.113030          0          1      0   -1      0.0      0.0   
1221  0.209091          0          0      0    1      0.0      0.0   
1222  2.262256          0          0      0   -1      0.0      0.0   
1223  4.687037          0          0      1   -1      0.0      0.0   
...        ...        ...        ...    ...  ...      ...      ...   
1694  1.072619          1          0      3    0      1.0      0.0   
1695  0.896526          1          0      4    0      1.0      0.0   
1696  3.471402          1          0      5    0      1.0      0.0   
1697  1.879167          1          0      6    0      0.0      0.0   
1698  3.654113          1          0      7    0      1.0      0.0   

      Predictions  
1219            1  
1220            1  
1221            1  
1222            1  
1223            1  
.