In [1]:
# Importing modules
%matplotlib inline
from __future__ import print_function
import pandas as pd
import numpy as np
import scipy.stats as stats
import sklearn
import random
import os
from pathlib import Path
from sklearn.linear_model import *
import matplotlib.pyplot as plt
from IPython.display import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report 
import pickle

In [60]:
def LoadSplitData(split):
    data_xy = LoadData("lrstem-binEncoding.csv")
    
    y = data_xy['market_moving']
    data_xy = data_xy.drop(columns= ['Unnamed: 0', 'market_moving'])
    print(data_xy.head())
    
    #drop the target variables you do not need (1-7), market moving or not ratings
    #colsxy= [0] #todo: adjust, this currently removes unnecessary y columns
    #data_xy = data_xy.drop(data_xy.columns[colsxy],axis=1)
    
    # separates out X & Y columns, but keeps article ID with X for split
    num_features = 300
    X = data_xy.drop(list(data_xy)[num_features:], axis=1)
    print(X.shape)
    
    #Get dummy test/train set 
    DummyX_train, DummyX_test, Dummyy_train, Dummyy_test = train_test_split(data_xy, y, test_size=split, random_state=42)
    
    return X, y, DummyX_train, DummyX_test, Dummyy_train, Dummyy_test, data_xy

def LoadData(filename):
    DATA_DIR = "Data"
    ENCODING_DIR = os.path.join(DATA_DIR, filename)
    data = pd.read_csv(ENCODING_DIR)
    return data


In [61]:
X, y, DummyX_train, DummyX_test, Dummyy_train, Dummyy_test, data_xy = LoadSplitData(0.30)


   article_id  inexpend  retail  kur  influx  sovidebloombergnet  fact  zip2  \
0           6         0       0    0       0                   1     0     0   
1          12         0       0    0       0                   0     0     0   
2          18         0       0    0       0                   0     1     0   
3          24         0       0    0       0                   0     1     0   
4          30         0       1    0       0                   0     0     0   

   whisk  greenwich   ...     cornel  leviton  katy  asc  peck  daytoday  \
0      0          0   ...          0        0     0    0     0         0   
1      0          0   ...          0        0     0    0     0         0   
2      0          0   ...          0        0     0    0     0         0   
3      0          0   ...          0        0     0    0     0         0   
4      0          0   ...          0        0     0    0     0         0   

   twenty  100bn  spoil  samarco  
0       0      0      0    

In [62]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: market_moving, dtype: int64

In [63]:
def all_metrics(y, y_hat):
    scores={}
    # takes in the actual score (y) and the prediction (y_hat)
    scores['average_precision_score'] = average_precision_score(y, y_hat)
    scores['accuracy_score']= accuracy_score(y, y_hat)
    scores['precision_score']= precision_score(y, y_hat)
    scores['recall_score'] = recall_score(y, y_hat)
    scores['f1_score'] =  f1_score(y, y_hat)
    scores['confusion_matrix'] = confusion_matrix(y, y_hat)
    scores['classification_report'] = classification_report(y, y_hat)
    return scores

def SingleTest(X_train, y_train, X_test, y_test, penaltyval, Cval):
    #Use Logistic Regression - Testing with dummy-y-variable
    
    #extract Article IDs
    trainID = X_train['article_id']
    testID = X_test['article_id']
    X_train = X_train.drop(columns=['article_id'])
    X_test = X_test.drop(columns=['article_id'])
    
    #define classifier
    ##penaltyval = 'l2'
    logReg = LogisticRegression(penalty=penaltyval, dual=False, tol=0.0001, C=Cval, fit_intercept=True, random_state=0, solver='liblinear')

    #Correction?? Build the classifier
    clfSingleTest = logReg.fit(X_train, y_train)
    # Save the classifier
    pickle.dump(clfSingleTest, open("ourClassifier.p", "wb"))

    # predict on train and test set
    y_train_predict = clfSingleTest.predict(X_train)
    y_test_predict = clfSingleTest.predict(X_test)
    
    # get log scores for train and test set
    y_train_log_scores = clfSingleTest.predict_log_proba(X_train)
    y_test_log_scores = clfSingleTest.predict_log_proba(X_test)
    
    
    
    #tie the scores and predictions to specific articles
    train_scores = pd.DataFrame(data=y_train_log_scores)
    train_scores['article_id'] = trainID.values
    train_scores['prediction'] = y_train_predict
    test_scores = pd.DataFrame(data=y_test_log_scores)
    test_scores['article_id'] = testID.values
    test_scores['prediction'] = y_test_predict

    ## Calculate Binary metrics
    columns = ['Precision','Recall', 'F1', 'Avg Precision', 'Accuracy']
    df = pd.DataFrame(index=['Train','Test'], columns=columns)
    
    TrainPrecision = precision_score(y_train, y_train_predict)
    TestPrecision = precision_score(y_test, y_test_predict)
    
    TrainRecall = accuracy_score(y_train, y_train_predict)
    TestRecall = accuracy_score(y_test, y_test_predict)
    
    Trainf1 = f1_score(y_train, y_train_predict, average='binary')
    Testf1 = f1_score(y_test, y_test_predict, average='binary')
    
    ## Calculate all metrics
    all_train_scores = all_metrics(y_train, y_train_predict)
    all_test_scores = all_metrics(y_test, y_test_predict)
    
    
    #Not to be confused with the ranking metric, mAP (mean average precision), this is simply the average of the P and R curve
    TrainAvgP = average_precision_score(y_train, y_train_predict)
    TestAvgP = average_precision_score(y_test, y_test_predict)
    
    TrainAccuracy = accuracy_score(y_train, y_train_predict)
    TestAccuracy = accuracy_score(y_test, y_test_predict)
    
    df.loc['Train'] = pd.Series({'Precision': TrainPrecision, 'Recall': TrainRecall, 'F1': Trainf1, 'Avg Precision': TrainAvgP, 'Accuracy': TrainAccuracy})
    df.loc['Test'] = pd.Series({'Precision': TestPrecision, 'Recall': TestRecall, 'F1': Testf1, 'Avg Precision': TestAvgP, 'Accuracy': TestAccuracy})
    return df, train_scores, test_scores, all_train_scores, all_test_scores

In [64]:
def SequentialSetRun(X, y, testsize):
   
    #Predicting on Real DataSet - Only 1 run
    num_articles = len(X) -1 #Subtract header row
    #testsize = 0.30
    trainsize = 1-testsize


    #Select first 70% as train
    X_train = X.iloc[:round(num_articles*trainsize)]
    y_train = y.iloc[:round(num_articles*trainsize)]

    #Following testsize (30% default) is test
    X_test = X.iloc[(round(num_articles*trainsize)):]
    y_test = y.iloc[(round(num_articles*trainsize)):]
   
    #Run SingleTest
    TestResults, train_scores, test_scores = SingleTest(X_train, y_train, X_test, y_test)

    return TestResults, train_scores, test_scores
    

In [65]:
def runLogReg(filename):
    X = LoadData(filename) # This would be named to whatever today's binEncoding file is called
    artID = X['article_id']
    X = X.drop(columns=['article_id'])
    #todo:extra cols
    xcols = [0,1,2,3,4,5,6]
    X = X.drop(X.columns[xcols], axis=1)
    print(X.head())
    classifier = pickle.load(open("ourClassifier.p", "rb"))
    
    y_predict = classifier.predict(X)
    # get log scores for train and test set
    y_log_proba = classifier.predict_log_proba(X)    
    
    #tie the scores and predictions to specific articles
    scores = pd.DataFrame(data=y_log_proba)
    scores['article_id'] = artID.values
    scores['prediction'] = y_predict
    
    thispath = Path().absolute()
    OUTPUT_DIR = os.path.join(thispath, "Data", "results_"+filename)
    pd.DataFrame.to_csv(scores, path_or_buf=OUTPUT_DIR)

In [None]:
X, y, DummyX_train, DummyX_test, Dummyy_train, Dummyy_test, data_xy = LoadSplitData(0.30) #Decide the test size (% of total data)

dfSeqTest, train_res, test_res = SequentialSetRun(X, y, 0.30) #final number is % size of testing set
print("Actual Test Results")
print(dfSeqTest.head())
print(train_res.head())
print(test_res.head())

In [None]:
runLogReg("binEncoding.csv")

In [None]:
#def main():

# Get x, y, train, test, and dummy vars by loading and splitting data
# Right now there is a 30-70 split for test-train
X, y, DummyX_train, DummyX_test, Dummyy_train, Dummyy_test, data_xy = LoadSplitData(0.30) #Decide the test size (% of total data)

#Sanity check with dummy variable:  output should be 1.0)
df, train_res, test_res = SingleTest(DummyX_train, Dummyy_train, DummyX_test, Dummyy_test)
print("Sanity Check - testing dummy vars")
print(df.head())

# Run the classifier!
dfSeqTest, train_res, test_res = SequentialSetRun(X, y, 0.30) #final number is % size of testing set
print("Actual Test Results")
print(dfSeqTest.head())

# Save the results
file_name = "baseline-" + file_name
thispath = Path().absolute()
OUTPUT_DIR = os.path.join(thispath, "Data", file_name)
pd.DataFrame.to_csv(X, path_or_buf=OUTPUT_DIR)

In [None]:
test_res.head(100)
df_filtered = test_res[(test_res.prediction == 1)]
df_filtered.sort_values(by=[1], ascending = False)

#  kfold CV with sequential splits


In [66]:
#kFold Cross Validation using Day Forward-Chaining
#We want to split the data into sequential folds
from sklearn.model_selection import GridSearchCV

#let k = # of folds to test on
k = 5

#y column index (set where the y value is located)
yindex = 1

#Checking indexing
newdata = data_xy
print(newdata.index)
print(len(newdata))
train_index = int(len(newdata) / k)
print(train_index)

RangeIndex(start=0, stop=3148, step=1)
3148
629


In [67]:
def frange(start, stop, step):
    i = start
    while i < stop:
        yield i
        i += step

def getHyperparameters(X,y):
    #create regularistcation penalty space
    penalty = ['l1','l2'] #only l2 for now
    
    #Create regularization hyperparameter space
    C = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    #C = np.logspace(0,4,10)

    
    #create hyperparemeter options
    parameters = dict(C=C, penalty=penalty)

    logistic = LogisticRegression()
    clf = GridSearchCV(logistic, parameters)


    best_model = clf.fit(X,y) #default 3 cross validation default
    
    # View best hyperparameters
    BestPenalty = best_model.best_estimator_.get_params()['penalty']
    BestC = best_model.best_estimator_.get_params()['C']
    
    print('Best Penalty:', BestPenalty)
    print('Best C:', BestC)
    
    return BestPenalty, BestC

In [68]:
#Nested KFold Cross Validation - Prints Precision value and informs of each hyperparam used for each outerfold
testPrec = []
testAcc = []
testRecall=[]
testScores=[]
trainScores=[]
for i in range(k): 
    #Get indexes for test and train data for split i in k
    incrementrows = int(len(newdata) / (k+1))
    
    train_index_start = 0
    train_index_end = train_index_start + (incrementrows * (i+1))
    test_index_start = train_index_end + 1
    
    #if it's the last iteration, add leftover articles to test set - (due to rounding)
    if i == (k-1):
        test_index_end = int(len(newdata))
    else:
        test_index_end = test_index_start + incrementrows
    
    #print(i, train_index_start, train_index_end, test_index_start, test_index_end)
    
    #Extract the train/validation split
    trainsplitsubset = data_xy.iloc[train_index_start:train_index_end]
    
    #dropping first column because trainsplitsubset has the y value
    Xtrain = trainsplitsubset.drop(trainsplitsubset.columns[yindex],axis=1)
    ytrain = trainsplitsubset.iloc[:,yindex] 
    
    #train/validate with GridSearchCV to get Hyperparameters first
    Penalty, C = getHyperparameters(Xtrain,ytrain)
    #print(C)
    
    #Extract the test set
    testsplitsubset = data_xy.iloc[test_index_start:test_index_end]
    Xtest = testsplitsubset.drop(testsplitsubset.columns[yindex], axis=1)
    ytest = testsplitsubset.iloc[:,yindex]
    
    #print(Xtest.head())
    #print(ytest.head())
  
    #Use these hyperparamers on outerfold
    df, train_scores, test_scores, all_train_scores, all_test_scores = SingleTest(Xtrain, ytrain, Xtest, ytest, Penalty, C)
    
    #focusing on precision (can access TestResults1 at different indices to evaluate more metrics)
    testPrec.append(df.iloc[1,0])
    testAcc.append(df.iloc[1,4])
    #print(testAcc)
    
    testScores.append(all_test_scores)
    trainScores.append(all_train_scores)
    
    #Print mean Precision score (average binary precision over k outer folds)
    meanPrecision = sum(testPrec) / float(len(testPrec))
        
    #Print Accuracy
    meanAccuracy = sum(testAcc) / float(len(testAcc))
    
    if i == k-1:
        print(meanPrecision)
        print(meanAccuracy)
    

Best Penalty: l1
Best C: 0.001


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Best Penalty: l1
Best C: 0.001


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Best Penalty: l1
Best C: 0.001


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Best Penalty: l1
Best C: 0.001


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Best Penalty: l1
Best C: 0.001
0.0
0.9919912510682677


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [69]:
print(testAcc)
print(testPrec)

[0.9904580152671756, 0.9923664122137404, 0.9923664122137404, 0.9904580152671756, 0.9943074003795066]
[0.0, 0.0, 0.0, 0.0, 0.0]


In [70]:
all_results['lrstem'] = {'test': testScores, 'train':trainScores}
import winsound
duration = 1000  # millisecond
freq = 440  # Hz
winsound.Beep(freq, duration)
winsound.Beep(600, 500)
scoredf = pd.DataFrame(testScores)
scoredf

Unnamed: 0,accuracy_score,average_precision_score,classification_report,confusion_matrix,f1_score,precision_score,recall_score
0,0.990458,0.009542,precision recall f1-score s...,"[[519, 0], [5, 0]]",0.0,0.0,0.0
1,0.992366,0.007634,precision recall f1-score s...,"[[520, 0], [4, 0]]",0.0,0.0,0.0
2,0.992366,0.007634,precision recall f1-score s...,"[[520, 0], [4, 0]]",0.0,0.0,0.0
3,0.990458,0.009542,precision recall f1-score s...,"[[519, 0], [5, 0]]",0.0,0.0,0.0
4,0.994307,0.005693,precision recall f1-score s...,"[[524, 0], [3, 0]]",0.0,0.0,0.0


## baseline
testScores[0]['classification_report']
'             precision    recall  f1-score   support
\n
\n          0       0.88      0.93      0.90       411
\n          1       0.67      0.55      0.60       113
\n
\navg / total       0.84      0.85      0.84       524
\n

In [14]:
# Baseline
scoredf

Unnamed: 0,accuracy_score,average_precision_score,classification_report,confusion_matrix,f1_score,precision_score,recall_score
0,0.818702,0.411,precision recall f1-score s...,"[[393, 7], [88, 36]]",0.431138,0.837209,0.290323
1,0.84542,0.434193,precision recall f1-score s...,"[[390, 26], [55, 53]]",0.566845,0.670886,0.490741
2,0.925573,0.363619,precision recall f1-score s...,"[[462, 10], [29, 23]]",0.541176,0.69697,0.442308
3,0.94084,0.251552,precision recall f1-score s...,"[[480, 11], [20, 13]]",0.45614,0.541667,0.393939
4,0.918406,0.314753,precision recall f1-score s...,"[[462, 15], [28, 22]]",0.505747,0.594595,0.44


## LR Stem
testScores[0]['classification_report']
'             precision    recall  f1-score   support
\n
\n          0       0.89      0.90      0.89       411
\n          1       0.61      0.58      0.59       113
\n
\navg / total       0.83      0.83      0.83       524
\n'

In [71]:
## lrStem
scoredf

Unnamed: 0,accuracy_score,average_precision_score,classification_report,confusion_matrix,f1_score,precision_score,recall_score
0,0.990458,0.009542,precision recall f1-score s...,"[[519, 0], [5, 0]]",0.0,0.0,0.0
1,0.992366,0.007634,precision recall f1-score s...,"[[520, 0], [4, 0]]",0.0,0.0,0.0
2,0.992366,0.007634,precision recall f1-score s...,"[[520, 0], [4, 0]]",0.0,0.0,0.0
3,0.990458,0.009542,precision recall f1-score s...,"[[519, 0], [5, 0]]",0.0,0.0,0.0
4,0.994307,0.005693,precision recall f1-score s...,"[[524, 0], [3, 0]]",0.0,0.0,0.0


## SB Stem
testScores[0]['classification_report']
'             precision    recall  f1-score   support
\n
\n          0       0.88      0.91      0.89       411
\n          1       0.62      0.56      0.59       113
\n
\navg / total       0.83      0.83      0.83       524
\n'

In [35]:
## sbStem
scoredf

Unnamed: 0,accuracy_score,average_precision_score,classification_report,confusion_matrix,f1_score,precision_score,recall_score
0,0.734733,0.600672,precision recall f1-score s...,"[[290, 18], [121, 95]]",0.577508,0.840708,0.439815
1,0.746183,0.559802,precision recall f1-score s...,"[[266, 63], [70, 125]]",0.652742,0.664894,0.641026
2,0.811069,0.47922,precision recall f1-score s...,"[[372, 11], [88, 53]]",0.517073,0.828125,0.375887
3,0.847328,0.426733,precision recall f1-score s...,"[[395, 22], [58, 49]]",0.550562,0.690141,0.457944
4,0.83112,0.421674,precision recall f1-score s...,"[[395, 13], [76, 43]]",0.491429,0.767857,0.361345


## WN Lemm
testScores[0]['classification_report']
'             precision    recall  f1-score   support
\n
\n          0       0.88      0.92      0.90       411
\n          1       0.65      0.57      0.61       113
\n
\navg / total       0.83      0.84      0.84       524
\n'

In [47]:
## wnLemm
scoredf

Unnamed: 0,accuracy_score,average_precision_score,classification_report,confusion_matrix,f1_score,precision_score,recall_score
0,0.805344,0.558562,precision recall f1-score s...,"[[338, 22], [80, 84]]",0.622222,0.792453,0.512195
1,0.807252,0.485515,precision recall f1-score s...,"[[343, 44], [57, 80]]",0.613027,0.645161,0.583942
2,0.862595,0.362341,precision recall f1-score s...,"[[426, 5], [67, 26]]",0.419355,0.83871,0.27957
3,0.906489,0.384547,precision recall f1-score s...,"[[448, 8], [41, 27]]",0.524272,0.771429,0.397059
4,0.870968,0.284386,precision recall f1-score s...,"[[431, 27], [41, 28]]",0.451613,0.509091,0.405797


## WN Lemm-V
testScores[0]['classification_report']
'             precision    recall  f1-score   support
\n
\n          0       0.88      0.93      0.90       411
\n          1       0.68      0.54      0.60       113
\n
\navg / total       0.84      0.85      0.84       524
\n'

In [59]:
## wnLemm-V
scoredf

Unnamed: 0,accuracy_score,average_precision_score,classification_report,confusion_matrix,f1_score,precision_score,recall_score
0,0.967557,0.247536,precision recall f1-score s...,"[[500, 4], [13, 7]]",0.451613,0.636364,0.35
1,0.975191,0.322959,precision recall f1-score s...,"[[503, 5], [8, 8]]",0.551724,0.615385,0.5
2,0.975191,0.212309,precision recall f1-score s...,"[[508, 0], [13, 3]]",0.315789,1.0,0.1875
3,0.980916,0.019084,precision recall f1-score s...,"[[514, 0], [10, 0]]",0.0,0.0,0.0
4,0.971537,0.028463,precision recall f1-score s...,"[[512, 0], [15, 0]]",0.0,0.0,0.0


In [72]:
 pickle.dump(all_results, open("all_results_300.p", "wb"))

# The rest of the code below is scrap

In [None]:
def frange(start, stop, step):
    i = start
    while i < stop:
        yield i
        i += step

#def getHyperparameters(data_xy, train_index_start, train_index_end):
    #create regularistcation penalty space
penalty = ['l2'] #only l2 for now

#Create regularization hyperparameter space
C = list(frange(1,10,0.5))
#C = np.logspace(0,4,10)


#create hyperparemeter options
parameters = dict(C=C, penalty=penalty)

logistic = LogisticRegression()
clf = GridSearchCV(logistic, parameters)

#Get the train/validation split
trainsplitsubset = data_xy.iloc[train_index_start:train_index_end]

X = trainsplitsubset.drop(trainsplitsubset.columns[0],axis=1)
y = trainsplitsubset.iloc[:,0] #needs change

best_model = clf.fit(X,y) #3 cross validation default
# View best hyperparameters, if our model is stable, C's should not vary
BestPenalty = best_model.best_estimator_.get_params()['penalty']
BestC = best_model.best_estimator_.get_params()['C']

print('Best Penalty:', BestPenalty)
print('Best C:', BestC)
    
    #return BestPenalty, BestC

In [None]:
train_index_start = 0
train_index_end = 39

#validate_cutoff = round((266/3*2))
#trainvsplit = data_xy.iloc[0:validate_cutoff]
#validatesplit = data_xy.iloc[(vaildate_cutoff+1):266]
#trainvsplit.shape
#validatesplit.shape

In [None]:
ytrain.head()

In [None]:
best_model = clf.fit(Xtrain,ytrain) #3 cross validation default
# View best hyperparameters
print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:', best_model.best_estimator_.get_params()['C'])

In [None]:
#def kSequential_Split(k, itr, X, data_xy):
for row in newdata.index: #where newdata is our dataset
    #if it is, drop the row X
    if (row % k) == itr:
        #make sure there is a row to drop
        if row <= len(newdata.index):
            #print('Dropping article:', row)
            newdata = newdtata.drop(newdata.index[row])
XData = newdata.drop(columns=['whatever_youd_like'])  
yData = newdata['whatever_youd_like']

    #return XData, yData 

In [None]:
#We now have our subset of data, newX
#split into 70/30 (first 70 to next 30 chunk of data)
newX.head()
len(newX.index)
newX.shape


In [None]:
logReg = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, random_state=0, solver='liblinear')
clf = logReg.fit(X_train, y_train)

#predict on all rows=articles from row 2 onwards
pred_train = clf.predict(X_train.values[:2,:]) 
pred_test = clf.predict(X_test.values[:2,:])

In [None]:
#Without Dummy Variable
TrainScores = clf.score(X_train,y_train)
TestScores = clf.score(X_test,y_test)

#Return the mean accuracy on the given test data and labels (should be 1.0)
print(TrainScores)
print(TestScores)

In [None]:



#Code for k-fold cross validation with random splits. We want sequential splits
def random_mean_ci(X, y, data_xy, num_tests):
    # train_results is a list of train accuracy results for the differrent random splits of the dataset
    train_results = []
    
    # test_results is a list of test accuracy results for the differrent random splits of the dataset
    test_results = []
    
    # Write your code here
    for i in range(num_tests):
        
        #METHOD 1: Random Datasplit (old version - not sequential date sampling)
        #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random.randint(1,1000))

        #METHOD 2: Sequential Datasplit Variation of k-fold Cross Valid. (new version - sequential date sampling)
        Xdata, ydata = kSequential_Split(num_tests, i, X, data_xy)
        
        
        # train a logistic regression classifier
        clf = LogisticRegression(C=1.0, penalty='l2', dual=False, tol=0.0001, fit_intercept=True, solver='liblinear').fit(X_train, y_train)

        # predict on train and test set
        y_train_predict = clf.predict(X_train)
        y_test_predict = clf.predict(X_test)

        # calculate train and test accuracy
        train_accuracy = accuracy_score(y_train, y_train_predict)
        test_accuracy = accuracy_score(y_test, y_test_predict)
        
        # Calculate train and test everything eslse
        train_scores = all_metrics(y_train, y_train_predict)
        test_scores = all_metrics(y_test, y_test_predict)               

        # report results, update the results array
        train_results.append(train_accuracy)
        test_results.append(test_accuracy)

    # calculate the train mean and the 95% confidence interval for the list of results
    train_mean = np.mean(train_results)
    train_ci_low, train_ci_high = stats.t.interval(0.95, len(train_results)-1, loc=train_mean, scale=stats.sem(train_results))

    # calculate the test mean and the 95% confidence interval for the list of results
    test_mean = np.mean(test_results)
    test_ci_low, test_ci_high = stats.t.interval(0.95, len(test_results)-1, loc=test_mean, scale=stats.sem(test_results))
    
    # validate return types
    assert isinstance(train_mean, float) and isinstance(train_ci_low, float) and isinstance(train_ci_high, float), "return types"
    assert isinstance(test_mean, float) and isinstance(test_ci_low, float) and isinstance(test_ci_high, float), "return types"
    
    return train_mean, train_ci_low, train_ci_high, test_mean, test_ci_low, test_ci_high, test_scores, train_scores

In [None]:
#Accuracy and CI over 10 runs
train_mean, train_low, train_high, test_mean, test_low, test_high,  test_scores, train_scores = random_mean_ci(X, y, data_xy, num_tests = 10)
print("Train mean accuracy over 10 random splits: {}".format(train_mean))
print("Train confidence interval over 10 random splits: [{}, {}]".format(train_low, train_high))
print("Test mean accuracy over 10 random splits: {}".format(test_mean))
print("Test confidence interval over 10 random splits: [{}, {}]".format(test_low, test_high))

An initial run with Retail only Articles, to determine market moving or not (Randomly split):

Train mean accuracy over 10 random splits: 0.7402896081771722
Train confidence interval over 10 random splits: [0.7288090750212393, 0.751770141333105]
Test mean accuracy over 10 random splits: 0.5650793650793651
Test confidence interval over 10 random splits: [0.547570102462968, 0.5825886276957621]


In [None]:
Results = LoadData("resultsbinEncoding.csv")
Articles = LoadData("cleanedarticles.csv")

Joined = Results.join(Articles, on=None, how='left', lsuffix='', rsuffix='', sort=False)
