In [1]:
# Importing modules
%matplotlib inline
from __future__ import print_function
import pandas as pd
import numpy as np
import scipy.stats as stats
import sklearn
import random
import os
from pathlib import Path
from sklearn.linear_model import *
import matplotlib.pyplot as plt
from IPython.display import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report 
import pickle

In [4]:
def LoadSplitData(split):
    data_xy = LoadData("binEncoding.csv")
    #data_xy = data_xy.drop(columns= ['Unnamed: 0'])
    
    #drop the target variables you do not need (1-7), market moving or not ratings
    colsxy= [1,2,3,5,6,7] #todo: adjust, this currently removes unnecessary y columns
    data_xy = data_xy.drop(data_xy.columns[colsxy],axis=1)
    
    # separates out X & Y columns, but keeps article ID with X for split
    X = data_xy.drop(data_xy.columns[1],axis=1)
    y = data_xy.iloc[:,1] #needs change
    #Get dummy test/train set 
    DummyX_train, DummyX_test, Dummyy_train, Dummyy_test = train_test_split(data_xy, y, test_size=split, random_state=42)
    
    return X, y, DummyX_train, DummyX_test, Dummyy_train, Dummyy_test, data_xy

def LoadData(filename):
    DATA_DIR = "Data"
    ENCODING_DIR = os.path.join(DATA_DIR, filename)
    data = pd.read_csv(ENCODING_DIR)
    return data


In [3]:
def SingleTest(X_train, y_train, X_test, y_test):
    #Use Logistic Regression - Testing with dummy-y-variable
    
    #extract Article IDs
    trainID = X_train['article_id']
    testID = X_test['article_id']
    X_train = X_train.drop(columns=['article_id'])
    X_test = X_test.drop(columns=['article_id'])
    
    #define classifier
    logReg = LogisticRegression(penalty=penaltyval, dual=False, tol=0.0001, C=Cval, fit_intercept=True, random_state=0, solver='liblinear')

    #Correction?? Build the classifier
    clfSingleTest = logReg.fit(X_train, y_train)
    # Save the classifier
    pickle.dump(clfSingleTest, open("ourClassifier.p", "wb"))

    # predict on train and test set
    y_train_predict = clfSingleTest.predict(X_train)
    y_test_predict = clfSingleTest.predict(X_test)
    
    # get log scores for train and test set
    y_train_log_scores = clfSingleTest.predict_log_proba(X_train)
    y_test_log_scores = clfSingleTest.predict_log_proba(X_test)
    
    
    
    #tie the scores and predictions to specific articles
    train_scores = pd.DataFrame(data=y_train_log_scores)
    train_scores['article_id'] = trainID.values
    train_scores['prediction'] = y_train_predict
    test_scores = pd.DataFrame(data=y_test_log_scores)
    test_scores['article_id'] = testID.values
    test_scores['prediction'] = y_test_predict

    ## Calculate Binary metrics
    columns = ['Precision','Recall', 'F1', 'Avg Precision', 'Accuracy']
    df = pd.DataFrame(index=['Train','Test'], columns=columns)
    
    TrainPrecision = precision_score(y_train, y_train_predict)
    TestPrecision = precision_score(y_test, y_test_predict)
    
    TrainRecall = accuracy_score(y_train, y_train_predict)
    TestRecall = accuracy_score(y_test, y_test_predict)
    
    Trainf1 = f1_score(y_train, y_train_predict, average='binary')
    Testf1 = f1_score(y_test, y_test_predict, average='binary')
    
    #Not to be confused with the ranking metric, mAP (mean average precision), this is simply the average of the P and R curve
    TrainAvgP = average_precision_score(y_train, y_train_predict)
    TestAvgP = average_precision_score(y_test, y_test_predict)
    
    TrainAccuracy = accuracy_score(y_train, y_train_predict)
    TestAccuracy = accuracy_score(y_test, y_test_predict)
    
    df.loc['Train'] = pd.Series({'Precision': TrainPrecision, 'Recall': TrainRecall, 'F1': Trainf1, 'Avg Precision': TrainAvgP, 'Accuracy': TrainAccuracy})
    df.loc['Test'] = pd.Series({'Precision': TestPrecision, 'Recall': TestRecall, 'F1': Testf1, 'Avg Precision': TestAvgP, 'Accuracy': TestAccuracy})
    return df, train_scores, test_scores

In [4]:
def SequentialSetRun(X, y, testsize):
   
    #Predicting on Real DataSet - Only 1 run
    num_articles = len(X) -1 #Subtract header row
    #testsize = 0.30
    trainsize = 1-testsize


    #Select first 70% as train
    X_train = X.iloc[:round(num_articles*trainsize)]
    y_train = y.iloc[:round(num_articles*trainsize)]

    #Following testsize (30% default) is test
    X_test = X.iloc[(round(num_articles*trainsize)):]
    y_test = y.iloc[(round(num_articles*trainsize)):]
   
    #Run SingleTest
    TestResults, train_scores, test_scores = SingleTest(X_train, y_train, X_test, y_test)

    return TestResults, train_scores, test_scores
    

In [27]:
def runLogReg(filename):
    X = LoadData(filename) # This would be named to whatever today's binEncoding file is called
    artID = X['article_id']
    X = X.drop(columns=['article_id'])
    #todo:extra cols
    xcols = [0,1,2,3,4,5,6]
    X = X.drop(X.columns[xcols], axis=1)
    print(X.head())
    classifier = pickle.load(open("ourClassifier.p", "rb"))
    
    y_predict = classifier.predict(X)
    # get log scores for train and test set
    y_log_proba = classifier.predict_log_proba(X)    
    
    #tie the scores and predictions to specific articles
    scores = pd.DataFrame(data=y_log_proba)
    scores['article_id'] = artID.values
    scores['prediction'] = y_predict
    
    thispath = Path().absolute()
    OUTPUT_DIR = os.path.join(thispath, "Data", "results_"+filename)
    pd.DataFrame.to_csv(scores, path_or_buf=OUTPUT_DIR)

In [7]:
X, y, DummyX_train, DummyX_test, Dummyy_train, Dummyy_test, data_xy = LoadSplitData(0.30) #Decide the test size (% of total data)

dfSeqTest, train_res, test_res = SequentialSetRun(X, y, 0.30) #final number is % size of testing set
print("Actual Test Results")
print(dfSeqTest.head())
print(train_res.head())
print(test_res.head())

Actual Test Results
      Precision    Recall        F1 Avg Precision  Accuracy
Train  0.741007  0.808348  0.477958      0.422371  0.808348
Test   0.222222  0.710317  0.215054       0.19709  0.710317
          0         1  article_id  prediction
0 -0.180942 -1.798685           0           0
1 -0.049059 -3.039169           1           0
2 -0.098735 -2.364280           2           0
3 -0.167371 -1.870064           3           0
4 -0.075243 -2.624417           4           0
          0         1  article_id  prediction
0 -0.339319 -1.245681        1174           0
1 -0.592713 -0.804806        1175           0
2 -0.236461 -1.557873        1176           0
3 -0.120988 -2.171945        1177           0
4 -0.331991 -1.264054        1178           0


In [28]:
runLogReg("binEncoding.csv")

   target  stores  online  retailer  child  inflation  commerce  bank  \
0       0       1       0         1      0          0         0     0   
1       1       0       1         0      0          0         1     0   
2       1       0       0         0      0          0         1     0   
3       0       1       0         1      0          0         0     0   
4       1       0       0         0      0          0         0     0   

   company  central     ...       momentum  areas  nearly  against  home  \
0        1        0     ...              0      0       0        0     0   
1        1        1     ...              0      0       0        0     1   
2        1        0     ...              0      0       0        1     0   
3        1        0     ...              0      0       0        0     0   
4        1        0     ...              1      0       0        0     0   

   prime  premium  adjusted  investor  marketplace  
0      0        0         0         0            0 

In [8]:
#def main():

# Get x, y, train, test, and dummy vars by loading and splitting data
# Right now there is a 30-70 split for test-train
X, y, DummyX_train, DummyX_test, Dummyy_train, Dummyy_test, data_xy = LoadSplitData(0.30) #Decide the test size (% of total data)

#Sanity check with dummy variable:  output should be 1.0)
df, train_res, test_res = SingleTest(DummyX_train, Dummyy_train, DummyX_test, Dummyy_test)
print("Sanity Check - testing dummy vars")
print(df.head())

# Run the classifier!
dfSeqTest, train_res, test_res = SequentialSetRun(X, y, 0.30) #final number is % size of testing set
print("Actual Test Results")
print(dfSeqTest.head())

# Save the results
thispath = Path().absolute()
OUTPUT_DIR = os.path.join(thispath, "Data", file_name)
pd.DataFrame.to_csv(X, path_or_buf=OUTPUT_DIR)

Sanity Check - testing dummy vars
       Precision  Recall  F1  Avg Precision  Accuracy
Train          1       1   1              1         1
Test           1       1   1              1         1
Actual Test Results
      Precision    Recall        F1 Avg Precision  Accuracy
Train  0.840249  0.848382  0.819838      0.758562  0.848382
Test    0.59375  0.672619  0.535211      0.487676  0.672619


In [79]:
test_res.head(100)
df_filtered = test_res[(test_res.prediction == 1)]
df_filtered.sort_values(by=[1], ascending = False)

Unnamed: 0,0,1,article_id,prediction
163,-3.570083,-0.028557,1337,1
133,-3.095858,-0.046291,1307,1
260,-2.674751,-0.071414,1434,1
261,-2.674751,-0.071414,1435,1
469,-2.590717,-0.077925,1643,1
449,-2.590198,-0.077967,1623,1
141,-2.228054,-0.113995,1315,1
498,-2.187695,-0.118981,1672,1
114,-2.162658,-0.122189,1288,1
148,-2.033342,-0.140294,1322,1


#  kfold CV with sequential splits


In [15]:
#kFold Cross Validation using Day Forward-Chaining
#We want to split the data into sequential folds
from sklearn.model_selection import GridSearchCV

#let k = # of folds to test on
k = 5

#y column index (set where the y value is located)
yindex = 0

#Checking indexing
newdata = data_xy
print(newdata.index)
print(len(newdata))
train_index = int(len(newdata) / k)
print(train_index)

RangeIndex(start=0, stop=801, step=1)
801
160


In [18]:
def frange(start, stop, step):
    i = start
    while i < stop:
        yield i
        i += step

def getHyperparameters(X,y):
    #create regularistcation penalty space
    penalty = ['l1','l2'] #only l2 for now
    
    #Create regularization hyperparameter space
    C = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    #C = np.logspace(0,4,10)

    
    #create hyperparemeter options
    parameters = dict(C=C, penalty=penalty)

    logistic = LogisticRegression()
    clf = GridSearchCV(logistic, parameters)

    best_model = clf.fit(X,y) #default 3 cross validation default
    
    # View best hyperparameters
    BestPenalty = best_model.best_estimator_.get_params()['penalty']
    BestC = best_model.best_estimator_.get_params()['C']
    
    print('Best Penalty:', BestPenalty)
    print('Best C:', BestC)
    
    return BestPenalty, BestC

In [26]:
#Nested KFold Cross Validation - Prints Precision value and informs of each hyperparam used for each outerfold
testPrec = []
testAcc = []

for i in range(k): 
    #Get indexes for test and train data for split i in k
    incrementrows = int(len(newdata) / (k+1))
    
    train_index_start = 0
    train_index_end = train_index_start + (incrementrows * (i+1))
    test_index_start = train_index_end + 1
    
    #if it's the last iteration, add leftover articles to test set - (due to rounding)
    if i == (k-1):
        test_index_end = int(len(newdata))
    else:
        test_index_end = test_index_start + incrementrows
    
    #print(i, train_index_start, train_index_end, test_index_start, test_index_end)
    
    #Extract the train/validation split
    trainsplitsubset = data_xy.iloc[train_index_start:train_index_end]
    
    #dropping first column because trainsplitsubset has the y value
    Xtrain = trainsplitsubset.drop(trainsplitsubset.columns[yindex],axis=1)
    ytrain = trainsplitsubset.iloc[:,yindex] 
    
    #train/validate with GridSearchCV to get Hyperparameters first
    Penalty, C = getHyperparameters(Xtrain,ytrain)
    #print(C)
    
    #Extract the test set
    testsplitsubset = data_xy.iloc[test_index_start:test_index_end]
    Xtest = testsplitsubset.drop(testsplitsubset.columns[yindex], axis=1)
    ytest = testsplitsubset.iloc[:,yindex]
    
    #print(Xtest.head())
    #print(ytest.head())
  
    #Use these hyperparamers on outerfold
    TestResults1 = SingleTest(Xtrain, ytrain, Xtest, ytest, Penalty, C)
    
    #focusing on precision (can access TestResults1 at different indices to evaluate more metrics)
    testPrec.append(TestResults1.iloc[1,0])
    testAcc.append(TestResults1.iloc[1,4])
    #print(testAcc)
    
    #Print mean Precision score (average binary precision over k outer folds)
    meanPrecision = sum(testPrec) / float(len(testPrec))
        
    #Print Accuracy
    meanAccuracy = sum(testAcc) / float(len(testAcc))
    
    if i == k-1:
        print(meanPrecision)
        print(meanAccuracy)
    

Best Penalty: l2
Best C: 0.001
Best Penalty: l2
Best C: 0.1
Best Penalty: l2
Best C: 0.1
Best Penalty: l2
Best C: 0.1
Best Penalty: l2
Best C: 0.01
0.5733712317780818
0.5939292676134781


# The rest of the code below is scrap

In [None]:
def frange(start, stop, step):
    i = start
    while i < stop:
        yield i
        i += step

#def getHyperparameters(data_xy, train_index_start, train_index_end):
    #create regularistcation penalty space
penalty = ['l2'] #only l2 for now

#Create regularization hyperparameter space
C = list(frange(1,10,0.5))
#C = np.logspace(0,4,10)


#create hyperparemeter options
parameters = dict(C=C, penalty=penalty)

logistic = LogisticRegression()
clf = GridSearchCV(logistic, parameters)

#Get the train/validation split
trainsplitsubset = data_xy.iloc[train_index_start:train_index_end]

X = trainsplitsubset.drop(trainsplitsubset.columns[0],axis=1)
y = trainsplitsubset.iloc[:,0] #needs change

best_model = clf.fit(X,y) #3 cross validation default
# View best hyperparameters, if our model is stable, C's should not vary
BestPenalty = best_model.best_estimator_.get_params()['penalty']
BestC = best_model.best_estimator_.get_params()['C']

print('Best Penalty:', BestPenalty)
print('Best C:', BestC)
    
    #return BestPenalty, BestC

In [None]:
train_index_start = 0
train_index_end = 39

#validate_cutoff = round((266/3*2))
#trainvsplit = data_xy.iloc[0:validate_cutoff]
#validatesplit = data_xy.iloc[(vaildate_cutoff+1):266]
#trainvsplit.shape
#validatesplit.shape

In [None]:
ytrain.head()

In [None]:
best_model = clf.fit(Xtrain,ytrain) #3 cross validation default
# View best hyperparameters
print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:', best_model.best_estimator_.get_params()['C'])

In [None]:
#def kSequential_Split(k, itr, X, data_xy):
for row in newdata.index: #where newdata is our dataset
    #if it is, drop the row X
    if (row % k) == itr:
        #make sure there is a row to drop
        if row <= len(newdata.index):
            #print('Dropping article:', row)
            newdata = newdtata.drop(newdata.index[row])
XData = newdata.drop(columns=['whatever_youd_like'])  
yData = newdata['whatever_youd_like']

    #return XData, yData 

In [None]:
#We now have our subset of data, newX
#split into 70/30 (first 70 to next 30 chunk of data)
newX.head()
len(newX.index)
newX.shape


In [None]:
logReg = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, random_state=0, solver='liblinear')
clf = logReg.fit(X_train, y_train)

#predict on all rows=articles from row 2 onwards
pred_train = clf.predict(X_train.values[:2,:]) 
pred_test = clf.predict(X_test.values[:2,:])

In [None]:
#Without Dummy Variable
TrainScores = clf.score(X_train,y_train)
TestScores = clf.score(X_test,y_test)

#Return the mean accuracy on the given test data and labels (should be 1.0)
print(TrainScores)
print(TestScores)

In [None]:
#Code for k-fold cross validation with random splits. We want sequential splits
def random_mean_ci(X, y, data_xy, num_tests):
    # train_results is a list of train accuracy results for the differrent random splits of the dataset
    train_results = []
    
    # test_results is a list of test accuracy results for the differrent random splits of the dataset
    test_results = []
    
    # Write your code here
    for i in range(num_tests):
        
        #METHOD 1: Random Datasplit (old version - not sequential date sampling)
        #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random.randint(1,1000))

        #METHOD 2: Sequential Datasplit Variation of k-fold Cross Valid. (new version - sequential date sampling)
        Xdata, ydata = kSequential_Split(num_tests, i, X, data_xy)
        
        
        # train a logistic regression classifier
        clf = LogisticRegression(C=1.0, penalty='l2', dual=False, tol=0.0001, fit_intercept=True, solver='liblinear').fit(X_train, y_train)

        # predict on train and test set
        y_train_predict = clf.predict(X_train)
        y_test_predict = clf.predict(X_test)

        # calculate train and test accuracy
        train_accuracy = accuracy_score(y_train, y_train_predict)
        test_accuracy = accuracy_score(y_test, y_test_predict)

        # report results, update the results array
        train_results.append(train_accuracy)
        test_results.append(test_accuracy)

    # calculate the train mean and the 95% confidence interval for the list of results
    train_mean = np.mean(train_results)
    train_ci_low, train_ci_high = stats.t.interval(0.95, len(train_results)-1, loc=train_mean, scale=stats.sem(train_results))

    # calculate the test mean and the 95% confidence interval for the list of results
    test_mean = np.mean(test_results)
    test_ci_low, test_ci_high = stats.t.interval(0.95, len(test_results)-1, loc=test_mean, scale=stats.sem(test_results))
    
    # validate return types
    assert isinstance(train_mean, float) and isinstance(train_ci_low, float) and isinstance(train_ci_high, float), "return types"
    assert isinstance(test_mean, float) and isinstance(test_ci_low, float) and isinstance(test_ci_high, float), "return types"
    
    return train_mean, train_ci_low, train_ci_high, test_mean, test_ci_low, test_ci_high

In [None]:
#Accuracy and CI over 10 runs
train_mean, train_low, train_high, test_mean, test_low, test_high = random_mean_ci(X, y, data_xy, num_tests = 10)
print("Train mean accuracy over 10 random splits: {}".format(train_mean))
print("Train confidence interval over 10 random splits: [{}, {}]".format(train_low, train_high))
print("Test mean accuracy over 10 random splits: {}".format(test_mean))
print("Test confidence interval over 10 random splits: [{}, {}]".format(test_low, test_high))

An initial run with Retail only Articles, to determine market moving or not (Randomly split):

Train mean accuracy over 10 random splits: 0.7402896081771722
Train confidence interval over 10 random splits: [0.7288090750212393, 0.751770141333105]
Test mean accuracy over 10 random splits: 0.5650793650793651
Test confidence interval over 10 random splits: [0.547570102462968, 0.5825886276957621]


In [None]:
Results = LoadData("resultsbinEncoding.csv")
Articles = LoadData("cleanedarticles.csv")

Joined = Results.join(Articles, on=None, how='left', lsuffix='', rsuffix='', sort=False)
