In [None]:
# Install dependencies
!pip install -q numerapi pandas pyarrow matplotlib lightgbm scikit-learn cloudpickle scipy==1.10.1

# Inline plots
%matplotlib inline

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
from numerapi import NumerAPI
import pandas as pd

# API client created without credentials; used below for dataset downloads
napi = NumerAPI()

# Every dataset path offered by the API, and the distinct prefixes that
# precede the first '/' in those paths (the available versions)
all_datasets = napi.list_datasets()
dataset_versions = list({path.split('/')[0] for path in all_datasets})

# Data version used for every download and read in this notebook
DATA_VERSION = "v4.3"

In [None]:
import json

# Download the feature metadata file
napi.download_dataset(f"{DATA_VERSION}/features.json");

# Read the metadata; the context manager closes the file handle afterwards
with open(f"{DATA_VERSION}/features.json") as metadata_file:
    feature_metadata = json.load(metadata_file)
feature_sets = feature_metadata["feature_sets"]

small_feature_set = feature_sets["small"]
medium_feature_set = feature_sets["medium"]

# Download the training data - this will take a few minutes
napi.download_dataset(f"{DATA_VERSION}/train_int8.parquet");


def _load_training_frame(feature_names, era_step=4):
    """Read "era", "target" and the given feature columns, keeping every `era_step`-th era.

    Parameters
    ----------
    feature_names : list of str
        Feature columns to load from the training parquet file.
    era_step : int
        Stride over the unique eras in order of first appearance. The default
        of 4 reduces memory usage and speeds up training (suggested for the
        Colab free tier); pass 1 to keep all the data.

    Returns
    -------
    pandas.DataFrame
        The rows whose era is one of the retained eras.
    """
    frame = pd.read_parquet(
        f"{DATA_VERSION}/train_int8.parquet",
        columns=["era", "target"] + feature_names
    )
    kept_eras = frame["era"].unique()[::era_step]
    return frame[frame["era"].isin(kept_eras)]


# Load the "small" and "medium" feature sets
# (use the "all" feature set instead to load every feature)
small_data = _load_training_frame(small_feature_set)
medium_data = _load_training_frame(medium_feature_set)


In [None]:
# Separate the feature columns from the "target" column for each feature set
small_X_train, small_y_train = small_data[small_feature_set], small_data["target"]
medium_X_train, medium_y_train = medium_data[medium_feature_set], medium_data["target"]

In [None]:
# Correlation of every medium-set feature column with the target,
# displayed from most negative to most positive
corr = medium_X_train.corrwith(medium_y_train)
corr.sort_values(ascending=True)

feature_enzymatic_poorest_advocaat             -0.011773
feature_unswaddled_inenarrable_goody           -0.011449
feature_wetter_unbaffled_loma                  -0.007477
feature_floatiest_quintuplicate_carpentering   -0.007150
feature_unbarking_apolitical_hibernian         -0.007114
                                                  ...   
feature_hunchbacked_unturning_meditation        0.010299
feature_denuded_typed_wattmeter                 0.010343
feature_pruinose_raploch_roubaix                0.010585
feature_leaky_overloaded_rhodium                0.011407
feature_simpatico_cadential_pup                 0.012055
Length: 705, dtype: float64

In [None]:
# Five features with the largest absolute correlation with the target
corr_sorted = corr.abs().sort_values(ascending=False).head(5)
corr_sorted

feature_simpatico_cadential_pup         0.012055
feature_enzymatic_poorest_advocaat      0.011773
feature_unswaddled_inenarrable_goody    0.011449
feature_leaky_overloaded_rhodium        0.011407
feature_pruinose_raploch_roubaix        0.010585
dtype: float64

In [None]:
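# # get the 100 features most correlated (in absolute value) with the target
# # `corr` is a Series indexed by feature name, so the names live in `.index` (a Series has no `.columns`)
# medium100_feature_set = corr.abs().sort_values(ascending=False).head(100).index.tolist()
# medium100_X_train = medium_X_train[medium100_feature_set]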

In [None]:
# Replace each target vector with the category codes of its values
small_y_train, medium_y_train = (
    pd.Categorical(targets).codes
    for targets in (small_y_train, medium_y_train)
)

In [None]:
# Training sets keyed by feature-set name; each value is [features, targets]
# feature_set_sizes = {'small': [small_X_train,small_y_train], 'medium': [medium_X_train,medium_y_train], 'medium100': [medium100_X_train,medium_y_train]}
feature_set_sizes = {
    'small': [small_X_train, small_y_train],
    'medium': [medium_X_train, medium_y_train],
}

# Hyperparameter grids
kernels = ['linear', 'poly', 'rbf']
lambds = [1.0, 0.1, 0.01, 0.001, 0.0001]

# Empty result containers
models, train_predictions, train_scores = {}, {}, {}

Manually implemented SVM Pegasos algorithm

In [None]:
import sklearn as sk
from sklearn.model_selection import train_test_split

In [None]:
# Our SVM algorithm Pegasos expects the labels to be encoded as +1 and -1
# Here we encode the one class we want to predict as 1, and every other class as -1
def one_vs_rest_encoding(y, outcome = 0):
    """Encode labels for a one-vs-rest binary classifier.

    Parameters
    ----------
    y : iterable
        Class labels, one per example.
    outcome : label
        The class treated as the positive class.

    Returns
    -------
    numpy.ndarray
        +1 where the label equals `outcome`, -1 everywhere else.
    """
    signs = []
    for label in y:
        if label == outcome:
            signs.append(1)
        else:
            signs.append(-1)

    y_encoded = np.array(signs)
    return y_encoded

In [None]:
# Compute the score for each example in X
def score(X, w): #keep
    """Return the linear score `X . w` for every example (row) of X.

    Parameters
    ----------
    X : array-like, one row per example
    w : array-like weight vector

    Returns
    -------
    The result of ``np.dot(X, w)``: one score per row of X.
    """
    decision_values = np.dot(X, w)
    return decision_values


In [None]:
def svm_objective(w, X, y, lambda1=.1): # keep
    """Regularised hinge-loss objective of a linear SVM.

    Computes ``(lambda1 / 2) * ||w||^2 + sum_i max(0, 1 - y_i * (x_i . w))``.
    The hinge term is a sum over the examples, not a mean.

    The computation is vectorised, so X may be a DataFrame or an ndarray and
    y may be an ndarray, a list or a Series with any index; rows are always
    paired by position.

    Parameters
    ----------
    w : array-like of length d
        Weight vector.
    X : array-like of shape (N, d)
        One example per row.
    y : array-like of length N
        Labels encoded as +1 / -1.
    lambda1 : float
        Regularisation strength.

    Returns
    -------
    float
        The objective value.
    """
    features = np.asarray(X, dtype=float)
    labels = np.asarray(y, dtype=float).ravel()
    weights = np.asarray(w, dtype=float).ravel()

    # Margin y_i * (x_i . w) of every example in one matrix-vector product
    margins = labels * (features @ weights)
    hinge_losses = np.maximum(0.0, 1.0 - margins)

    result = (lambda1 / 2) * np.dot(weights, weights) + hinge_losses.sum()
    return result

In [None]:
# stochastic sub-gradient descent
def pegasos(X_train, y_train, lambda1=0.08, num_iters = 3): #keep
    """Train a linear SVM with Pegasos-style sub-gradient descent.

    The examples are visited in their stored order on every pass (no random
    sampling). Update number t uses the step size ``1 / (lambda1 * t)``.

    Parameters
    ----------
    X_train : array-like of shape (N, d)
        Training examples, one per row (DataFrame or ndarray).
    y_train : array-like of length N
        Labels encoded as +1 / -1; paired with the rows of X_train by position.
    lambda1 : float
        Regularisation strength. It is also used for the objective value that
        is printed at the start of each pass.
    num_iters : int
        Number of full passes over the training set.

    Returns
    -------
    numpy.ndarray of shape (d,)
        The learned weight vector.
    """
    # Convert once to plain arrays: avoids per-row pandas indexing in the hot
    # loop and label-based lookups when y_train is a Series.
    features = np.asarray(X_train, dtype=float)
    targets = np.asarray(y_train, dtype=float).ravel()
    n_features = features.shape[1]

    step = 0
    # Initial weight vector
    w = np.ones((n_features,))

    for epoch in range(num_iters):
        # Report the objective under the same lambda1 that is being optimised
        print('Iteration %d. J: %.6f' % (epoch, svm_objective(w, X_train, targets, lambda1=lambda1)))

        for row, target in zip(features, targets):
            step += 1
            alpha = 1 / (lambda1 * step)
            if target * np.dot(w, row) >= 1:
                # Margin satisfied: only the regulariser contributes
                w = w - alpha * lambda1 * w
            else:
                # Margin violated: the hinge-loss sub-gradient also contributes
                w = w - alpha * (lambda1 * w - target * row)

    return w

In [None]:
# Hyperparameters shared by every one-vs-rest classifier
lambda1 = 0.1  # Regularization parameter
num_iters = 3   # Number of iterations

# NOTE(review): X_train, y_train, X_val and y_val are not defined in any
# visible cell; they presumably come from a train/validation split - confirm
# that this cell runs after a kernel restart.

# A) Train one one-vs-rest classifier per target class
labels = [0,1,2,3,4]
w_vals= {}
val_scores = {}

for label in labels:
    # Use the label value itself; indexing `labels` with it is only correct
    # while labels == [0, 1, ..., k-1].

    # Relabel the training targets as +1 (this class) / -1 (every other class)
    y_encoded = one_vs_rest_encoding(y_train, outcome=label)

    # Train Pegasos on X_train and y_encoded to get this class's weight vector
    w_vals[label] = pegasos(X_train, y_encoded, lambda1=lambda1, num_iters = num_iters)

    # Validation targets relabelled the same way (not used further in this cell)
    y_encoded_val = one_vs_rest_encoding(y_val, outcome=label)

    # Raw linear scores of the validation examples under this classifier
    val_scores[label] = score(X_val, w_vals[label])

Iteration 0. J: 813205914.250000
Iteration 1. J: 63830.941081
Iteration 2. J: 62768.361842
Iteration 0. J: 683528870.250000
Iteration 1. J: 252368.872695
Iteration 2. J: 251003.288436
Iteration 0. J: 428009511.250000
Iteration 1. J: 527623.472848
Iteration 2. J: 525549.161246
Iteration 0. J: 683285420.250000
Iteration 1. J: 253893.738209
Iteration 2. J: 251020.553854
Iteration 0. J: 813100828.250000
Iteration 1. J: 62657.819568
Iteration 2. J: 62824.293958


In [None]:
# Show the raw validation scores produced by each one-vs-rest classifier
for class_label in labels:
     print(class_label, ": score:", val_scores[class_label])

0 : score: [-1.09015643 -0.98939912 -0.9769     ... -1.0409848  -1.04311839
 -1.03618421]
1 : score: [-1.11572107 -0.96358153 -0.95828604 ... -1.01761975 -1.01535968
 -1.01246723]
2 : score: [-0.48464472  0.693264   -0.28555513 ...  0.56222395  0.59235821
  0.85890896]
3 : score: [-1.07980191 -0.97543728 -0.99541497 ... -1.01859306 -1.0084035
 -1.0110485 ]
4 : score: [-1.1045417  -0.99383677 -0.99228607 ... -1.04662122 -1.02662703
 -1.00696277]


In [None]:
# B) Label Prediction for Each Validation Set Example
# Each example gets the label of the classifier with the highest score; on a
# tie the classifier that comes first in `val_scores` wins.
# Computed with argmax so that no loop variable overwrites the `score()`
# function (which would break a re-run of the training cell).
class_keys = list(val_scores)
score_matrix = np.stack([np.asarray(val_scores[key]) for key in class_keys])  # (n_classes, n_examples)
predictions = [class_keys[best] for best in np.argmax(score_matrix, axis=0)]

In [None]:
# C) Accuracy on Validation Set Using Predicted Labels
# 'eval1' has one entry per validation example: 1 where the predicted label
# equals the actual label and 0 otherwise. (It is named 'eval1' so that it
# does not shadow the builtin `eval`.)

# Compare by position; np.asarray avoids label-based indexing if y_val is a Series
actual_labels = np.asarray(y_val)
eval1 = (np.asarray(predictions) == actual_labels).astype(float)

# Fraction of validation examples that were predicted correctly
accuracy = np.sum(eval1) / actual_labels.shape[0]

print("Accuracy Score:",accuracy)

Accuracy Score: 0.4884703452462651
