In [1]:
# Import required packages
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import svm
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [2]:
# Load the dataset from a CSV file into a DataFrame
# The relative path means the file must sit in the notebook's working directory
# (judging by the file name, this is 1-minute USD/SGD price data for 2019)
data = pd.read_csv('USD-to-SGD-2019-1-minute-updated.csv')

In [3]:
# Show the first 5 entries in the dataset to check the columns and the value format
data.head()

Unnamed: 0,timestamp,open,high,low,close,volume
0,2019-01-01 17:00,1.36239,1.3624,1.36239,1.3624,0.0
1,2019-01-01 17:01,1.36242,1.36242,1.36238,1.36238,0.0
2,2019-01-01 17:02,1.36256,1.36256,1.36256,1.36256,0.0
3,2019-01-01 17:03,1.36255,1.36255,1.36255,1.36255,0.0
4,2019-01-01 17:04,1.36256,1.3627,1.36256,1.3627,0.0


In [4]:
# Show general information about each column: non-null count and dtype,
# plus the total number of entries and the memory usage of the DataFrame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 370612 entries, 0 to 370611
Data columns (total 6 columns):
timestamp    370612 non-null object
open         370612 non-null float64
high         370612 non-null float64
low          370612 non-null float64
close        370612 non-null float64
volume       370612 non-null float64
dtypes: float64(5), object(1)
memory usage: 17.0+ MB


# Preprocessing data

In [5]:
# This sorts closing prices into either bad or good depending on the mean of all the closing prices in 2019
# Series.mean() takes the row count from the data itself instead of a hard-coded 370612,
# so the threshold stays correct if the CSV is updated, filtered or resampled
mean_closing_price = data['close'].mean()
# bins are right-inclusive intervals: (-inf, mean_closing_price] is bad, (mean_closing_price, +inf) is good
# The open-ended outer edges guarantee every price falls into a bin; fixed edges such as 1 and 2
# would silently turn any price outside that range into NaN
bins = (float('-inf'), mean_closing_price, float('inf'))
group_names = ['bad', 'good']
# NOTE: this overwrites the numeric 'close' column with the category labels,
# so reload the data before re-running this cell
data['close'] = pd.cut(data['close'], bins = bins, labels = group_names)
data['close'].unique()

[bad, good]
Categories (2, object): [bad < good]

In [6]:
# Create the encoder that will turn the 'bad'/'good' labels into integers
# LabelEncoder applies bad = 0 and good = 1
le = LabelEncoder()

In [7]:
# Learn the label-to-integer mapping from the 'close' column, then apply it to that same column
# After this step the 'close' column holds 0s and 1s in place of the bad/good labels
data['close'] = le.fit(data['close']).transform(data['close'])

In [8]:
# Show how many closing prices fall into each class (0 = bad, 1 = good)
# This also reveals whether the two classes are balanced
data['close'].value_counts()

0    230832
1    139780
Name: close, dtype: int64

# Response variable and Feature variable

In [9]:
# Split the dataset into the feature variable (X) and the response variable (y) used by the model
# y is the encoded 'close' column, i.e. the bad/good label (0 or 1), not the raw closing price
# X is every remaining column once 'timestamp', 'volume' and 'close' are removed
# The model therefore predicts the bad/good label of the closing price from the other price columns
# 'timestamp' and 'volume' are left out because they are assumed not to help with that prediction
y = data['close']
X = data.drop(columns = ['timestamp', 'volume', 'close'])

# Train/Test split

In [10]:
# Split the dataset into a training set and a testing set
#   - the training set is used to fit the model
#   - the testing set is used to measure how accurate the fitted model is
# A common choice, used here, is 80% for training and 20% for testing

# train_test_split assigns rows to the two subsets at random
# Without a fixed random_state every run produces a different split and therefore different results
# Fixing random_state makes the split, and everything computed from it, repeatable from run to run
# The particular number does not matter as long as it stays the same, which is important for testing
# NOTE(review): the rows are time-ordered 1-minute prices, so a random split puts neighbouring minutes
# into both subsets - confirm that this is acceptable for the intended evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
)

# Standard scaling

In [11]:
# Standard scaling rescales each feature/column of X individually to mean = 0 and standard deviation = 1
# so that the features are on a comparable scale before the machine learning model is applied
# The scaler learns its statistics from the training set only and the same statistics are then
# applied to both subsets, so nothing about the test set influences the scaling
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# SVM Classifier

In [12]:
# Create a support vector classifier with its default settings
clf = SVC()
# Fit it on the scaled training set, then predict the bad/good label for every scaled test row
pred_clf = clf.fit(X_train, y_train).predict(X_test)

In [13]:
# Look at the first 20 predictions for the test set (0 = bad, 1 = good,
# compared to the mean of all the closing prices in 2019)
# In the recorded output of this run, 10 of these 20 are bad and 10 are good
pred_clf[:20]
# NOTE(review): an even split in a sample of only 20 predictions says nothing about market behaviour
# Over the full dataset the classes are not balanced (230832 bad vs 139780 good)

array([1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1])

In [14]:
# See how well the model performed on the test set
# Prints the per-class precision/recall/f1 report first, then the confusion matrix on the following line
print(classification_report(y_test, pred_clf), confusion_matrix(y_test, pred_clf), sep = '\n')
# In this run the model predicts both bad and good closing prices well
# Confusion matrix layout: rows (y-axis) are the actual values, columns (x-axis) are the predicted values
#   [[a  b]
#    [c  d]]
# Using the numbers recorded for this run:
# From a, fraction of bad closing prices predicted correctly = 46257/(46257 + 48)
# From b, fraction of bad closing prices predicted wrongly = 48/(46257 + 48)
# From c, fraction of good closing prices predicted wrongly = 30/(27788 + 30)
# From d, fraction of good closing prices predicted correctly = 27788/(27788 + 30)
# NOTE(review): the label comes from the closing price and the features are the open/high/low of the
# same minute, which are almost equal to it, so near-perfect scores are expected - confirm this is intended

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     46305
           1       1.00      1.00      1.00     27818

    accuracy                           1.00     74123
   macro avg       1.00      1.00      1.00     74123
weighted avg       1.00      1.00      1.00     74123

[[46257    48]
 [   30 27788]]


In [15]:
# Get the overall accuracy of the model: the fraction of test rows whose label was predicted correctly
# Note that despite its name, cm holds this accuracy score, not a confusion matrix
cm = accuracy_score(y_true = y_test, y_pred = pred_clf)
cm

0.9989476950474212