In [140]:
# Import required libraries
import os
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from scipy.optimize import curve_fit
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Linear Regression

## Data Loading

In [141]:
# Load every CSV found in the price-data folder and stack them into one frame.
# Each row is tagged with the first three characters of its source file name,
# which later cells use as the currency code to filter on.
directory = 'Resources/Price-Data'
# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# merged_df (and of the exported CSV) the same on every run and platform.
all_files = sorted(os.listdir(directory))
df_list = []

for file in all_files:
    if file.endswith('.csv'):
        df = pd.read_csv(os.path.join(directory, file))
        # Extract the first three letters of the filename
        file_prefix = file[:3]
        # Create a new column with the file prefix value for each row
        df['Index'] = file_prefix
        df_list.append(df)

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when the folder holds no CSV files.
if not df_list:
    raise ValueError(f"No .csv files found in '{directory}'")

# Row labels are kept per file (no ignore_index), so they repeat across files.
merged_df = pd.concat(df_list)

# Persist the combined data next to (not inside) the input folder
merged_df.to_csv('Resources/merged_df.csv', index=False)

merged_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Index
0,2003-12-01,3.672,3.672,3.6715,3.6715,0,AED
1,2003-12-02,3.6709,3.672,3.6708,3.6715,0,AED
2,2003-12-03,3.6719,3.672,3.6715,3.6715,0,AED
3,2003-12-04,3.6718,3.672,3.6715,3.6715,0,AED
4,2003-12-05,3.6719,3.672,3.6718,3.6718,0,AED


## Select Currency

In [142]:
# Currency code to analyse; must match a value of merged_df['Index']
index_value = 'GNF' 

# Keep only the rows belonging to the chosen currency
is_selected_currency = merged_df['Index'].eq(index_value)
df_currency = merged_df.loc[is_selected_currency]

df_currency.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Index
0,2003-12-01,1954.099976,1954.099976,1954.099976,1954.099976,0,GNF
1,2003-12-02,1951.400024,1951.400024,1951.400024,1951.400024,0,GNF
2,2003-12-03,1954.0,1954.0,1954.0,1954.0,0,GNF
3,2003-12-04,1952.800049,1952.800049,1952.800049,1952.800049,0,GNF
4,2003-12-05,1953.400024,1953.400024,1953.400024,1953.400024,0,GNF


## Data Cleaning

In [143]:
# Flag rows whose 'Open' value lies outside the 1.5*IQR fences.
# NaNs are only dropped in the following cleaning cell, so use the NaN-aware
# quantile here: plain np.quantile returns NaN for both quartiles as soon as
# one value is missing, which would silently disable the outlier check.
quartiles = np.nanquantile(df_currency["Open"], [0.25, 0.75])
iqr = quartiles[1]-quartiles[0]
lower_bound = quartiles[0]-(1.5*iqr)
upper_bound = quartiles[1]+(1.5*iqr)

# Comparisons against NaN are False, so rows with a missing 'Open' are not flagged here
potential_outliers = df_currency[(df_currency["Open"] < lower_bound) | (df_currency["Open"] > upper_bound)]
potential_outliers

Unnamed: 0,Date,Open,High,Low,Close,Volume,Index
3050,2015-08-27,72227.0,72615.0,7204.5,7204.5,0,GNF


In [144]:
# Clean the selected currency's rows and derive the regression feature.
# The steps are chained without inplace mutation, and both drops use
# errors='ignore', so re-running this cell is harmless: the original version
# raised KeyError on a second run because the outlier labels and the 'Index'
# column had already been removed.
df_currency = (
    df_currency
    # Remove the rows flagged as outliers
    .drop(index=potential_outliers.index, errors='ignore')
    # Remove rows with missing values
    .dropna()
    # Remove duplicated rows
    .drop_duplicates()
    # Remove the categorical column; every remaining row has the same currency code
    .drop(columns='Index', errors='ignore')
)

# Convert the 'Date' column to datetime format
df_currency = df_currency.assign(Date=pd.to_datetime(df_currency['Date']))

# Calculate the number of days since the earliest date in the cleaned data
start_date = df_currency['Date'].min()
df_currency = df_currency.assign(Days=(df_currency['Date'] - start_date).dt.days)

## Visualization

In [145]:
# Plot the cleaned 'Open' values against the calendar date
open_chart_options = dict(
    x='Date',
    y='Open',
    xlabel='Date',
    ylabel='Currency',
    title='Currency vs Time',
)
df_currency.hvplot.line(**open_chart_options)

## Data Preparation

In [146]:
# Independent variable X: the 'Days' column as a 2-D, single-column array.
# Selecting with a one-element list keeps the column dimension.
X = df_currency[['Days']].to_numpy()

# Peek at the first few rows
X[:5]

array([[0],
       [1],
       [2],
       [3],
       [4]], dtype=int64)

In [147]:
# The shape of X is (number of cleaned rows, 1): one sample per row, with a single feature (column)
X.shape

(5206, 1)

In [148]:
# Dependent variable y: the 'Close' column, kept as a pandas Series
y = df_currency.loc[:, "Close"]

## Building the Linear Regression Model

In [149]:
# Create an unfitted scikit-learn linear regression model with default settings
model = LinearRegression()

In [150]:
# Fit the model on the full data set: X holds the day offsets, y the 'Close' values
model.fit(X, y)

In [151]:
# Display the slope: coef_ is an array with one entry per feature (here only 'Days')
print(f"Model's slope: {model.coef_}")

Model's slope: [0.96478236]


In [152]:
# Display the y-intercept, i.e. the fitted value at Days == 0
print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: 3260.6417775501363


In [153]:
# Display the model's best fit line formula, where X is the number of days since start_date
# NOTE: a negative slope would be rendered as "+ -<value>X"
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's formula: y = 3260.6417775501363 + 0.9647823624705313X


In [154]:
# Evaluate the fitted line at a chosen date
future_date = pd.Timestamp('2024-01-01')  # Change this date to your desired future date

# The model's only feature is the day offset from start_date, so express the date the same way
future_days = (future_date - start_date).days

# predict() expects a 2-D input of shape (n_samples, n_features)
future_X = np.array([[future_days]])
predicted_currency = model.predict(future_X)

# Display the prediction for the chosen date
print(f"Predicted closing currency on {future_date} is: {predicted_currency[0]:.2f}")

Predicted closing currency on 2024-01-01 00:00:00 is: 10338.29


In [155]:
# Evaluate the fitted line on the same X the model was trained on (in-sample predictions)
predicted_y_values = model.predict(X)

In [156]:
# Build a new frame holding the cleaned data plus the model's fitted values.
# assign() returns a new DataFrame, so df_currency itself is left untouched.
df_currency_predicted = df_currency.assign(currency_predicted=predicted_y_values)

# Display sample data
df_currency_predicted.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Days,currency_predicted
0,2003-12-01,1954.099976,1954.099976,1954.099976,1954.099976,0,0,3260.641778
1,2003-12-02,1951.400024,1951.400024,1951.400024,1951.400024,0,1,3261.60656
2,2003-12-03,1954.0,1954.0,1954.0,1954.0,0,2,3262.571342
3,2003-12-04,1952.800049,1952.800049,1952.800049,1952.800049,0,3,3263.536125
4,2003-12-05,1953.400024,1953.400024,1953.400024,1953.400024,0,4,3264.500907


In [157]:
# Line plot of the fitted (predicted) values alone, against the date
best_fit_options = dict(
    x='Date',
    y='currency_predicted',
    xlabel='Date',
    ylabel='Predicted Currency',
    title='Predicted Currency Vs. Time',
)
best_fit_line = df_currency_predicted.hvplot.line(**best_fit_options)
best_fit_line

In [158]:
# Superpose the observed data and the best fit line.
# The model was fitted on 'Close', so the fitted line is overlaid on 'Close'
# (not 'Open') to compare the prediction with the series it was trained on.
df_currency_predicted.hvplot.line(
    x='Date',
    y=['Close', 'currency_predicted'], 
    xlabel='Date',
    ylabel='Currency',
    title='Currency vs Predicted Currency'
)

## Linear Regression Model Assessment

In [159]:
# Assess the fit on the same X and y the model was trained on (in-sample metrics).
predicted_y_values = model.predict(X)

# Goodness of fit: LinearRegression.score reports R², so `score` and `r2`
# are the same quantity obtained through two different calls.
r2 = r2_score(y, predicted_y_values)
score = model.score(X, y)

# Error magnitude, in squared units (mse) and in the units of y (rmse)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)

# Population standard deviation of y (ddof=0), a baseline to compare the rmse against
std = y.std(ddof=0)

# Print relevant metrics
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.8438147750799011.
The r2 is 0.8438147750799011.
The mean squared error is 765161.3285168055.
The root mean squared error is 874.73500473961.
The standard deviation is 2213.382809357908.
