In [1]:
# Import required libraries
import os
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from scipy.optimize import curve_fit
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Linear Regression

## Data Loading

In [2]:
directory = 'Resources/Price-Data'
all_files = os.listdir(directory)
df_list = []
for file in all_files:
    if file.endswith('.csv'):
        df = pd.read_csv(os.path.join(directory, file))
        # Extract the first three letters of the filename
        file_prefix = file[:3]
        # Create a new column with the file prefix value for each row
        df['Index'] = file_prefix
        df_list.append(df)
merged_df = pd.concat(df_list)
merged_df.to_csv('Resources/merged_df.csv', index=False)
merged_df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Index
0,2003-12-01,3.672000,3.672000,3.671500,3.671500,0,AED
1,2003-12-02,3.670900,3.672000,3.670800,3.671500,0,AED
2,2003-12-03,3.671900,3.672000,3.671500,3.671500,0,AED
3,2003-12-04,3.671800,3.672000,3.671500,3.671500,0,AED
4,2003-12-05,3.671900,3.672000,3.671800,3.671800,0,AED
...,...,...,...,...,...,...,...
2842,2023-12-01,23.698650,23.712244,23.698650,23.681280,0,ZMW
2843,2023-12-04,23.759245,23.791113,23.758589,23.720627,0,ZMW
2844,2023-12-05,23.843790,23.926725,23.843790,23.791113,0,ZMW
2845,2023-12-06,23.988028,24.016075,23.988028,23.926725,0,ZMW


## Select Currency

In [3]:
# Select currency index from merged_df
index_value = 'EUR' 

# Filter the DataFrame for rows with the specific index value
df_currency = merged_df[merged_df['Index'] == index_value]

df_currency.tail()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Index
5190,2023-12-01,0.9181,0.92323,0.9163,0.9181,0,EUR
5191,2023-12-04,0.91843,0.92544,0.9185,0.91843,0,EUR
5192,2023-12-05,0.9227,0.92742,0.9218,0.9227,0,EUR
5193,2023-12-06,0.92647,0.92823,0.92561,0.92647,0,EUR
5194,2023-12-07,0.9284,0.9294,0.927,0.928,0,EUR


## Data Cleaning

In [4]:
# Remove NAs
df_currency.dropna(inplace=True)

# Removing Duplicates
df_currency = df_currency.drop_duplicates()

# Remove Catagorical Columns
df_currency.drop(columns='Index', inplace=True)

# Convert the 'Date' column to datetime format
df_currency['Date'] = pd.to_datetime(df_currency['Date'])

# Calculate the number of days since the start date
start_date = df_currency['Date'].min()
df_currency['Days'] = (df_currency['Date'] - start_date).dt.days

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_currency.dropna(inplace=True)


## Visualization

In [5]:
# Line chart of the currency exchange rate
df_currency.hvplot.line(
    x='Date',
    y= 'Open',
    xlabel='Date',
    ylabel='Currency',
    title='Currency vs Time'
)

## Data Preparation

In [6]:
# Reformat data of the independent variable X as a single-column array
X = df_currency['Days'].values.reshape(-1, 1)

# Display sample data
X[:5]

array([[0],
       [1],
       [2],
       [3],
       [4]], dtype=int64)

In [7]:
# The shape of X is 30 samples, with a single feature (column)
X.shape

(5195, 1)

In [8]:
# Create an array for the dependent variable y
y = df_currency["Close"]

## Building the Linear Regression Model

In [9]:
# Create a model with scikit-learn
model = LinearRegression()

In [10]:
# Fit the data into the model
model.fit(X, y)

In [11]:
# Display the slope
print(f"Model's slope: {model.coef_}")

Model's slope: [2.645008e-05]


In [12]:
# Display the y-intercept
print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: 0.7197308385173559


In [13]:
# Display the model's best fit line formula
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's formula: y = 0.7197308385173559 + 2.6450080039475924e-05X


In [30]:
# Predict the currency for a future date
future_date = pd.to_datetime('2024-01-01')  # Change this date to your desired future date
future_days = (future_date - start_date).days  # Calculate days since start date for the future date
linear_predicted_currency = model.predict([[future_days]])  # Predict for the future date

# Display the prediction for the future date
print(f"Predicted closing currency on {future_date} is: {linear_predicted_currency[0]:.2f}")

Predicted closing currency on 2024-01-01 00:00:00 is: 0.91


In [15]:
# Make predictions using the X set
predicted_y_values = model.predict(X)

In [16]:
# Create a copy of the original data
df_currency_predicted = df_currency.copy()

# Add a column with the predicted currency values
df_currency_predicted["currency_predicted"] = predicted_y_values

# Display sample data
df_currency_predicted.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Days,currency_predicted
0,2003-12-01,0.83098,0.83724,0.83056,0.83577,0,0,0.719731
1,2003-12-02,0.83605,0.8371,0.82583,0.8272,0,1,0.719757
2,2003-12-03,0.82713,0.82802,0.8244,0.82488,0,2,0.719784
3,2003-12-04,0.82508,0.83029,0.82345,0.82775,0,3,0.71981
4,2003-12-05,0.82795,0.82878,0.82028,0.82055,0,4,0.719837


In [17]:
# Create a line plot of open versus the predicted currency values
best_fit_line = df_currency_predicted.hvplot.line(
    x='Date',
    y='currency_predicted',
    xlabel='Date',
    ylabel='Predicted Currency',
    title='Predicted Currency Vs. Time'
)
best_fit_line

In [18]:
# Superpose the original data and the best fit line
# Create a line plot of 'Open' versus the predicted currency values
linear_regression = df_currency_predicted.hvplot.line(
    x='Date',
    y=['Open', 'currency_predicted'], 
    xlabel='Date',
    ylabel='Currency',
    title='Currency vs Predicted Currency'
)
linear_regression

## Linear Regression Model Assessment

In [19]:
# Compute metrics for the linear regression model: score, r2, mse, rmse, std
score = model.score(X, y)
predicted_y_values = model.predict(X)
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
std = np.std(y)

# Print relevant metrics
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.46260685226816245.
The r2 is 0.46260685226816245.
The mean squared error is 0.0036246942510347853.
The root mean squared error is 0.060205433733466165.
The standard deviation is 0.08212769371297107.


# Polynomial Regression

## Building the Polynomial Regression Model

In [20]:
from sklearn.preprocessing import PolynomialFeatures

# Define the degree of the polynomial
degree = 2  # You can change this to the degree you want for the polynomial

# Create polynomial features
poly_features = PolynomialFeatures(degree=degree)
X_poly = poly_features.fit_transform(X)

In [21]:
# Create a polynomial regression model
poly_model = LinearRegression()
poly_model.fit(X_poly, y)

In [22]:
# Predict the currency for a future date using polynomial regression
future_days_poly = poly_features.transform([[future_days]])
predicted_currency_poly = poly_model.predict(future_days_poly)
# Display the prediction for the future date using polynomial regression
print(f"Predicted closing currency on {future_date} using polynomial regression is: {predicted_currency_poly[0]:.2f}")

Predicted closing currency on 2024-01-01 00:00:00 using polynomial regression is: 0.97


In [23]:
# Make predictions using the X set for polynomial regression
predicted_y_values_poly = poly_model.predict(X_poly)

In [24]:
# Create a copy of the original data for polynomial regression
df_currency_predicted_poly = df_currency.copy()

# Add a column with the predicted currency values from polynomial regression
df_currency_predicted_poly["currency_predicted_poly"] = predicted_y_values_poly

# Display sample data
df_currency_predicted_poly.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Days,currency_predicted_poly
0,2003-12-01,0.83098,0.83724,0.83056,0.83577,0,0,0.775149
1,2003-12-02,0.83605,0.8371,0.82583,0.8272,0,1,0.77513
2,2003-12-03,0.82713,0.82802,0.8244,0.82488,0,2,0.775111
3,2003-12-04,0.82508,0.83029,0.82345,0.82775,0,3,0.775092
4,2003-12-05,0.82795,0.82878,0.82028,0.82055,0,4,0.775073


In [25]:
# Create a line plot of open versus the predicted currency values using polynomial regression
best_fit_line_poly = df_currency_predicted_poly.hvplot.line(
    x='Date',
    y='currency_predicted_poly',
    xlabel='Date',
    ylabel='Predicted Currency (Polynomial Regression)',
    title='Predicted Currency Vs. Time (Polynomial Regression)'
)
best_fit_line_poly

In [26]:
# Superpose the original data and the best fit line using polynomial regression
poly_regression = df_currency_predicted_poly.hvplot.line(
    x='Date',
    y=['Open', 'currency_predicted_poly'],
    xlabel='Date',
    ylabel='Currency',
    title='Currency vs Predicted Currency (Polynomial Regression)'
)
poly_regression

## Polynomial Regression Model Assessment

In [27]:
# Compute metrics for the polynomial regression model: score, r2, mse, rmse, std
score_poly = poly_model.score(X_poly, y)
predicted_y_values_poly = poly_model.predict(X_poly)
r2_poly = r2_score(y, predicted_y_values_poly)
mse_poly = mean_squared_error(y, predicted_y_values_poly)
rmse_poly = np.sqrt(mse_poly)
std_poly = np.std(y)

# Print relevant metrics for polynomial regression
print(f"The score for polynomial regression is {score_poly}.")
print(f"The r2 for polynomial regression is {r2_poly}.")
print(f"The mean squared error for polynomial regression is {mse_poly}.")
print(f"The root mean squared error for polynomial regression is {rmse_poly}.")


The score for polynomial regression is 0.5542577077128357.
The r2 for polynomial regression is 0.5542577077128357.
The mean squared error for polynomial regression is 0.003006513073558179.
The root mean squared error for polynomial regression is 0.05483167947052305.


# Comparisons

## Assessment Comparison

In [28]:
# Create DataFrames to store results
linear_results = pd.DataFrame(columns=['Model', 'Score', 'R2', 'MSE', 'RMSE', 'STD'])
poly_results = pd.DataFrame(columns=['Model', 'Score', 'R2', 'MSE', 'RMSE', 'STD'])

# Function to calculate metrics and append results to the DataFrames
def append_results(model, X_data, y_data, model_type, results_df):
    score = model.score(X_data, y_data)
    predicted_y_values = model.predict(X_data)
    r2 = r2_score(y_data, predicted_y_values)
    mse = mean_squared_error(y_data, predicted_y_values)
    rmse = np.sqrt(mse)
    std = np.std(y_data)
    
    results_df = results_df.append({
        'Model': model_type,
        'Score': score,
        'R2': r2,
        'MSE': mse,
        'RMSE': rmse,
        'STD': std
    }, ignore_index=True)
    
    return results_df

# Calculate metrics for linear regression
linear_results = append_results(model, X, y, 'Linear', linear_results)

# Calculate metrics for polynomial regression
poly_results = append_results(poly_model, X_poly, y, 'Polynomial', poly_results)

# Concatenate results from both models into one DataFrame
all_results = pd.concat([linear_results, poly_results])

# Display the comparison DataFrame
all_results

  results_df = results_df.append({
  results_df = results_df.append({


Unnamed: 0,Model,Score,R2,MSE,RMSE,STD
0,Linear,0.462607,0.462607,0.003625,0.060205,0.082128
0,Polynomial,0.554258,0.554258,0.003007,0.054832,0.082128


## Prediction Comparison

In [31]:
# Define the future date
future_date = pd.to_datetime('2024-01-01')
future_days = (future_date - start_date).days

# Predict the currency for the future date using linear regression
linear_predicted_currency = model.predict([[future_days]])

# Predict the currency for the future date using polynomial regression
future_days_poly = poly_features.transform([[future_days]])
predicted_currency_poly = poly_model.predict(future_days_poly)

# Create a DataFrame for the predicted values
predicted_values_df = pd.DataFrame({
    'Date': [future_date, future_date],
    'Model': ['Linear Regression', 'Polynomial Regression'],
    'Predicted Currency': [linear_predicted_currency[0], predicted_currency_poly[0]]
})

# Display the DataFrame
predicted_values_df

Unnamed: 0,Date,Model,Predicted Currency
0,2024-01-01,Linear Regression,0.913769
1,2024-01-01,Polynomial Regression,0.970306


## Regression Line Comparison

In [29]:
import matplotlib.pyplot as plt

# Create plots for linear regression
linear_regression = df_currency_predicted.hvplot.line(
    x='Date',
    y=['Open', 'currency_predicted'], 
    xlabel='Date',
    ylabel='Currency',
    title='Currency vs Predicted Currency (Linear Regression)'
)

# Create plots for polynomial regression
poly_regression = df_currency_predicted_poly.hvplot.line(
    x='Date',
    y=['Open', 'currency_predicted_poly'],
    xlabel='Date',
    ylabel='Currency',
    title='Currency vs Predicted Currency (Polynomial Regression)'
)

# Combine both plots side by side
combined_plots = linear_regression + poly_regression

# Display the combined plot
combined_plots.cols(1)
