In [None]:
#Forex Prediction Data Analysis
"""
OBJECTIVES
1.Load and process the data set
2.Inspect the data for empty values, wrong data, types, wrong formats, duplicates and outliners
3.Analyze the data to find patterns and relationships (correlations)
4.Visualize the data(graphs) to understand trends and distributions [in EUR/USD exchange rate movements for our dataset]
5.Evaluate the model performance using appropriate metrics.
6.Save the cleaned and processed data for future use.
"""

In [12]:
#Import all the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

df = pd.read_csv('forex_predictions_data.csv')
#first 5 rows
df.head()




Unnamed: 0,Date,Open,High,Low,Close,Volume,Predicted_Close,Currency_Pair,Signal,Confidence
0,2024-01-01,1.18727,1.92461,0.85312,1.18154,2201,1.22984,EUR/USD,Hold,0.9
1,2024-01-02,1.47536,1.82881,0.54067,1.32296,error,1.03797,EUR/USD,Sell,
2,2024-01-03,1.366,1.78415,0.54242,1.28539,4420,1.03888,EUR/USD,Sell,
3,2024-01-04,1.29933,1.54684,0.99332,1.17805,4079,1.00117,EUR/USD,Sell,0.64
4,2024-01-05,1.07801,1.68386,0.68714,,1832,1.48385,EUR/USD,Sell,0.68


In [11]:
#the statistics
df.describe()

Unnamed: 0,Open,High,Low,Close,Predicted_Close,Confidence
count,224.0,220.0,225.0,212.0,222.0,218.0
mean,1.239946,1.753113,0.75003,1.245072,1.250415,0.756468
std,0.148956,0.147816,0.150434,0.140594,0.156102,0.135125
min,1.00253,1.50542,0.50568,1.00232,1.00012,0.5
25%,1.11186,1.638832,0.61432,1.122535,1.11528,0.6525
50%,1.248105,1.7634,0.74631,1.23591,1.259605,0.76
75%,1.3651,1.877682,0.87757,1.368805,1.392158,0.87
max,1.49344,1.99525,0.99986,1.49844,1.49968,1.0


In [10]:
#the shape of the data-number of rows and columns
df.shape

(229, 10)

In [13]:
#check for missing values or null values
print(df.isnull().sum())

Date                3
Open                5
High                9
Low                 4
Close              17
Volume              3
Predicted_Close     7
Currency_Pair       0
Signal              2
Confidence         11
dtype: int64


In [None]:
#handling the missing values

#fill empty values with median, for numeric columns

#an array of numeric columns
numeric_cols = ['Open','High','Low','Close','Volume','Predicted_Close','Confidence']
for col in numeric_cols:
    if col in df.columns:
        
        #this makes it easy to get the median for the cells with numbers and fill the empty ones with a value.
        df[col] = pd.to_numeric(df[col],errors='coerce') #Convert to numeric,and anywhere else without a number coerce it with NaN
        
        #fillna with fillna using median
        df[col] = df[col].fillna(df[col].median())
        


#for values of a column in categories or [non-numeric],use can use the MODAL VALUE for empty cells.
categorical_cols =['Signal']
for col in categorical_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col],errors='coerce')
        #fill NaN with mode
        # Check if mode exists before filling NaN to avoid KeyError
        mode_series = df[col].mode()
        if not mode_series.empty:
            df[col] = df[col].fillna(mode_series[0])
        else:
            # If mode is empty, fill with a placeholder or leave as NaN
            df[col] = df[col]
        
#check again for missing values.
print("Missing values after handling :")
print(df.isnull().sum())


Missing values after handling :
Date                 1
Open                 0
High                 0
Low                  0
Close                0
Volume               0
Predicted_Close      0
Currency_Pair        0
Signal             217
Confidence           0
dtype: int64


In [21]:
#data in wrong format

#Convert 'Date' to datetime format
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

#handle volume column with errors
#1.First convert to numeric
df['Volume'] = pd.to_numeric(df['Volume'], errors='coerce')
#2.fill all the NaN with the median
df['Volume'] = df['Volume'].fillna(df['Volume'].median())

#handle confidence column
df['Confidence'] = pd.to_numeric(df['Confidence'], errors='coerce')
#2.fill all the NaN with the median
df['Confidence'] = df['Confidence'].fillna(df['Confidence'].median())



In [None]:
#Wrong data types
price_cols =['Open','High','Low','Close','Predicted_Close']

#deviations beyond +3 are outliers - use standard devition to remove them.
for col in price_cols:
    if col in df.columns:
        #remove outliers (beyond 3 standard deviations)
        mean = df[col].mean()
        std = df[col].std()
        df[col] =np.where((df[col]<mean-3*std) | (df[col]>mean+3*std),np.nan, df[col])
        #after removing the outliers fill them with median
        df[col] = df[col].fillna(df[col].median())
      

In [None]:
#Check for duplicates - Number of duplicate rows
print("Number of duplicate rows: ",df.duplicated().sum())

Number of duplicate rows:  10


In [24]:
#remove duplicates - use drop
df =df.drop_duplicates()
print("Number of rows after removing duplicates:",df.shape[0])

Number of rows after removing duplicates: 219


In [25]:
#Duplicate dates are handled differently
print("Number of duplicate dates:",df['Date'].duplicated().sum())
#Remove duplicate dates
df =df.drop_duplicates(subset=['Date'], keep ='first')
#shape
df.shape

Number of duplicate dates: 2


(217, 10)

In [None]:
#correlation analysis
# calculate correlation matrix
corr_matrix = df[['Open', 'High', 'Low', 'Close', 'Volume', 'Predicted_Close', 'Confidence']].corr()

# Plot the correlation heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, center=True)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Actual vs Predicted Close Prices
plt.figure(figsize=(14, 7))
plt.plot(df['Date'], df['Close'], label='Actual Close Price', color='blue')
plt.plot(df['Date'], df['Predicted_Close'], label='Predicted Close Price', color='orange')
plt.xlabel('Date')
plt.ylabel('Price')
plt.title('Actual vs Predicted Close Prices')
plt.legend()
plt.grid()
plt.show()

In [None]:
#Signal Performance Analysis
df['Prediction_Error'] = abs(df['Close'] - df['Predicted_Close'])

# Analyze error by signal type
signal_error = df.groupby('Signal')['Prediction_Error'].mean()
signal_confidence = df.groupby('Signal')['Confidence'].mean()
# Plotting the average prediction error by signal type
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
signal_error.plot(kind='bar', color='skyblue')
plt.title('Average Prediction Error by Signal Type')
plt.xlabel('Signal Type')

# Plotting the average confidence by signal type
plt.subplot(1, 2, 2)
signal_confidence.plot(kind='bar', color='lightgreen')
plt.title('Average Confidence by Signal Type')
plt.xlabel('Signal Type')

In [None]:
#Volatility analysis - shows when it is high and when it low to know when to trade.
#Volatility - measure of how prices keep changing
#we can look at daily returns/range
# Calculate daily returns
df['Daily_Range'] = df['High']-df['Low']
# Plot volatility over time
plt.figure(figsize=(14, 7))
plt.plot(df['Date'], df['Daily_Range'], label='Daily Range', color='purple')
plt.xlabel('Date')
plt.ylabel('Daily Range')
plt.title('Volatility Over Time')
plt.legend()
plt.grid()
plt.show()


In [None]:
# Key findings
# 1. The dataset has been cleaned and preprocessed, with missing values handled and outliers removed.
# 2. The correlation analysis shows strong relationships between the price columns, particularly between 'Open', 'High', 'Low', and 'Close'.
# 3. The actual vs predicted close prices plot indicates that the model performs reasonably well, but there are some discrepancies.
# 4. The signal performance analysis shows that the average prediction error varies by signal type, with 'Buy' signals generally having lower errors.
# 5. The volatility analysis indicates fluctuations in the daily range, which can be useful for understanding market behavior.
# 6. The cleaned and processed data is ready for further analysis or modeling.
# Save the cleaned and processed data for future use


# Price correlation analysis

# Recemmendations for future work
# 1. Model Improvement: Explore more advanced machine learning models or deep learning techniques to improve prediction accuracy.
# 2. Feature Engineering: Create additional features that may capture market dynamics better, such as technical indicators (e.g., moving averages, RSI).
# 3. Time Series Analysis: Implement time series analysis techniques to capture trends and seasonality in the data.
# 4. Backtesting: Implement a backtesting framework to evaluate the performance of trading strategies based on the predictions.
# 5. Real-time Data Integration: Consider integrating real-time data feeds to make predictions on live market conditions.
# 6. Model Deployment