In [75]:
import numpy as np 
import pandas as pd  
import seaborn as sns  
import matplotlib.pyplot as plt

!pip install xgboost
!pip install eli5

import warnings
warnings.filterwarnings("ignore") 
sns.set_style("darkgrid", {"grid.color": ".6","grid.linestyle": ":"})

from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline 

Collecting xgboost
  Using cached xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.5/150.0 MB 2.8 MB/s eta 0:00:54
   ---------------------------------------- 1.0/150.0 MB 2.4 MB/s eta 0:01:03
   ---------------------------------------- 1.8/150.0 MB 2.8 MB/s eta 0:00:54
    --------------------------------------- 2.6/150.0 MB 3.1 MB/s eta 0:00:48
    --------------------------------------- 3.7/150.0 MB 3.4 MB/s eta 0:00:44
   - -------------------------------------- 4.5/150.0 MB 3.5 MB/s eta 0:00:42
   - -------------------------------------- 5.2/150.0 MB 3.6 MB/s eta 0:00:40
   - -------------------------------------- 6.3/150.0 MB 3.8 MB/s eta 0:00:38
   - -------------------------------------- 7.3/150.0 MB 3.8 MB/s eta 0:00:38
   -- -

In [76]:
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor 
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the gold price dataset with pandas.
# parse_dates converts the "Date" column to datetime64 on load; without it
# the column is read as plain strings.
# NOTE(review): this is an absolute, user-specific path - point DATA_PATH at a
# project-relative data directory before sharing the notebook.
DATA_PATH = "C:/Users/upadh/Downloads/gold_price_data.csv"
data = pd.read_csv(DATA_PATH, parse_dates=["Date"])

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes)
# to check what was loaded before any cleaning is done.
data.info()

In [None]:
# Number of missing (NaN) entries in each column, worst columns listed first
missing_per_column = data.isna().sum()
missing_per_column.sort_values(ascending=False)

In [None]:
# Pairwise correlation, restricted to the numeric columns
numeric_columns = data.select_dtypes(include='number')
correlation = numeric_columns.corr()

# Annotated heatmap with a diverging palette centred on zero
fig, ax = plt.subplots()
sns.heatmap(correlation, annot=True, center=0, cmap='coolwarm', ax=ax)

# Title and axis labels
ax.set_title('Correlation Matrix Heatmap')
ax.set_xlabel('Features')
ax.set_ylabel('Features')

# Render the figure
plt.show()

In [None]:
# Drop the SLV column.
# Rebinding `data` (instead of inplace=True) together with errors="ignore"
# keeps this cell idempotent: re-running it once SLV is already gone no
# longer raises a KeyError.
data = data.drop(columns="SLV", errors="ignore")

In [None]:
# Plot the EUR/USD series against the DataFrame index.
# The title used to say "price of gold", but the column being plotted is
# EUR/USD, so the label now names the data that is actually shown.
ax = data["EUR/USD"].plot()
ax.set_title("Change in EUR/USD through date")
ax.set_xlabel("date")
ax.set_ylabel("price")
plt.show()

In [None]:
# Ratio of each EUR/USD value to its trailing 20-row rolling mean
data["price_trend"] = data["EUR/USD"] / data["EUR/USD"].rolling(window=20).mean()

# Move "Date" back into the columns only when it really is the index.
# An unconditional reset_index() on the default index inserts a spurious
# "index" column (later picked up as a model feature) and eventually raises
# when the cell is re-run.
if data.index.name == "Date":
    data = data.reset_index()

# Rows without a full 20-row window have a NaN trend; drop exactly those
# rather than slicing by a hard-coded row label.
data["price_trend"].dropna().plot()

# Set title and labels
plt.title("Trend in price of EUR/USD through date")
plt.xlabel("date")
plt.ylabel("price_trend")
plt.show()

In [None]:
# Histogram (with KDE overlay) of every column except Date
plot_columns = data.drop("Date", axis=1).columns.tolist()

# Size the subplot grid from the number of columns instead of hard-coding
# 2 x 3, so an extra column cannot overflow the grid and raise.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(plot_columns) // n_grid_cols))  # ceiling division

fig = plt.figure(figsize=(8, 8))

# suptitle of the graph
fig.suptitle('Distribution of data across column')
for position, column_name in enumerate(plot_columns, start=1):
    ax = fig.add_subplot(n_grid_rows, n_grid_cols, position)
    sns.histplot(data=data, x=column_name, kde=True, ax=ax)
fig.tight_layout(pad=0.4, w_pad=0.5, h_pad=2.0)
plt.show()

In [None]:
# Per-column skewness of everything except Date, ignoring NaNs
without_date = data.drop(columns="Date")
column_skewness = without_date.skew(axis=0, skipna=True)
print(column_skewness)

# This code is modified by shreya

In [None]:
# Apply a square-root transformation to the skewed USO column.
# np.sqrt is called directly: the earlier x / sqrt(x) form is the same
# quantity mathematically but evaluates 0 / 0 = NaN for zero values.
# NOTE: this overwrites USO, so re-running the cell transforms it again.
data["USO"] = np.sqrt(data["USO"])

In [None]:
# Box plot of every column except Date, to eyeball outliers
plot_columns = data.drop("Date", axis=1).columns.tolist()

# Size the subplot grid from the number of columns instead of hard-coding
# 2 x 3, so an extra column cannot overflow the grid and raise.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(plot_columns) // n_grid_cols))  # ceiling division

fig = plt.figure(figsize=(8, 8))
for position, column_name in enumerate(plot_columns, start=1):
    ax = fig.add_subplot(n_grid_rows, n_grid_cols, position)
    sns.boxplot(data=data, x=column_name, color='violet', ax=ax)

fig.tight_layout(pad=0.4, w_pad=0.5, h_pad=2.0)
plt.show()

In [None]:
def outlier_removal(column):
    """Cap a numeric Series at its 5th and 95th percentiles.

    Values above the 95th percentile are replaced by that percentile and
    values below the 5th percentile are replaced by that percentile; all
    other values (including NaN) are returned unchanged.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to cap.

    Returns
    -------
    pandas.Series
        A new Series with the capped values. The input is not modified.
    """
    lower_limit = column.quantile(.05)
    upper_limit = column.quantile(.95)
    # clip() builds a new Series. The earlier `.loc[...] = limit` assignments
    # mutated the object handed in by DataFrame.apply, which pandas warns can
    # produce unexpected behaviour.
    return column.clip(lower=lower_limit, upper=upper_limit)

In [None]:
# Run outlier_removal over each of the selected numeric columns
# (Date is not part of the selection)

columns_to_cap = ['SPX', 'GLD', 'USO', 'EUR/USD']
data[columns_to_cap] = data[columns_to_cap].apply(outlier_removal)

In [None]:
# MODELING THE DATA
# Features: every column except the date and the target.
# NOTE(review): price_trend is computed from EUR/USD itself (value divided by
# its rolling mean), so keeping it in X leaks target information into the
# features - consider dropping it here.
X = data.drop(['Date', 'EUR/USD'], axis=1)

# Target variable
y = data['EUR/USD']

# 80/20 train/test split. random_state is fixed so the split - and every
# score computed from it - is the same on each Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [None]:
#SCALING THE DATA
# Standardise the features. The scaler's statistics are learned from the
# training split only and then reused, unchanged, on the test split.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [None]:
#LASSO REGRESSION

from sklearn.impute import SimpleImputer

# Replace NaNs with the per-column mean. The means are learned on the
# training split and reused on the test split. The scaled arrays are
# overwritten because the later model cells read these same names.
imputer = SimpleImputer(strategy='mean')
x_train_scaled = imputer.fit_transform(x_train_scaled)
x_test_scaled = imputer.transform(x_test_scaled)

# Pipeline: degree-2 polynomial feature expansion followed by Lasso regression
poly = PolynomialFeatures(degree=2)
lasso = Lasso()
pipeline = make_pipeline(poly, lasso)

# Regularisation strengths to search over ("lasso" is the step name that
# make_pipeline assigns to the Lasso estimator)
param_grid = {'lasso__alpha': [1e-4, 1e-3, 1e-2,
                               1e-1, 1, 5, 10,
                               20, 30, 40]}

# 3-fold cross-validated grid search, scored by R-squared
lasso_grid_search = GridSearchCV(pipeline, param_grid, scoring='r2', cv=3)
lasso_grid_search.fit(x_train_scaled, y_train)

# R-squared on the training data (in-sample fit)
y_pred = lasso_grid_search.predict(x_train_scaled)
r2 = r2_score(y_train, y_pred)
print("R-squared (train): ", r2)

# R-squared on the held-out test data. Previously only the training score
# was computed even though a comment described predicting on the test data.
lasso_test_r2 = r2_score(y_test, lasso_grid_search.predict(x_test_scaled))
print("R-squared (test): ", lasso_test_r2)

# Best hyper-parameters and their mean cross-validated score
print('Best parameter values: ',lasso_grid_search.best_params_)
print('Best score: ',lasso_grid_search.best_score_)

In [None]:
# RANDOM FOREST REGRESSOR FOR REGRESSION

# Initiate the parameter grid to search over
param_grid = {'n_estimators': [50, 80, 100],
              'max_depth': [3, 5, 7]}

# Create the random forest regressor. random_state is fixed so the fitted
# forest, the selected parameters and the reported scores are reproducible.
rf = RandomForestRegressor(random_state=42)

# Define the grid search with the random forest, the parameter grid,
# R-squared scoring and 2-fold cross-validation
rf_grid_search = GridSearchCV(rf, param_grid, scoring='r2', cv=2)

# Fit the GridSearchCV object to the training data
rf_grid_search.fit(x_train_scaled, y_train)

# Print the best parameter values and score
print('Best parameter values: ', rf_grid_search.best_params_)
print('Best score: ', rf_grid_search.best_score_)

In [None]:
# Score the tuned random forest on the held-out test split
rf_test_predictions = rf_grid_search.predict(x_test_scaled)
r2 = r2_score(y_test, rf_test_predictions)

# Report the test R-squared
print("R-squared:", r2)

In [None]:
# Feature names must come from the matrix the model was trained on.
# data.drop("Date") still contains the EUR/USD target, which is not a
# feature, so using it shifted the labels relative to feature_importances_.
features = x_train.columns

# Importance of each feature according to the best random forest
importances = rf_grid_search.best_estimator_.feature_importances_

# Feature positions ordered from least to most important
indices = np.argsort(importances)

# title of the graph
plt.title('Feature Importance')

# horizontal bar chart of the sorted importances
plt.barh(range(len(indices)),
         importances[indices],
         color='red',
         align='center')

# label each bar with the matching feature name
plt.yticks(range(len(indices)),
           [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

In [None]:
#XGBoost Model for Regression
from xgboost import XGBRegressor

# Train an XGBoost regressor with its default settings
model_xgb = XGBRegressor()
model_xgb.fit(x_train_scaled, y_train)

# In-sample predictions and their R-squared
y_train_pred = model_xgb.predict(x_train_scaled)
xgb_train_r2 = r2_score(y_train, y_train_pred)

# Report the training score
print("XGBoost R^2 Score on Training Data =", xgb_train_r2)

In [None]:
# R-squared of the XGBoost model on the held-out test data.
# The value is an R^2 score, not an accuracy, so the label now says so and
# matches the wording used for the training score.
xgb_test_r2 = r2_score(y_test, model_xgb.predict(x_test_scaled))
print("XGBoost R^2 Score on Test Data =", xgb_test_r2)

In [None]:
# Model Explainability
import eli5
from eli5.sklearn import PermutationImportance  # Optional, for more detailed insights

# The model was fitted on a plain array, so pass the original column names
# along to get readable labels.
feature_names = x_train.columns.tolist()

# Show the feature weights (importances) of the fitted XGBoost model
eli5.show_weights(model_xgb, feature_names=feature_names)

In [None]:
#Model Deployment using Pickle

# dump model using pickle library
import pickle

# Write the trained XGBoost model to model.pkl.
# The dump call must be indented under `with` so it runs while the file is
# open; left unindented, the cell fails with an IndentationError.
# NOTE(review): only the model is saved. It was fitted on scaled and imputed
# inputs, so the fitted scaler and imputer are needed again at prediction
# time - consider persisting them alongside the model.
with open('model.pkl', 'wb') as file:
    pickle.dump(model_xgb, file)