Model Training 'N' Finding the predictions

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline

In [2]:
df = pd.read_csv('gld_price_data.csv')
df.head()

Unnamed: 0,Date,SPX,GLD,USO,SLV,EUR/USD
0,1/2/2008,1447.160034,84.860001,78.470001,15.18,1.471692
1,1/3/2008,1447.160034,85.57,78.370003,15.285,1.474491
2,1/4/2008,1411.630005,85.129997,77.309998,15.167,1.475492
3,1/7/2008,1416.180054,84.769997,75.5,15.053,1.468299
4,1/8/2008,1390.189941,86.779999,76.059998,15.59,1.557099


We need to drop the 'Date' column

In [3]:
df = df.drop(['Date'] , axis = 1)
df.head()

Unnamed: 0,SPX,GLD,USO,SLV,EUR/USD
0,1447.160034,84.860001,78.470001,15.18,1.471692
1,1447.160034,85.57,78.370003,15.285,1.474491
2,1411.630005,85.129997,77.309998,15.167,1.475492
3,1416.180054,84.769997,75.5,15.053,1.468299
4,1390.189941,86.779999,76.059998,15.59,1.557099


Dividing into independent and dependent features

In [4]:
X = df.iloc[:,:-1]            ##independent features
y = df.iloc[:,-1]             ##dependent features

In [5]:
X.head()

Unnamed: 0,SPX,GLD,USO,SLV
0,1447.160034,84.860001,78.470001,15.18
1,1447.160034,85.57,78.370003,15.285
2,1411.630005,85.129997,77.309998,15.167
3,1416.180054,84.769997,75.5,15.053
4,1390.189941,86.779999,76.059998,15.59


In [6]:
y.head()

0    1.471692
1    1.474491
2    1.475492
3    1.468299
4    1.557099
Name: EUR/USD, dtype: float64

Train-Test Split

In [7]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X, y, test_size=0.25 , random_state=42)

In [8]:
X_train.shape , X_test.shape , y_train.shape , y_test.shape

((1717, 4), (573, 4), (1717,), (573,))

Standardisation

In [19]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [20]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [21]:
X_train_scaled

array([[-1.48275634, -1.58912267, -0.08195715, -1.30239164],
       [ 0.39586716, -0.59033989, -0.97983242, -0.85830091],
       [-0.04092147, -0.14591173,  0.23720737, -0.23769461],
       ...,
       [-0.30635955,  1.27582894,  0.08404949,  1.06655924],
       [ 0.27976195, -0.14805886,  0.07171457, -0.16204523],
       [-0.77056887,  1.99636086,  0.36055604,  1.61011523]],
      shape=(1717, 4))

Model Training

In [22]:
##Clearly this is a regression problem
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [28]:
##performance metrics
from sklearn.metrics import r2_score

In [29]:
linear_regression = LinearRegression()
svr = SVR(kernel = 'rbf')
Decision_tree = DecisionTreeRegressor()
Random_forest = RandomForestRegressor()

In [30]:
models = [
    linear_regression,
    svr,
    Decision_tree,
    Random_forest

]


def compare_models():
    for model in models:
        model.fit(X_train_scaled , y_train)
        predictions = model.predict(X_test_scaled)
        score  = r2_score(predictions , y_test)
        print(f"For {model} score is {score}")

compare_models()


For LinearRegression() score is 0.6594962053657077
For SVR() score is 0.7507542646189729
For DecisionTreeRegressor() score is 0.9749421903857418
For RandomForestRegressor() score is 0.982148354110982


Clearly the  R-squared value of Random Forest Regressor is highest

Model to predict Gold price


In [36]:
input = (1330.609985,90.300003,71.910004,16.298)         ##input taken from the raw CSV file

##Converting the input to numpy array
input_numpy = np.asarray(input)

##Reshaping the array
input_reshaped = input_numpy.reshape(1,-1)

##Find the prediction
prediction = Random_forest.predict(input_reshaped)

print(prediction)

[1.54823856]
