In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score
import os

In [None]:
# Relative directory that holds the gold price data files.
GOLD_PATH = os.path.join("datasets", "gold")


def load_gold_data(gold_path=GOLD_PATH):
    """Read ``Gold_Yearly.csv`` from ``gold_path`` and return it as a DataFrame.

    Parameters
    ----------
    gold_path : str
        Directory containing ``Gold_Yearly.csv``. Defaults to ``GOLD_PATH``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    yearly_csv = os.path.join(gold_path, "Gold_Yearly.csv")
    gold_frame = pd.read_csv(yearly_csv)
    return gold_frame

In [None]:
# Load the yearly gold data from its default location.
gold = load_gold_data()

# Preview the first five rows.
gold.head(n=5)

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
gold.info()

In [None]:
# Summary statistics, transposed so that each column of `gold` becomes a row.
gold.describe().transpose()

In [None]:
# Three-colour palette reused by the plots in this notebook.
colours = [
    "#f7b2b0",
    "#8f7198",
    "#003f5c",
]
# Bar chart counting how often each "Year Close" value occurs.
sns.countplot(x="Year Close", data=gold, palette=colours)

In [None]:
# Diverging colour map used to colour the correlation heatmap.
cmap = sns.diverging_palette(250, 10, s=80, l=55, n=9, as_cmap=True)

# Pairwise correlation between the columns of `gold`.
corrmat = gold.corr()

# Large square canvas so the annotated cells stay readable.
plt.figure(figsize=(15, 15))
sns.heatmap(data=corrmat, annot=True, center=0, cmap=cmap)

In [None]:
# Price columns to plot against "Year", one figure per column.
cols = ['Year Open', 'Year High', 'Year Low', 'Year Close']
for price_col in cols:
    years = gold["Year"]
    prices = gold[price_col]
    # Raw points first, then the boxen plot drawn on the same axes.
    sns.stripplot(x=years, y=prices, color="black", alpha=0.25)
    sns.boxenplot(x=years, y=prices, palette=colours)
    plt.show()


In [None]:
# Five-step palette used for the wide boxen plots.
shades = [
    "#f7b2b0",
    "#c98ea6",
    "#8f7198",
    "#50587f",
    "#003f5c",
]

# One boxen plot per column of the raw frame, on a wide canvas.
plt.figure(figsize=(20, 10))
sns.boxenplot(palette=shades, data=gold)
# Rotate the x tick labels by 90 degrees.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Target is "Year High"; every other column of `gold` is used as a feature.
y = gold["Year High"]
X = gold.drop(columns=["Year High"])

# Standardise the features and keep the original column names.
# NOTE(review): the scaler is fitted on all rows here, before the train/test split below.
col_names = X.columns.tolist()
s_scaler = StandardScaler()
X_df = pd.DataFrame(s_scaler.fit_transform(X), columns=col_names)
X_df.describe().transpose()

In [None]:
# Same boxen plot as for the raw frame, now drawn from the scaled features.
plt.figure(figsize=(20, 10))
sns.boxenplot(palette=shades, data=X_df)
# Rotate the x tick labels by 90 degrees.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Hold out 30% of the rows for the final evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.3, random_state=42)

# Seed the forest as well: an unseeded forest gives a different
# cross-validation score on every run of this cell.
# The step keeps its existing name 'rf_classifier' even though it wraps a regressor.
pipeline_rf = Pipeline([('rf_classifier', RandomForestRegressor(random_state=42))])

pipeline_rf.fit(X_train, y_train)

# The default scorer for a regressor is R^2; it is spelled out here so the
# printed number is not mistaken for an accuracy.
cv_score = cross_val_score(pipeline_rf, X_train, y_train, cv=10, scoring="r2")
print("RandomForest: %f " % (cv_score.mean()))


In [None]:
# Hyper-parameter grid for RandomForestRegressor.
parameters = {
    'n_estimators': [100, 150, 200, 500, 700, 900],
    # 'auto' is no longer accepted by scikit-learn; for a regressor it meant
    # "use all features", which is written as 1.0.
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [4, 6, 8, 12, 14, 16],
    # 'gini' and 'entropy' are classification criteria and are rejected by
    # RandomForestRegressor; these are the regression equivalents.
    'criterion': ['squared_error', 'absolute_error'],
}

# Fit on the training set to find the parameters with the best cross-validated
# R^2 (the default scorer for a regressor).
# n_jobs only controls parallelism, so it is passed to the search itself
# rather than searched over as a hyper-parameter.
CV_rfc = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
)
CV_rfc.fit(X_train, y_train)

# Best parameter combination found by the grid search.
CV_rfc.best_params_

In [None]:
# Refit a forest with the best parameters from the grid search; seeded so the
# reported score is reproducible.
RF_model = RandomForestRegressor(random_state=42, **CV_rfc.best_params_)
RF_model.fit(X_train, y_train)

# Evaluate on the held-out test set.
predictions = RF_model.predict(X_test)

# The target is continuous, so accuracy_score raises ValueError here;
# R^2 is the regression counterpart.
test_r2 = metrics.r2_score(y_test, predictions)
test_r2

In [None]:
# Accuracy, recall, precision and F1 are classification metrics and raise
# ValueError on continuous predictions, so regression metrics are reported.
# They are accessed through the `metrics` module so that no imported function
# is overwritten by a result of the same name (the original rebound `f1_score`,
# which broke the cell on a second run).
r2 = metrics.r2_score(y_test, predictions)
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)

print("********* Random Forest Results *********")
print("R^2         : ", r2)
print("MAE         : ", mae)
print("MSE         : ", mse)
print("RMSE        : ", rmse)

In [None]:
# classification_report only accepts discrete class labels and fails on a
# continuous target; summarise actual values, predictions and residuals instead.
residual_report = pd.DataFrame({
    "actual": np.asarray(y_test),
    "predicted": np.asarray(predictions),
})
residual_report["residual"] = residual_report["actual"] - residual_report["predicted"]
residual_report.describe().T