# Import modules

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

import pandas as pd
import numpy as np

from tqdm.notebook import tqdm
import os
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Import data

In [None]:
# Load the price table, parse its "Date" column and use it as the index.
data = pd.read_csv("data/price.csv")
data = data.assign(Date=pd.to_datetime(data["Date"])).set_index("Date")

# Lower-case the column names, as some features have the same names as stocks.
data.columns = data.columns.str.lower()

# Standardise every column with its own mean and standard deviation (z-score),
# so that all price series are on a comparable scale.
data = data.sub(data.mean()).div(data.std())
data

In [None]:
# Load the categorical table. Its first (unnamed) CSV column becomes the row
# index, and the leftover "Unnamed: 0" axis name is cleared.
cats = (
    pd.read_csv("data/categorical.csv")
    .set_index("Unnamed: 0")
    .rename_axis(None, axis=0)
)
cats

In [None]:
# Load the feature table, parse its "Date" column and use it as the index.
features = pd.read_csv("data/features.csv")
features = features.assign(Date=pd.to_datetime(features["Date"])).set_index("Date")

# Subtract 1 year from the index of features, since we want to see the
# correlation between the price and the feature 1 year apart.
# NOTE(review): the shift is a fixed 365 days, not a calendar year, so it is
# one day off across a leap day -- confirm this is acceptable.
# NOTE(review): subtracting moves each feature observation to an EARLIER date,
# i.e. it is paired with the price from ~1 year before the feature's own date
# -- confirm this is the intended direction.
# The shifted index is assigned as a plain list, which leaves it unnamed.
features.index = list(features.index - dt.timedelta(days=365))
features

In [None]:
# Make sure the output directory for tables and plots exists.
# exist_ok=True makes an already-existing directory a no-op, while a
# non-directory occupying that path still raises FileExistsError instead of
# being silently ignored (the later writes into it would fail anyway).
os.makedirs("Data exploration", exist_ok=True)

# R2 scores

In [None]:
MAX_DEGREE = 5

# Column names for the result table: the feature name followed by one r2
# column per polynomial degree (r2_d1 ... r2_d<MAX_DEGREE>).
column_names = ["features"] + [f"r2_d{degree}" for degree in range(1, MAX_DEGREE + 1)]

r2_table = []
features_tqdm = tqdm(features.columns, total=len(features.columns), desc="R2 for features")
for feature in features_tqdm:
    # Pair the feature with every price column on their shared dates, then
    # reshape to long format: one row per (feature value, price column, price).
    combined = pd.concat([features[[feature]], data], axis="columns", join="inner")
    combined = combined.melt(feature).dropna()

    if combined.empty:
        # No overlapping observations: np.polyfit raises on empty input, which
        # would abort the whole loop. Record NaN scores for this feature instead.
        r2_table.append([feature] + [np.nan] * MAX_DEGREE)
        continue

    x = combined[feature]
    y = combined["value"]

    # Get r2 using polynomials of increasing degree fitted to price ~ feature.
    row = [feature]
    for degree in range(1, MAX_DEGREE + 1):
        coefs = np.polyfit(x, y, deg=degree)
        predict = np.poly1d(coefs)
        preds = predict(x)
        row.append(r2_score(y, preds))
    r2_table.append(row)


# Rank features by their degree-2 fit and persist the table for later use.
r2_df = pd.DataFrame(r2_table, columns=column_names).sort_values("r2_d2", ascending = False)
r2_df.to_csv("Data exploration/r2 scores.csv")
r2_df

# Line plots

In [None]:
# Draw one time-series line plot per feature column, against the table's index.
for feature in features.columns:
    _, ax = plt.subplots(figsize=(10, 6))
    sns.lineplot(data=features, x=features.index, y=feature, ax=ax)
    plt.show()

# Facet grid

In [None]:
MAX_DEGREE = 5

# For every feature, save a facet grid (one row per sector) of price against
# the feature, with a fitted regression line. Plots that already exist on disk
# are not regenerated.
features_tqdm = tqdm(features.columns, total=len(features.columns), desc="Plots & r2 for features")
for feature in features_tqdm:
    path = f"Data exploration/Stock price against {feature} (facet grid).jpg"
    if os.path.exists(path):
        # Already rendered on an earlier run.
        continue

    # Pair the feature with every price column on their shared dates, then
    # reshape to long format: "variable" holds the price column name and
    # "value" the price.
    combined = pd.concat([features[[feature]], data], axis="columns", join="inner")
    combined = combined.melt(feature).dropna()

    # Resolve the sector once per ticker instead of once per row. The sector
    # is read from the first row of the ticker's column in `cats`; `.iloc[0]`
    # makes that positional access explicit (plain `[0]` on a label-indexed
    # Series is deprecated). A ticker missing from `cats` still raises KeyError.
    sector_by_ticker = {
        ticker: cats[ticker.upper()].iloc[0]
        for ticker in combined["variable"].unique()
    }
    combined["sector"] = combined["variable"].map(sector_by_ticker)

    g = sns.FacetGrid(data=combined, row="sector", sharey=False, height=4, aspect=2)
    g.map(sns.regplot, feature, "value", scatter=True, fit_reg=True, x_ci=None, ci=None, scatter_kws={"alpha":0.1, "s":20}, line_kws={"color": "red", "linewidth":3})

    print("Saving plot")
    # Save the grid's own figure rather than whichever figure is "current".
    g.savefig(path, bbox_inches='tight', dpi=300)
    print("Plot saved")

    # Release the figure so memory does not grow across features.
    plt.close("all")

# Correlation matrix between features

In [None]:
def plot_correlation_matrix_of_df(df, columns_order, annot = True):
    """Plot the correlation matrix of a pandas dataframe as a seaborn heatmap.

    Only the cells below the main diagonal are drawn, since the matrix is
    symmetric. Correlations are rounded to 2 decimals and coloured on a fixed
    [-1, 1] scale centred on 0.

    Args:
        df: Dataframe whose column-to-column correlations are plotted.
        columns_order: Column labels of ``df`` in the order they should appear
            on the heatmap axes; columns not listed are left out.
        annot: Whether to write the correlation value inside each cell.

    Returns:
        None. The heatmap is drawn on a newly created 10x8 figure.
    """
    # Select and order the columns as requested by the caller.
    df = df[columns_order]

    # Calculate correlation matrix
    corrs = df.corr().round(2)

    # Since this is a symmetric table, set up a mask so that we only plot values below the main diagonal.
    # The builtin `bool` is used because the `np.bool` alias was removed from NumPy.
    mask = np.triu(np.ones_like(corrs, dtype=bool))
    _, ax = plt.subplots(figsize=(10, 8)) # Initialise the plots and axes

    # Plot the correlations as a seaborn heatmap, with a colourbar, on the axes created above.
    sns.heatmap(corrs, mask=mask, center=0, annot=annot, square=True, linewidths=.5, cmap="seismic", vmin = -1, vmax = 1, ax=ax)

In [None]:
# Order the features by their degree-2 r2 score (best first) and plot the
# correlations between features in that order.
r2_by_feature = r2_df.set_index("features")["r2_d2"]
column_order = r2_by_feature.sort_values(ascending=False).index
plot_correlation_matrix_of_df(features, column_order, annot=True)