## Data preprocessing

In [1]:
import pandas as pd
import numpy as np


class PreProcessing:
    """Load one company's price history and lay it out year by year.

    The CSV ``./datasetslib/data/<name>.csv`` is read and only the columns at
    positions 0 and 5 are kept; the rest of the class refers to them by the
    names ``Date`` and ``Adj Close``, so the file is expected to use those
    headers.  Two derived columns are added to ``data``:

    * ``Normalized_Close`` - per-year z-score of ``Adj Close``, shifted so
      that every year starts at 0.
    * ``Quarter`` - calendar quarter label (``Q1``..``Q4``) of each row.

    ``prices_by_year`` holds the normalized prices with one column per year
    and one row per day-of-year position (``num_days`` rows).
    """
    data = None            # DataFrame: Date, Adj Close, Normalized_Close, Quarter
    quarter_names = None   # ['Q1', 'Q2', 'Q3', 'Q4']
    num_years = None       # sorted list of the calendar years present in data
    num_days = None        # number of day rows kept per year in prices_by_year

    def __init__(self, name):
        """Read ``<name>.csv`` and build every derived table.

        :param name: base name (no extension) of the CSV file to load.
        """
        name = str(name)
        self.get_data(name)
        # Both helpers return single-column frames indexed like self.data, so
        # the values stay on their own rows even when dropna() left index gaps.
        self.data['Normalized_Close'] = self.normalized_data_col(self.data).iloc[:, 0]
        self.data['Quarter'] = self.get_quarter_col(self.data).iloc[:, 0]
        self.num_days = 252
        self.prices_by_year = self.get_prices_by_year()
        self.quarter_length = int(self.num_days / 4)

    def get_prices_by_year(self):
        """Return normalized prices as a (day x year) table.

        The first year is front-padded with zeros up to ``num_days`` rows,
        every year is cut to ``num_days`` rows, and missing values in all but
        the last year are replaced by that year's mean.  The index is named
        ``Day``.  A ``Quarter`` column is assembled here but is not part of
        the returned frame, because ``fill_nans_with_mean`` keeps only the
        year columns.
        """
        columns = [self.modify_first_year_data()]
        columns.extend(
            pd.DataFrame(self.get_year_data(year=year, normalized=True))
            for year in self.num_years[1:]
        )
        # One concat instead of one per year; rows are aligned by position.
        table = pd.concat(columns, axis=1).iloc[:self.num_days]

        days_per_quarter = self.num_days // 4
        quarter_col = pd.DataFrame(
            [name for name in self.quarter_names for _ in range(days_per_quarter)]
        )

        table = pd.concat([table, quarter_col], axis=1)
        table.columns = self.num_years + ['Quarter']
        table.index.name = 'Day'

        return self.fill_nans_with_mean(table)

    def get_year_data(self, year, normalized=True):
        """Return one year's prices as a NumPy array, in row order.

        :param year: calendar year; must be one of ``num_years``.
        :param normalized: if true return ``Normalized_Close``, otherwise
            ``Adj Close``.
        :raises ValueError: if ``year`` is not present in the data.
        """
        year = int(year)
        if year not in self.num_years:
            raise ValueError('\n' +
                             'Input year: {} not in available years: {}'.format(year, self.num_years))

        column = 'Normalized_Close' if normalized else 'Adj Close'
        in_year = self.data['Date'].dt.year == year
        return np.asarray(self.data.loc[in_year, column])

    def get_adj_close_prices(self, start_year, end_year):
        """Return the ``Date`` and ``Adj Close`` columns for an inclusive year range.

        :raises ValueError: if the range reaches outside the available years.
        """
        start_year, end_year = int(start_year), int(end_year)
        if start_year < self.num_years[0] or end_year > self.num_years[-1]:
            raise ValueError('\n' +
                             'Incorrect data! \n' +
                             'Max range available: {}-{}\n'.format(self.num_years[0], self.num_years[-1]) +
                             'Was: {}-{}'.format(start_year, end_year))

        years = self.data['Date'].dt.year
        in_range = (years >= start_year) & (years <= end_year)
        return self.data.loc[in_range, ['Date', 'Adj Close']]

    def get_data(self, file_name):
        """Load ``./datasetslib/data/<file_name>.csv`` into ``self.data``.

        Keeps the columns at positions 0 and 5, drops rows with missing
        values, parses ``Date`` and orders the rows chronologically (the
        per-year "first price" and the day-of-year layout both rely on that
        order).  Also initialises ``quarter_names``.
        """
        file_name = str(file_name)
        raw = pd.read_csv("./datasetslib/data/" + file_name + '.csv')
        data = raw.iloc[:, [0, 5]].dropna()
        data['Date'] = pd.to_datetime(data['Date'])
        # Stable sort: rows that are already chronological keep their order.
        self.data = data.sort_values('Date', kind='mergesort')
        self.quarter_names = ['Q' + str(i) for i in range(1, 5)]

    def normalized_data_col(self, df):
        """Return per-year normalized prices and record ``num_years``.

        For every calendar year the ``Adj Close`` values are z-scored
        (population standard deviation) and then shifted so the year's first
        value is 0.  A year whose prices are all equal has zero deviation and
        therefore yields NaN values.

        :param df: frame with ``Date`` (datetime) and ``Adj Close`` columns.
        :return: single-column DataFrame (column ``0``) indexed like ``df``.
        """
        years = df['Date'].dt.year.to_numpy()
        prices = df['Adj Close'].to_numpy(dtype=float)
        self.num_years = sorted({int(year) for year in years})

        normalized = np.empty(len(prices), dtype=float)
        for year in self.num_years:
            rows = years == year
            year_prices = prices[rows]
            # Mean/std are computed once per year rather than once per element.
            z_scores = (year_prices - year_prices.mean()) / year_prices.std()
            normalized[rows] = z_scores - z_scores[0]

        return pd.DataFrame({0: normalized}, index=df.index)

    def get_quarter_col(self, df):
        """Return the quarter label of every row of ``df``.

        :param df: frame with a datetime ``Date`` column.
        :return: single-column DataFrame (column ``0``) indexed like ``df``.
        """
        quarter_positions = (df['Date'].dt.month - 1) // 3
        labels = [self.quarter_names[position] for position in quarter_positions]
        return pd.DataFrame({0: labels}, index=df.index)

    def modify_first_year_data(self):
        """Return the first year's normalized prices front-padded with zeros.

        The padding brings the column up to ``num_days`` rows so that a
        partial first year lines up with the end of the year; a first year
        that is already long enough is returned unpadded.
        """
        prices = self.get_year_data(self.num_years[0])
        missing_days = max(self.num_days - len(prices), 0)
        return pd.DataFrame(np.concatenate([np.zeros(missing_days), prices]))

    def fill_nans_with_mean(self, df):
        """Fill gaps in every year except the last with that year's mean.

        The last year is copied unchanged so its missing days stay NaN.  Only
        the year columns are returned; any other column of ``df`` (such as
        ``Quarter``) is left out.
        """
        earlier_years = self.num_years[:-1]
        last_year = self.num_years[-1]
        filled = df.loc[:, earlier_years]
        filled = filled.fillna(filled.mean())
        filled[last_year] = df[last_year]

        return filled


## Gaussian Process model

In [2]:
import numpy as np
import pandas as pd
import gpflow


class GP:
    """Gaussian-process regression from (year, day) to normalized price.

    Uses the GPflow 1.x style API (``kern=`` keyword and
    ``gpflow.train.ScipyOptimizer``).
    """
    preprocessed_data = None  # PreProcessing instance providing prices_by_year
    kernel = None             # kernel of the most recently fitted model
    gp_model = None           # most recently fitted gpflow GPR model

    def __init__(self, company):
        """Load and preprocess the price history of ``company``."""
        self.preprocessed_data = PreProcessing(str(company))

    @staticmethod
    def _year_day_grid(year, days):
        """Return [year, day] rows as an (n, 2) float array; (0, 2) if no days."""
        return np.array([[year, day] for day in days], dtype=np.float64).reshape(-1, 2)

    def make_gp_predictions(self, start_year, end_year, pred_year, pred_quarters=None):
        """Fit a GP on ``start_year..end_year`` and predict ``pred_year``.

        :param start_year: first training year.
        :param end_year: last training year.
        :param pred_year: year value used for the prediction inputs.
        :param pred_quarters: ``None`` or an empty sequence to predict the
            whole year; otherwise a sequence of 1-based quarter numbers.
            Prediction covers the first through the last listed quarter, and
            the ``end_year`` training data is cut off where the first listed
            quarter begins.
        :return: ``(x_mesh, y_mean, y_var)`` - 2000 evenly spaced day
            positions and the model's predictive mean and variance there.
        :raises ValueError: if the last listed quarter precedes the first.
        """
        start_year, end_year, pred_year = int(start_year), int(end_year), int(pred_year)
        # An empty selection means "no restriction"; previously the default []
        # fell through to pred_quarters[0] and raised IndexError.
        if pred_quarters is not None and len(pred_quarters) == 0:
            pred_quarters = None

        data = self.preprocessed_data
        quarter_length = data.quarter_length

        years_quarters = list(range(start_year, end_year + 1)) + ['Quarter']
        middle_years = years_quarters[1:-2]  # years strictly between start and end
        price_df = data.prices_by_year[data.prices_by_year.columns.intersection(years_quarters)]
        all_days = list(price_df.index.values)

        # First training year.  For the very first year of the dataset the
        # zero entries (front padding) are dropped and a single 0.0 anchor is
        # put on the day just before the first remaining value.
        first_year_prices = price_df[start_year]
        if start_year == data.num_years[0]:
            first_year_prices = first_year_prices[first_year_prices != 0]
            if len(first_year_prices) > 0:
                anchor = pd.Series([0.0], index=[first_year_prices.index[0] - 1])
                first_year_prices = pd.concat([anchor, first_year_prices])

        x_parts = [self._year_day_grid(start_year, first_year_prices.index.values)]
        y_parts = [np.asarray(first_year_prices, dtype=np.float64)]

        for year in middle_years:
            x_parts.append(self._year_day_grid(year, all_days))
            y_parts.append(np.asarray(price_df[year], dtype=np.float64))

        # Last training year: only the days that actually have a price, and
        # only those before the first quarter that is to be predicted.
        final_year_prices = price_df[end_year]
        final_year_prices = final_year_prices[final_year_prices.notnull()]
        if pred_quarters is not None:
            cutoff = quarter_length * (pred_quarters[0] - 1)
            final_year_prices = final_year_prices.iloc[:cutoff]
        # _year_day_grid keeps the (0, 2) shape when nothing is left (Q1).
        x_parts.append(self._year_day_grid(end_year, final_year_prices.index.values))
        y_parts.append(np.asarray(final_year_prices, dtype=np.float64))

        X = np.concatenate(x_parts, axis=0)
        Target = np.concatenate(y_parts)[:, np.newaxis]

        if pred_quarters is not None:
            if pred_quarters[-1] < pred_quarters[0]:
                raise ValueError('pred_quarters must be in ascending order, was: {}'.format(
                    list(pred_quarters)))
            first_day = quarter_length * (pred_quarters[0] - 1)
            # Use the last listed quarter as the end, however many are given.
            last_day = quarter_length * pred_quarters[-1] - 1
        else:
            first_day, last_day = 0, data.num_days - 1

        x_mesh = np.linspace(first_day, last_day, 2000)
        x_pred = np.column_stack([np.full(len(x_mesh), pred_year, dtype=np.float64), x_mesh])

        kernel = gpflow.kernels.RBF(2, lengthscales=1, variance=63) + gpflow.kernels.White(2, variance=1e-10)
        self.kernel = kernel
        self.gp_model = gpflow.models.GPR(X, Target, kern=kernel)
        gpflow.train.ScipyOptimizer().minimize(self.gp_model)
        y_mean, y_var = self.gp_model.predict_y(x_pred)

        return x_mesh, y_mean, y_var

## Plot

In [3]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np


class PlotData:
    """Draw and save the price, normalized-price and GP-forecast charts of a company.

    Every ``plot_*`` method writes a PNG (named after the company and the
    years involved) via ``Figure.savefig`` and then closes its figure.
    """
    company = None            # company name, also the base name of its CSV file
    preprocessed_data = None  # PreProcessing instance for the company
    gp_model = None           # GP wrapper used by plot_gp_predictions

    def __init__(self, company):
        """Load the data of ``company`` and prepare its GP wrapper."""
        self.company = str(company)
        self.gp_model = GP(self.company)
        # Share the dataset the GP wrapper has already loaded rather than
        # reading and preprocessing the same CSV a second time.
        self.preprocessed_data = self.gp_model.preprocessed_data

    def _draw_quarter_guides(self, ax, y_max, x_min, x_max):
        """Draw dashed quarter boundaries, the quarter labels and the y=0 line."""
        quarter_length = self.preprocessed_data.quarter_length
        for i in range(0, 5):
            plt.vlines(x=quarter_length * i, ymin=-y_max, ymax=y_max, color='black', linestyles='--', alpha=.5,
                       zorder=-1)
            if i < 4:
                ax.text(quarter_length * i + quarter_length / 2 - 5, y_max - 0.5,
                        self.preprocessed_data.quarter_names[i], fontsize=12)
        plt.hlines(y=0, xmin=x_min, xmax=x_max, color='black', linestyles='--', alpha=.6, zorder=-1)

    @staticmethod
    def _save_and_close(fig, image_name):
        """Write ``fig`` to ``image_name`` and release the figure."""
        fig.savefig(image_name, dpi=fig.dpi)
        # close() rather than clf(): a cleared figure stays registered with
        # pyplot, so figures would pile up (and show as empty notebook output).
        plt.close(fig)

    def plot_normalized_prices(self, first_year, last_year):
        """Overlay the normalized prices of each year in ``first_year..last_year``.

        Saves ``<company>_<first>_<last>_prices_normalized.png``.

        :raises ValueError: if the range is outside the available years.
        """
        first_year, last_year = int(first_year), int(last_year)
        self.check_data(start_year=first_year, end_year=last_year)

        fig = plt.figure(num=self.company + ' normalized prices')
        ax = plt.gca()
        fig.set_size_inches(12, 6)
        lower_y, upper_y = 0, 0
        for year in range(first_year, last_year + 1):
            target = self.preprocessed_data.prices_by_year[year]
            # Series.min()/max() skip NaN (the last year can be incomplete);
            # the builtin min()/max() give order-dependent results with NaN.
            lower_y = min(lower_y, target.min())
            upper_y = max(upper_y, target.max())
            # Plot against the day index so curves line up with the quarter guides.
            plt.plot(target.index, target, alpha=.8, label=year)
        plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)

        # Symmetric limits with one unit of padding beyond both extremes.
        y_max = max(abs(lower_y - 1), abs(upper_y + 1))
        x_min, x_max = -10, self.preprocessed_data.num_days + 10
        ax.set_ylim(bottom=-y_max, top=y_max)
        ax.set_xlim(left=x_min, right=x_max)

        self._draw_quarter_guides(ax, y_max, x_min, x_max)

        plt.grid(True, alpha=.25)
        plt.title(self.company)
        plt.xlabel('Days')
        plt.ylabel('NormalizedPrices')

        plt.tight_layout()

        image_name = '{}_{}_{}_prices_normalized.png'.format(self.company, first_year, last_year)
        self._save_and_close(fig, image_name)

    def plot_gp_predictions(self, train_start, train_end, pred_year, pred_quarters = None):
        """Plot the GP forecast for ``pred_year`` against that year's actual prices.

        The model is trained on ``train_start..train_end``; ``pred_quarters``
        is passed through to ``GP.make_gp_predictions``.  The shaded band is
        mean +/- 1.96 standard deviations.  Saves
        ``<company>_<pred_year>_predicted.png``.

        :raises ValueError: if ``train_start..pred_year`` is outside the
            available years.
        """
        train_start = int(train_start)
        train_end = int(train_end)
        pred_year = int(pred_year)
        self.check_data(start_year=train_start, end_year=pred_year)

        price_data = self.preprocessed_data.prices_by_year[pred_year]
        price_data = price_data[price_data.notnull()]

        # Fit before opening the figure so a failed fit leaves no figure behind.
        x_mesh, y_mean, y_var = self.gp_model.make_gp_predictions(start_year=train_start, end_year=train_end,
                                                                  pred_year=pred_year,
                                                                  pred_quarters=pred_quarters)
        y_lower = np.squeeze(y_mean - 1.96*np.sqrt(y_var))
        y_upper = np.squeeze(y_mean + 1.96*np.sqrt(y_var))
        y_max = max(abs(np.min(y_lower) - 1), abs(np.max(y_upper) + 1))

        fig = plt.figure(num=self.company + ' predicted prices')
        ax = plt.gca()
        fig.set_size_inches(12, 6)
        ax.set_ylim(bottom=-y_max, top=y_max)

        x_min, x_max = -10, self.preprocessed_data.num_days + 10
        ax.set_xlim(left=x_min, right=x_max)

        # The surviving day index keeps x and y the same length even if a
        # value is missing in the middle of the year.
        plt.plot(price_data.index, price_data, color='blue', alpha=.95, label=u'Actuals ' + str(pred_year), zorder=10)
        plt.plot(x_mesh, y_mean, color='red', linestyle='--', label=u'Predicted')
        plt.fill_between(x_mesh, y_lower, y_upper,
                         alpha=.25, label='95% confidence interval', color='red')

        # Keep only the first handle of every label in the legend.
        handles, labels = plt.gca().get_legend_handles_labels()
        new_labels, new_handles = [], []
        for handle, label in zip(handles, labels):
            if label not in new_labels:
                new_labels.append(label)
                new_handles.append(handle)
        plt.legend(new_handles, new_labels, bbox_to_anchor=(0.01, 0.02), loc='lower left', borderaxespad=0.)

        self._draw_quarter_guides(ax, y_max, x_min, x_max)

        plt.grid(True, alpha=.25)
        plt.title(self.company)
        plt.xlabel('Days\n')
        plt.ylabel('NormalizedPrices')

        plt.tight_layout()

        image_name = '{}_{}_predicted.png'.format(self.company, pred_year)
        self._save_and_close(fig, image_name)

    def plot_complete_history(self, intermediate = False):
        """Plot the adjusted close over every available year.

        :param intermediate: forwarded to ``plot_prices_data``.
        """
        years = self.preprocessed_data.num_years
        self.plot_prices_data(start_year=years[0], end_year=years[-1], intermediate=intermediate)

    def plot_prices_data(self, start_year, end_year, intermediate = True):
        """Plot the adjusted close between ``start_year`` and ``end_year``.

        Saves ``<company>_<start>_<end>_adj_closing_prices.png``.

        :param intermediate: if true, put a tick and a guide line on the
            first date of every year plus the final date; otherwise tick only
            the first and final dates (guide lines are still drawn at the
            first date, the first date of ``end_year`` and the final date).
        :raises ValueError: if the range is outside the available years.
        """
        # Lets matplotlib handle pandas datetime values without the
        # converter-registration warning.
        from pandas.plotting import register_matplotlib_converters
        register_matplotlib_converters()

        start_year, end_year = int(start_year), int(end_year)
        self.check_data(start_year=start_year, end_year=end_year)

        data = self.preprocessed_data.get_adj_close_prices(start_year=start_year, end_year=end_year)

        fig = plt.figure(num=self.company + ' prices')
        fig.set_size_inches(12, 6)
        plt.plot(data.iloc[:, 0], data.iloc[:, 1], color='green', alpha=.95,
                 label=u'PriceData ' + str(start_year) + '-' + str(end_year), zorder=10)
        ax = plt.gca()

        years = data['Date'].dt.year
        x_ticks = [data[years == year].iloc[0, 0] for year in range(start_year, end_year + 1)]
        x_ticks.append(data[years == end_year].iloc[-1, 0])  # Appending the last date

        ax.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y'))
        if not intermediate:
            x_ticks = [x_ticks[0], x_ticks[-2], x_ticks[-1]]
            ax.set_xticks([x_ticks[0], x_ticks[-1]])
        else:
            ax.set_xticks(x_ticks)
        plt.xticks(rotation=20)
        # Pin the current limits so the guide lines below do not rescale the axes.
        y_min, y_max = ax.get_ylim()
        x_min, x_max = ax.get_xlim()
        ax.set_ylim(bottom=y_min, top=y_max)
        ax.set_xlim(left=x_min, right=x_max)

        for tick in x_ticks:
            plt.vlines(x=tick, ymin=y_min, ymax=y_max, color='black', linestyles='--', alpha=.5,
                       zorder=-1)

        plt.grid(True, alpha=0.25)
        plt.legend()
        plt.title(self.company)
        plt.ylabel('Price')

        plt.tight_layout()

        fname = '{}_{}_{}_adj_closing_prices.png'.format(self.company, start_year, end_year)
        self._save_and_close(fig, fname)

    def check_data(self, start_year, end_year):
        """Raise ``ValueError`` unless ``start_year..end_year`` lies within the loaded years."""
        available = self.preprocessed_data.num_years
        if int(start_year) < available[0] or int(end_year) > available[-1]:
            raise ValueError('\n' +
                             'Incorrect data! \n' +
                             'Max range available: {}-{}\n'.format(available[0], available[-1]) +
                             'Was: {}-{}'.format(int(start_year), int(end_year)))


## main

In [4]:
import os


# First year of every plotting and GP-training window requested in do_work().
start_year = 2008

def main():
    """Build a PlotData for every CSV in ``datasetslib/Data`` and plot each one.

    The company name is the file name without its extension.  Entries that
    are not CSV files (for example ``.DS_Store``) are skipped, since the
    loader can only open ``<company>.csv``.
    """
    # NOTE(review): PreProcessing.get_data reads from './datasetslib/data/'
    # (lower-case 'data'); the two spellings only name the same directory on
    # a case-insensitive file system - confirm which one is intended.
    plot_objects = {}
    for company_file in os.listdir('datasetslib/Data'):
        company, extension = os.path.splitext(company_file)
        if not company or extension.lower() != '.csv':
            continue
        plot_objects[company] = PlotData(company=company)

    for company, plot_data in plot_objects.items():
        print ("Company Name ", company)
        do_work(company, plot_data)
    print("Done!")


def do_work(company,plot_data):
    """Produce the full set of charts for one company, then report completion.

    :param company: company name; only used in the final status message.
    :param plot_data: object providing the ``plot_*`` methods called below.
    """
    first = start_year

    # (method name, keyword arguments), executed strictly in this order.
    steps = (
        ('plot_complete_history', {}),
        ('plot_prices_data', {'start_year': first, 'end_year': 2016}),
        ('plot_normalized_prices', {'first_year': first, 'last_year': 2016}),
        ('plot_gp_predictions', {'train_start': first, 'train_end': 2016, 'pred_year': 2017}),
        ('plot_prices_data', {'start_year': first, 'end_year': 2018}),
        ('plot_gp_predictions', {'train_start': first, 'train_end': 2018, 'pred_year': 2018,
                                 'pred_quarters': [4]}),
    )
    for method_name, keyword_args in steps:
        plot_step = getattr(plot_data, method_name)
        plot_step(**keyword_args)

    status = company + ' summary done!'
    print(status)

# Run the pipeline only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


Company Name  GOOG



To register the converters:
	>>> from pandas.plotting import register_matplotlib_converters
	>>> register_matplotlib_converters()


Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Use tf.cast instead.


Instructions for updating:
Use tf.cast instead.


INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
  Objective function value: 3992.331132
  Number of iterations: 49
  Number of functions evaluations: 81


INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
  Objective function value: 3992.331132
  Number of iterations: 49
  Number of functions evaluations: 81


INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
  Objective function value: 4623.711955
  Number of iterations: 42
  Number of functions evaluations: 66


INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
  Objective function value: 4623.711955
  Number of iterations: 42
  Number of functions evaluations: 66


GOOG summary done!
Company Name  NFLX




INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
  Objective function value: 2967.681179
  Number of iterations: 60
  Number of functions evaluations: 101


INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
  Objective function value: 2967.681179
  Number of iterations: 60
  Number of functions evaluations: 101


INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
  Objective function value: 3900.064191
  Number of iterations: 49
  Number of functions evaluations: 69


INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
  Objective function value: 3900.064191
  Number of iterations: 49
  Number of functions evaluations: 69


NFLX summary done!
Company Name  GE




INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
  Objective function value: 3314.993826
  Number of iterations: 50
  Number of functions evaluations: 73


INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
  Objective function value: 3314.993826
  Number of iterations: 50
  Number of functions evaluations: 73


INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
  Objective function value: 4540.918794
  Number of iterations: 61
  Number of functions evaluations: 92


INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
  Objective function value: 4540.918794
  Number of iterations: 61
  Number of functions evaluations: 92


GE summary done!
Done!


<Figure size 864x432 with 0 Axes>

<Figure size 864x432 with 0 Axes>

<Figure size 864x432 with 0 Axes>

<Figure size 864x432 with 0 Axes>

<Figure size 864x432 with 0 Axes>

<Figure size 864x432 with 0 Axes>

<Figure size 864x432 with 0 Axes>

<Figure size 864x432 with 0 Axes>

<Figure size 864x432 with 0 Axes>