In [1]:
import os

import numpy as np
import pandas as pd

os.listdir('./data_codes/')

['.DS_Store',
 'lse_codes_companies.csv',
 'lse_ftse100_stock_codes_scraped_html.txt',
 'lse_all_share_yahoo_codes.txt',
 'lse_ftse350_stock_codes_scraped_html.txt',
 'lse_codes_companies.numbers']

In [2]:
data_dir = './data_stocks/ftse100/'
stocks_csv_files = [os.path.join(data_dir, x) for x in os.listdir(data_dir) if x.endswith('.L.csv')]
companies_df = pd.read_csv('./data_codes/lse_codes_companies.csv', usecols=[0, 1])
BARC = os.path.join(data_dir, 'BARC.L.csv')
barc_df = pd.read_csv(BARC)

In [3]:
def file_to_symbol(file):
    """Takes a filepath and returns the symbol"""
    return file.rsplit('/', 1)[1].rsplit('.', 1)[0]

In [4]:
barc_df.head()

Unnamed: 0,Date,High,Low,Open,Close,Volume,Adj Close
0,2014-01-02,274.651001,268.350006,273.0,271.049988,30563664.0,235.943085
1,2014-01-03,273.910004,270.399994,271.25,272.850006,23678963.0,237.51001
2,2014-01-06,278.666992,271.200012,271.549988,277.5,47008835.0,241.557693
3,2014-01-07,282.778015,275.0,276.950012,280.950012,44374624.0,244.560913
4,2014-01-08,285.899994,281.25,282.350006,283.700012,45068534.0,246.954666


In [5]:
def get_data(code):
    return pd.read_csv(data_dir + '/' + code + '.csv')

In [6]:
def highest_close_price(df):
    return df.loc[df['Close'].idxmax(), 'Close']
highest_close_price(barc_df)

296.5

In [7]:
def highest_close_date(df):
    return df.loc[df['Close'].idxmax(), 'Date']
highest_close_date(barc_df)

'2014-01-15'

In [8]:
def latest_close_price(df):
    return df.loc[df.index[-1], 'Close']
latest_close_price(barc_df)

165.94000244140625

In [9]:
def latest_close_date(df):
    return df.loc[df.index[-1], 'Date']
latest_close_date(barc_df)

'2019-03-15'

In [10]:
def highest_current_percentage_diff(df):
    """
    Returns the percentage (2 d.p.) difference between
    the highest and current closing price
    """
    highest_close = df.loc[df['Close'].idxmax(), 'Close']
    current_close = df.loc[df.index[-1], 'Close']
    return round((current_close - highest_close) / current_close, 2) * 100

highest_current_percentage_diff(barc_df)

-79.0

In [11]:
# Percentage difference between highest close and current close for all stocks
def percentage_diff_df(stocks_csv_files):
    result = []
    for file in stocks_csv_files:
        df = pd.read_csv(file)
        percentage = highest_current_percentage_diff(df)
        code = file.rsplit('/')[-1].replace('.csv', '')
        result.append((percentage, code))
    agg_df = (
        pd.DataFrame(result, columns=['percentage', 'code'])
        .sort_values(by='percentage', ascending=True)
    )
    return agg_df

agg_df = percentage_diff_df(stocks_csv_files)

In [12]:
agg_df.head(10)

Unnamed: 0,percentage,code
37,-186.0,CNA.L
5,-150.0,FRES.L
88,-125.0,WPP.L
40,-123.0,TUI.L
81,-122.0,SMT.L
89,-122.0,BT-A.L
54,-121.0,MKS.L
63,-112.0,SLA.L
86,-112.0,STAN.L
92,-107.0,ITV.L


In [13]:
# Merge with company names
if 'company' not in agg_df.columns:
    agg_df = pd.merge(agg_df, companies_df, on='code', how='left', validate='one_to_one')

In [14]:
# # Export agg_df to csv file
# agg_df.to_csv('./data_aggregated/highest_close_percentage_difference.csv')

In [15]:
agg_df.head(10)

Unnamed: 0,percentage,code,Company
0,-186.0,CNA.L,Centrica plc
1,-150.0,FRES.L,Fresnillo PLC
2,-125.0,WPP.L,WPP plc
3,-123.0,TUI.L,TUI AG
4,-122.0,SMT.L,Scottish Mortgage Investment Trust PLC
5,-122.0,BT-A.L,BT Group plc
6,-121.0,MKS.L,Marks and Spencer Group plc
7,-112.0,SLA.L,Standard Life Aberdeen plc
8,-112.0,STAN.L,Standard Chartered PLC
9,-107.0,ITV.L,ITV plc


In [27]:
def add_moving_average(df, days):
    column = 'ma_{}'.format(days)
    df[column] = df['Close'].rolling(window=days).mean()
    return df

In [18]:
def add_ma_position(df, ma_1=50, ma_2=200):
    """
    Creates a new column 'ma_position' with possible
    values -1, +1.
    -1 when shorter SMA is below longer SMA
    +1 when shorter SMA is above longer SMA
    """
    column = 'ma_position'
    ma_1_df = df['Close'].rolling(window=ma_1).mean()
    ma_2_df = df['Close'].rolling(window=ma_2).mean()
    df[column] = np.where(ma_1_df > ma_2_df, 1, -1)
    return df

In [19]:
def add_ma_crossover(df, ma_1=50, ma_2=200, close_column='Close'):
    """
    Takes stock data and adds a column 'ma_crossover'
    which is 1 on the day where two moving averages cross over. 
    """
    column = 'ma_crossover'
    
    # Get moving averages
    ma_1_df = df['Close'].rolling(window=ma_1).mean()
    ma_2_df = df['Close'].rolling(window=ma_2).mean()

    # MA_1 below MA_2 >>> -1, MA_1 above MA_2 >>> +1
    position = pd.Series(np.where(ma_1_df > ma_2_df, 1, -1))
    
    # 1 if there is a change in position, otherwise 0
    df[column] = np.where(position != position.shift(1).fillna(method='bfill'), 1, 0)
    
    return df

def ma_crossover_dates(df):
    df = add_ma_crossover(df)
    return df[df['ma_crossover'] == 1]

In [32]:
# Name of stock and dates of moving average crossover
def get_crossovers_df(stocks_csv_files):
    """
    Loop over all csv files containing stock data
    and return a dataframe on days where crossover
    took place.
    """
    crossover_df = pd.DataFrame()
    for file in stocks_csv_files:
        df = pd.read_csv(file)
        df = add_ma_crossover(df)
        df = add_ma_position(df)
        df = df[(df['ma_crossover'] == 1) & (df['ma_position'] == 1)]
        df['code'] = file_to_symbol(file)
        crossover_df = pd.concat([crossover_df, df])
    return crossover_df
crossover_df = get_crossovers_df(stocks_csv_files)

In [39]:
# crossover_df.sort_values(by='Date').to_csv('./data_aggregated/moving_average_crossovers.csv')

In [44]:
crossover_df.sort_values(by='Date').tail(20)

Unnamed: 0,Date,High,Low,Open,Close,Volume,Adj Close,ma_crossover,ma_position,code
1302,2019-02-25,2044.0,2011.0,2044.0,2026.0,928558.0,2026.0,1,1,SVT.L
1303,2019-02-26,3920.0,3807.0,3807.0,3920.0,801898.0,3920.0,1,1,BKG.L
1304,2019-02-27,2622.0,2542.0,2607.0,2543.0,1890755.0,2543.0,1,1,CCH.L
1305,2019-02-28,356.700012,351.899994,353.0,356.700012,4356540.0,356.700012,1,1,DLG.L
1306,2019-03-01,606.0,598.200012,604.0,600.400024,3583583.0,600.400024,1,1,BDEV.L
1306,2019-03-01,4695.0,4391.0,4417.0,4695.0,1452144.0,4695.0,1,1,LSE.L
1307,2019-03-05,288.399994,284.0,285.0,286.200012,19496994.0,286.200012,1,1,LGEN.L
1311,2019-03-08,876.900024,862.099976,864.900024,873.400024,7959837.0,873.400024,1,1,NG.L
1312,2019-03-11,4882.0,4795.0,4810.0,4882.0,435635.0,4882.0,1,1,CRDA.L
1312,2019-03-11,1062.5,1038.5,1054.0,1062.5,1278300.0,1062.5,1,1,OCDO.L


In [None]:
plt.figure()
rbs_df = pd.read_csv('./data_stocks/ftse100/RBS.L.csv')['Close']


In [None]:
# def moving_average_difference(df, MA_1='MA_50', MA_2='MA_200'):
#     column = 'moving_average_difference'
#     df[column] = df[MA_1] - df[MA_2]
#     return df

# if 'moving_average_difference' not in barc_df.columns:
#     barc_df = moving_average_difference(barc_df)