In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import plotly.express as px

In [4]:
price = pd.read_csv("data/price.csv")

# Index the frame by the parsed "Date" column and remove the now-redundant
# column. Chaining `.drop` avoids mutating the frame in place.
price = price.set_index(pd.to_datetime(price["Date"], format='%Y/%m/%d')).drop(columns=["Date"])

# Drop all columns which only contain NaN values.
price = price.dropna(axis="columns", how="all")

# Standardise every column: subtract its mean and divide by its standard
# deviation, so all stocks share a comparable scale.
price = (price - price.mean()) / price.std()
price.head()

FileNotFoundError: [Errno 2] No such file or directory: 'data/price.csv'

In [None]:
# Per-column summary statistics of `price` (one column per stock).
describe = price.describe()
describe

# Line plots

In [None]:
# Downsample to one row per month ("1M" bins), averaging the values that fall in each bin.
# NOTE(review): pandas >= 2.2 deprecates the "M" alias in favour of "ME" — confirm the
# pinned pandas version before changing it.
price_monthly_resampling = price.resample("1M").mean()
price_monthly_resampling.head()

## Line plot using a linear scale

In [None]:
# Overview figure: every stock's monthly mean drawn as a faint blue line, so
# dense regions show up as darker bands.
plt.figure(figsize=(15, 8))
sns.lineplot(
    data=price_monthly_resampling,
    palette=['b'] * len(price_monthly_resampling.columns),  # same colour for every column
    dashes=False,
    legend=False,
    alpha=0.2,
);

# Distribution of statistics of features

Notes:
* Try to make the last bin of the histogram an overflow bin that collects all values above its lower edge.
* I don't think the barplots are necessary.

In [None]:
# for stat in ('min', 'mean', 'max', 'std', 'count'):
#     plt.figure(figsize=(15,8))
#     stat_stocks = describe.loc[stat,:].sort_values()
#     sns.barplot(x = price.columns, y = stat_stocks)
#
#     plt.figure(figsize=(15,8))
#     sns.histplot(stat_stocks)

# Grouping stocks

In [None]:
# Remove the variation which exists in all the stocks: subtract, for every row
# (date), the mean over all stocks on that row.
# NOTE: this cell rebinds `price`; re-running it without re-running the loading
# cell applies the adjustment a second time.
means = price.mean(axis=1).values
# Vectorised row-wise subtraction: the 1-D array is matched positionally against
# the rows (axis="index"), replacing the slow per-row Python loop.
price = price.sub(means, axis="index")

# Re-standardise every column (zero mean, unit standard deviation).
price = (price - price.mean()) / price.std()
price

In [None]:
# Recompute the monthly means from the current `price` and redraw the overview
# with every stock as a faint blue line.
price_monthly_resampling = price.resample("1M").mean()
plt.figure(figsize=(15, 8))
sns.lineplot(
    data=price_monthly_resampling,
    palette=['b'] * len(price_monthly_resampling.columns),  # same colour for every column
    dashes=False,
    legend=False,
    alpha=0.2,
);

In [None]:
# Pairwise correlation between the columns (stocks) of `price`.
scores_df = price.corr() #.iloc[1:,:price.shape[1]-1]

# Work on a plain array: undefined correlations (NaN) are replaced by 0 ...
scores = np.nan_to_num(scores_df.to_numpy())
# ... and the upper triangle *including the diagonal* is blanked out with NaN, so
# every pair of stocks is stored exactly once, at [row, col] with row > col.
scores[np.triu_indices(scores.shape[1])] = np.nan
# Re-attach the stock labels; both `scores` (array) and `scores_df` (labelled)
# are used by later cells.
scores_df = pd.DataFrame(scores, index = scores_df.index, columns=scores_df.columns)
# scores_df = scores_df.iloc[1:,:scores_df.shape[1]-1]
scores_df

## Methods for grouping

In [None]:
def get_max_index_except_indices(arr, exceptions):
    """Return the position of the largest value in `arr`, ignoring some positions.

    Parameters
    ----------
    arr : sequence or array
        Values to search.
    exceptions : index or sequence of indices
        Positions of `arr` that must not be considered.

    Returns
    -------
    Position, in the original `arr`, of the maximum among the remaining values
    (the first one if the maximum is tied).
    """
    # Positions and values are filtered with the same `exceptions`, so element k
    # of one corresponds to element k of the other.
    surviving_positions = np.delete(np.arange(len(arr)), exceptions)
    surviving_values = np.delete(arr, exceptions)
    return surviving_positions[np.argmax(surviving_values)]

In [None]:
def flatten_table(groups):
    """Return the set of all members found in any of the given groups.

    `groups` is an iterable of iterables (e.g. a list of lists of stock names);
    duplicates across groups collapse into a single entry.
    """
    return {member for group in groups for member in group}

In [None]:
def stocks_left_to_compare_with(row_index_of_stock, already_grouped_from_columns, already_grouped_from_index):
    """Tell whether a stock still has ungrouped stocks it can be compared with.

    Looks at the first `row_index_of_stock + 1` entries of the notebook-level
    `scores_columns` and returns True as soon as one of them is not a member of
    any group in the notebook-level `groups`; returns False otherwise.

    `already_grouped_from_columns` and `already_grouped_from_index` are accepted
    but not used by the body.
    """
    grouped = flatten_table(groups)
    candidates = scores_columns[:row_index_of_stock+1]
    return any(candidate not in grouped for candidate in candidates)

In [None]:
def indices_of_grouped_stocks(row_index_of_stock):
    """List the positions of already-grouped stocks within a row's candidates.

    The candidates are the first `row_index_of_stock + 1` entries of the
    notebook-level `scores_columns`; a candidate counts as grouped when it is a
    member of any group in the notebook-level `groups`. Positions are returned
    in ascending order.
    """
    grouped = flatten_table(groups)
    row_stocks = scores_columns[:row_index_of_stock+1]
    return [position for position, stock in enumerate(row_stocks) if stock in grouped]

In [None]:
def max_matrix_excluding(rows_exclude, cols_exclude):
    """Locate the largest non-NaN entry of `scores`, skipping some rows/columns.

    Reads the notebook-level `scores`, `scores_index`, `scores_columns`, `price`
    and `stocks_to_drop`.

    Before searching, the exclusions are propagated across axes: every excluded
    row i also excludes column i+1, and every excluded column j (including the
    ones just added) also excludes row j-1, except when the shifted position
    equals -1 or the last position.
    NOTE(review): presumably rows are offset by one relative to columns, so that
    row i and column i+1 denote the same stock — confirm against how
    `scores_index` / `scores_columns` are built.

    Returns
    -------
    (row, col) positions of the maximum, expressed in the original, unfiltered
    row and column numbering.
    """
    skip_rows = list(rows_exclude)
    skip_cols = list(cols_exclude)
    last_position = len(price.columns.drop(stocks_to_drop)) - 1
    not_propagated = (-1, last_position)

    # Mirror the excluded rows onto the columns, then the (extended) excluded
    # columns back onto the rows.
    skip_cols.extend(r + 1 for r in skip_rows if r + 1 not in not_propagated)
    skip_rows.extend(c - 1 for c in skip_cols if c - 1 not in not_propagated)

    # Original positions that survive the exclusion, one array per axis.
    kept_rows = np.delete(list(range(len(scores_index))), skip_rows)
    kept_cols = np.delete(list(range(len(scores_columns))), skip_cols)

    # Same exclusion applied to the matrix itself: columns first, then rows.
    reduced = np.delete(np.delete(scores, skip_cols, 1), skip_rows, 0)

    best_row, best_col = np.unravel_index(np.nanargmax(reduced), reduced.shape)

    # Translate positions in the reduced matrix back to the original numbering.
    return kept_rows[best_row], kept_cols[best_col]

In [None]:
def remaining_stocks(scores, rows_exclude, cols_exclude):
    """Return `scores` with the excluded rows and columns removed.

    Reads the notebook-level `price` and `stocks_to_drop` to find the last
    position.

    The exclusions are first propagated across axes: every excluded row i also
    excludes column i+1, and every excluded column j (including the ones just
    added) also excludes row j-1, except when the shifted position equals -1 or
    the last position.
    NOTE(review): presumably rows are offset by one relative to columns, so that
    row i and column i+1 denote the same stock — confirm with the caller.

    Returns the reduced matrix only; no maximum is computed here.
    """
    skip_rows = list(rows_exclude)
    skip_cols = list(cols_exclude)
    last_position = len(price.columns.drop(stocks_to_drop)) - 1
    not_propagated = (-1, last_position)

    # Mirror the excluded rows onto the columns, then the (extended) excluded
    # columns back onto the rows.
    skip_cols.extend(r + 1 for r in skip_rows if r + 1 not in not_propagated)
    skip_rows.extend(c - 1 for c in skip_cols if c - 1 not in not_propagated)

    # Columns are removed first, then rows.
    without_cols = np.delete(scores, skip_cols, 1)
    return np.delete(without_cols, skip_rows, 0)

In [None]:
def number_of_groups_with_length_greater_than_2(groups):
    """Count the groups that contain more than two members."""
    return sum(1 for members in groups if len(members) > 2)

In [None]:
def _pairwise_score(scores, stock_a, stock_b):
    """Look up the score of a pair of stocks in a triangular score table.

    `scores.loc[stock_a, stock_b]` is tried first; when that lookup raises
    KeyError or yields NaN (the pair is stored the other way round), the
    mirrored cell `scores.loc[stock_b, stock_a]` is returned instead.
    """
    try:
        value = scores.loc[stock_a, stock_b]
    except KeyError:
        return scores.loc[stock_b, stock_a]
    if np.isnan(value):
        return scores.loc[stock_b, stock_a]
    return value


def score_grouping(groups):
    """Score a grouping of stocks using the notebook-level `scores_df`.

    For every group with more than one member, the median of the pairwise
    scores of its members is computed. The median of those per-group medians is
    then divided by a penalty term, the sum of 1/len(group) over *all* groups.
    The penalty grows with the number of groups and with how small they are, so
    groupings made of many and/or small groups receive a lower score.

    Parameters
    ----------
    groups : list of lists
        Each inner list holds the labels of the stocks in one group.

    Returns
    -------
    float
        The penalised score; NaN when `groups` is empty or when no group has
        more than one member.
    """
    # An empty grouping has nothing to score. Without this guard the penalty
    # term is 0 and the final division raises ZeroDivisionError.
    if len(groups) == 0:
        return np.nan

    # Read-only access: no copy of the score table is needed.
    scores = scores_df

    medians = []
    for group in groups:
        if len(group) == 1:
            continue  # a single stock has no pair to score
        group_scores = [
            _pairwise_score(scores, stock_i, stock_j)
            for i, stock_i in enumerate(group)
            for stock_j in group[i+1:]
        ]
        medians.append(np.median(group_scores))

    # The smaller (and the more numerous) the groups, the larger this number becomes.
    lower = sum(1/np.array([len(group) for group in groups]))
    return np.median(medians) * (1/lower) # * np.median([len(group) for group in groups])

In [None]:
# Thresholds to try: 0.50, 0.55, ..., 0.95 (the trailing 1.0 is dropped).
# NOTE: despite the name, each threshold is compared directly against the values
# stored in `scores_df`.
rsquares = np.linspace(0.5, 1, 11)[:-1]
rsquare_groups = []           # one grouping (list of groups) per threshold
lengths_rsquare_groups = []   # the group sizes of each grouping, largest first

for i, rsquare in enumerate(rsquares):
    groups = []
    # Stocks already placed in a group for this threshold. Maintained
    # incrementally instead of re-flattening `groups` for every column.
    flattened_groups = set()
    columns_tqdm = tqdm(scores_df.columns, desc=f"r2={rsquare}, number: {i+1}/{len(rsquares)}", total = len(scores_df.columns))
    for stock_c in columns_tqdm:
        if stock_c in flattened_groups:
            continue
        # Greedy grouping: seed a group with this stock and add every ungrouped
        # stock whose score against the seed exceeds the threshold. Comparisons
        # with NaN cells are False, so blanked-out cells never match.
        group = [stock_c]
        for stock_r in scores_df.index:
            if stock_r in flattened_groups:
                continue
            if scores_df.loc[stock_r, stock_c] > rsquare:
                group.append(stock_r)
        # Seeds that attracted no other stock are not kept (and stay available
        # as members for later seeds).
        if len(group) > 1:
            groups.append(group)
            flattened_groups.update(group)
    groups = sorted(groups, key=lambda lst: -len(lst))
    rsquare_groups.append(groups)
    lengths_rsquare_groups.append([len(g) for g in groups])
# After the loop, `groups` holds the grouping of the last (highest) threshold.

In [None]:
# Group sizes per threshold: after the transpose there is one column per
# threshold and one row per group rank (groups are sorted largest first).
# Thresholds with fewer groups are padded with NaN.
results = pd.DataFrame(lengths_rsquare_groups, index=rsquares).transpose()
results

In [None]:
# Static view: one line per threshold, group size against group rank.
sns.lineplot(data = results);

In [None]:
# Interactive version of the same group-size curves.
fig = px.line(results)
fig.show()

In [None]:
# groups = all_groups_created[results['score'].idxmax()]

# Figure 1: the mean monthly series of every group, overlaid on one set of axes.
# `groups` is whatever the grouping cell left behind.
plt.figure(figsize=(15, 8))
for group in groups:
    sns.lineplot(
        data=price_monthly_resampling[group].mean(axis="columns"),
        palette=['black'] * len(group),
        dashes=False,
        legend=False,
        linewidth=3,
        alpha=0.5,
    )
plt.title("Average of each group")
plt.ylim(-4, 8)  # same y-range on every figure so they can be compared
plt.show()

# One further figure per group, showing each member stock as a faint blue line.
for group in groups:
    plt.figure(figsize=(15, 8))
    sns.lineplot(
        data=price_monthly_resampling[group],
        palette=['b'] * len(group),
        dashes=False,
        legend=False,
        alpha=0.05,
    )
    plt.title(f"Showing {len(group)} stocks")
    plt.ylim(-4, 8)
    plt.show()

# Compare with stocks