In [1]:
# %load my_imports.py
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import csv

import datetime
import time

import json

In [2]:
# import data dynamically
import math

def load_n_compute_log_return(file_name):
    df = pd.read_csv('data/' + file_name)
    df = df['Close'].apply(lambda x: math.log(x))
    log_return = df.pct_change()[1:]*100
    return log_return

def load_n_select_monday_n_compute_log_return(file_name):
    df = pd.read_csv('data/' + file_name)
    df['Date'] = pd.to_datetime(df['Date'])
    log_price = df.ix[df['Date'].dt.weekday == 0, 
                      'Close'].apply(lambda x: math.log(x)).reset_index(drop=True)
    log_return = log_price.pct_change()[1:]*100
    return log_return

def load_n_compute_mean_n_stdv(file_name):
    df = pd.read_csv('data/' + file_name)
    df = df['Close'].apply(lambda x: math.log(x))
    log_return = df.pct_change()[1:]*100
    return np.mean(log_return), np.std(log_return)

In [21]:
# Problem 1
#Compute Correlation 
# i, j: stock i n j
def compute_cor(i, j):
    r_i = load_n_compute_log_return(list_of_files[i])
    r_j = load_n_compute_log_return(list_of_files[j])
    r_i_aver = np.mean(r_i)
    r_j_aver = np.mean(r_j)
    r_i_sqr_aver = np.mean(np.square(r_i))
    r_i_aver_sqr = np.square(np.mean(r_i))
    r_j_aver_sqr = np.square(np.mean(r_j))
    r_j_sqr_aver = np.mean(np.square(r_j))
    r_ij_aver = np.mean(r_i*r_j)
    numerator = r_ij_aver - r_i_aver*r_j_aver
    denominator = np.sqrt((r_i_sqr_aver - r_i_aver_sqr)*(r_j_sqr_aver - r_j_aver_sqr))
    p_ij = numerator/denominator
    d_ij = np.sqrt(2*(1-p_ij))
    return p_ij, d_ij

def compute_cor_weekly(i, j):
    r_i = load_n_select_monday_n_compute_log_return(list_of_files[i])
    r_j = load_n_select_monday_n_compute_log_return(list_of_files[j])
    r_i_aver = np.mean(r_i)
    r_j_aver = np.mean(r_j)
    r_i_sqr_aver = np.mean(np.square(r_i))
    r_i_aver_sqr = np.square(np.mean(r_i))
    r_j_aver_sqr = np.square(np.mean(r_j))
    r_j_sqr_aver = np.mean(np.square(r_j))
    r_ij_aver = np.mean(r_i*r_j)
    numerator = r_ij_aver - r_i_aver*r_j_aver
    denominator = np.sqrt((r_i_sqr_aver - r_i_aver_sqr)*(r_j_sqr_aver - r_j_aver_sqr))
    p_ij = numerator/denominator
    d_ij = np.sqrt(2*(1-p_ij))
    return p_ij, d_ij

In [24]:
# create a dict that tracks the names
import os
list_of_files = os.listdir('data')
list_of_corps = [file_name[:-4] for file_name in list_of_files]
indexes = [x for x in range(len(list_of_files))]
file_dict = {}
for idx in indexes:
    file_dict.update({list_of_corps[idx]: idx})
file_dict_reversed = dict((y, x) for (x, y) in file_dict.iteritems())

In [None]:
# generate 
import itertools
edge_list = []
corr_list = []
for pair in itertools.combinations(indexes, 2):
    # pair[0] and pair[1]
#     weight = compute_cor(pair[0], pair[1])
    p_ij, d_ij = compute_cor(pair[0], pair[1])
    corr_list.append([pair[0], pair[1], p_ij])
    edge_list.append([pair[0], pair[1], d_ij])
corrlist = pd.DataFrame(corr_list)
edgelist = pd.DataFrame(edge_list)
corrlist.to_csv('Corrlog/corrlist_day.csv', header = 0, index = 0)
edgelist.to_csv('Network/edgelist_day.csv', header = 0, index = 0)
print(len(edge_list))

In [4]:
df = pd.read_csv('Network/edgelist_day.csv', header = None)
df.columns = ['from', 'to', 'weight']
df.head()

Unnamed: 0,from,to,weight
0,0,1,1.181978
1,0,2,1.26376
2,0,3,1.352927
3,0,4,1.120446
4,0,5,1.244592


In [None]:
plt.hist(df.weight, bins= 50)
plt.xlabel("$d_{ij}$")
plt.ylabel("Frequency")
plt.grid("on")
# plt.show()
plt.savefig('figures/02hist', dpi = 500)
plt.gcf().clear()

In [None]:
# create a dict for sectors
df = pd.read_csv('Name_sector.csv', header = 0)
stock_sector_dict = dict(zip(df.Symbol, df.Sector))
sector_dict = {}
sector_list = list(set(df.Sector))
index_list = [x+1 for x in range(len(sector_list))]
for idx in range(len(sector_list)):
    sector_dict.update({sector_list[idx]: index_list[idx]})
sector_dict

In [None]:
# create a sector - color index file
color_list = []
for key, value in stock_sector_dict.iteritems():
    color_list.append([file_dict[key], sector_dict[value]])
pd.DataFrame(color_list).sort_values(by = 0).to_csv('Network/colorlist.csv', header=0, index=0)

In [None]:
# generate Monday only
import itertools
edge_list = []
corr_list = []
for pair in itertools.combinations(indexes, 2):
    # pair[0] and pair[1]
#     weight = compute_cor(pair[0], pair[1])
    p_ij, d_ij = compute_cor_weekly(pair[0], pair[1])
    corr_list.append([pair[0], pair[1], p_ij])
    edge_list.append([pair[0], pair[1], d_ij])
corrlist = pd.DataFrame(corr_list)
edgelist = pd.DataFrame(edge_list)
corrlist.to_csv('Corrlog/corrlist_week.csv', header = 0, index = 0)
edgelist.to_csv('Network/edgelist_week.csv', header = 0, index = 0)
print(len(edge_list))

In [9]:
# plot hist of p_ij
# and fit a normal curve
from scipy.stats import norm
df = pd.read_csv('Corrlog/corrlist_day.csv', header = None)
df.columns = ['from', 'to', 'corr']

plt.hist(df['corr'], bins= 50, normed = True)
(mu, sigma) = norm.fit(df['corr'])
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, sigma)
plt.plot(x, p, 'k', linewidth=2)

# plt.hist(p, bins = 50)

plt.xlabel("$p_{ij}$")
plt.ylabel("Probability Density")
plt.grid("on")

# plt.show()
plt.savefig('figures/07hist', dpi = 500)
plt.gcf().clear()

In [49]:
# adjust corr values
df_new = df
df_new['corr'].apply(lambda x: -1 if x >= 0.3 else x)
df_new['corr'].apply(lambda x: np.sqrt(2*(1 - x)))
df.to_csv('Network/edgelist_corr.csv', header = 0, index = 0)

In [10]:
# distribution of the mean, deviation
mean_dis = []
stdv_dis = []
for file_name in list_of_files:
    mean, stdv = load_n_compute_mean_n_stdv(file_name)
    mean_dis.append(mean)
    stdv_dis.append(stdv)

In [16]:
plt.hist(mean_dis, bins = 50)
plt.xlabel("$Mean\ Value$")
plt.ylabel("Frequency")
plt.grid("on")
# plt.show()
plt.savefig('figures/05mean', dpi = 500)
plt.gcf().clear()

In [25]:
eul = pd.read_csv('euler/euler.csv')
eul = eul["x"]
eul = eul.tolist()
b = []
_ = [b.append(item) for item in eul if item not in b]

In [23]:
pd.DataFrame(b).to_csv('euler/tour.csv', header = 0, index = 0)

In [11]:
# construct weighted adj matrix
with open ('Network/edgelist_day.csv') as f:
    contents = f.readlines()
n = 505

In [19]:
adj = np.zeros(shape = (n, n))

In [20]:
for content in contents:
    line = re.split(r"[~,\n]+", content)
    i = int(line[0])
    j = int(line[1])
    weight = float(line[2])
    adj[i, j] = weight
    adj[j, i] = weight

In [23]:
pd.DataFrame(adj).to_csv('Network/adj.csv', header = 0, index = 0)

In [33]:
# test length
for afile in list_of_files:
    df = pd.read_csv('data/' + file_name)
    if len(df.index) != 765:
        print 1