# EDA - Price History

In [1]:
# Imports first (stdlib, then third-party), followed by notebook configuration.
from pathlib import Path

import pandas as pd

# The CSV is expected in a `data` directory that sits next to the current working directory.
# `cwd` is a classmethod, so it is called on the class rather than on a throwaway instance.
price_history_csv_file = Path.cwd().parent / 'data' / 'price_history.csv'

# Render floats with two decimal places in every DataFrame/Series display.
pd.set_option('display.float_format', '{:.2f}'.format)

In [2]:
# Load the price-history CSV into a DataFrame.
price_history = pd.read_csv(filepath_or_buffer=price_history_csv_file)

In [3]:
# Preview the first five rows of the data.
price_history.head(n=5)

Unnamed: 0,ticker_symbol,date,open,high,low,close,adj_close,volume
0,STK001,2024-01-02,322.69,327.73,316.67,321.67,321.67,988746
1,STK001,2024-01-03,321.67,321.92,318.29,321.76,321.76,1184254
2,STK001,2024-01-04,321.76,327.13,316.44,321.4,321.4,740813
3,STK001,2024-01-05,321.4,326.17,314.08,320.21,320.21,1042278
4,STK001,2024-01-08,320.21,324.25,314.74,322.14,322.14,860784


In [4]:
# Summarise the frame: number of rows, per-column non-null counts and dtypes, and memory usage.
price_history.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126000 entries, 0 to 125999
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   ticker_symbol  126000 non-null  object 
 1   date           126000 non-null  object 
 2   open           126000 non-null  float64
 3   high           126000 non-null  float64
 4   low            126000 non-null  float64
 5   close          126000 non-null  float64
 6   adj_close      126000 non-null  float64
 7   volume         126000 non-null  int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 7.7+ MB


In [5]:
# Report the (rows, columns) shape, then show summary statistics for every column
# (include='all' covers the non-numeric columns as well as the numeric ones).
frame_shape = price_history.shape
print(f'Shape of price_history dataframe: {frame_shape}')
price_history.describe(include='all')

Shape of price_history dataframe: (126000, 8)


Unnamed: 0,ticker_symbol,date,open,high,low,close,adj_close,volume
count,126000,126000,126000.0,126000.0,126000.0,126000.0,126000.0,126000.0
unique,500,252,,,,,,
top,STK001,2024-01-02,,,,,,
freq,252,500,,,,,,
mean,,,270.09,273.77,266.54,270.18,270.18,1001173.85
std,,,163.69,165.99,161.59,163.85,163.85,200422.89
min,,,11.93,11.98,11.7,11.93,11.93,135172.0
25%,,,132.93,134.68,131.21,132.93,132.93,866691.25
50%,,,259.04,262.47,255.65,259.03,259.03,1001124.5
75%,,,383.41,388.5,378.27,383.39,383.39,1136545.25


In [6]:
# Normalise text: strip surrounding whitespace and lowercase every string value and
# every column name, then re-run describe() to recheck counts, unique values and freq.
def _normalize_text(value):
    """Return `value` stripped and lowercased if it is a str; otherwise return it unchanged."""
    return value.strip().lower() if isinstance(value, str) else value


# Numeric columns cannot hold str values, so only the non-numeric columns are mapped;
# this avoids calling the Python-level helper on every numeric cell.
# Work on a copy so the frame loaded from the CSV is not modified in place.
price_history = price_history.copy()
for column_name in price_history.select_dtypes(exclude='number').columns:
    price_history[column_name] = price_history[column_name].map(_normalize_text)
price_history.columns = price_history.columns.str.strip().str.lower()
price_history.describe(include='all')

Unnamed: 0,ticker_symbol,date,open,high,low,close,adj_close,volume
count,126000,126000,126000.0,126000.0,126000.0,126000.0,126000.0,126000.0
unique,500,252,,,,,,
top,stk001,2024-01-02,,,,,,
freq,252,500,,,,,,
mean,,,270.09,273.77,266.54,270.18,270.18,1001173.85
std,,,163.69,165.99,161.59,163.85,163.85,200422.89
min,,,11.93,11.98,11.7,11.93,11.93,135172.0
25%,,,132.93,134.68,131.21,132.93,132.93,866691.25
50%,,,259.04,262.47,255.65,259.03,259.03,1001124.5
75%,,,383.41,388.5,378.27,383.39,383.39,1136545.25


In [7]:
# Check for dupes: for each column, count the entries that repeat an earlier value
# in that same column. This is a per-column count, not a count of duplicated rows.
duplicates_per_column = price_history.apply(lambda column: column.duplicated().sum())
print(f'Duplicate values in price history:\n{duplicates_per_column}')

Duplicate values in price history:
ticker_symbol    125500
date             125748
open                  0
high                  0
low                   0
close                 0
adj_close             0
volume            10360
dtype: int64


In [8]:
# Count the missing (NaN/None) entries in each column.
missing_per_column = price_history.isna().sum()
print(f'Missing values in price history:\n{missing_per_column}')

Missing values in price history:
ticker_symbol    0
date             0
open             0
high             0
low              0
close            0
adj_close        0
volume           0
dtype: int64


In [9]:
# Collect the distinct ticker symbols, in order of first appearance, and display them.
price_history_tickers = price_history['ticker_symbol'].unique().tolist()
price_history_tickers

['stk001',
 'stk002',
 'stk003',
 'stk004',
 'stk005',
 'stk006',
 'stk007',
 'stk008',
 'stk009',
 'stk010',
 'stk011',
 'stk012',
 'stk013',
 'stk014',
 'stk015',
 'stk016',
 'stk017',
 'stk018',
 'stk019',
 'stk020',
 'stk021',
 'stk022',
 'stk023',
 'stk024',
 'stk025',
 'stk026',
 'stk027',
 'stk028',
 'stk029',
 'stk030',
 'stk031',
 'stk032',
 'stk033',
 'stk034',
 'stk035',
 'stk036',
 'stk037',
 'stk038',
 'stk039',
 'stk040',
 'stk041',
 'stk042',
 'stk043',
 'stk044',
 'stk045',
 'stk046',
 'stk047',
 'stk048',
 'stk049',
 'stk050',
 'stk051',
 'stk052',
 'stk053',
 'stk054',
 'stk055',
 'stk056',
 'stk057',
 'stk058',
 'stk059',
 'stk060',
 'stk061',
 'stk062',
 'stk063',
 'stk064',
 'stk065',
 'stk066',
 'stk067',
 'stk068',
 'stk069',
 'stk070',
 'stk071',
 'stk072',
 'stk073',
 'stk074',
 'stk075',
 'stk076',
 'stk077',
 'stk078',
 'stk079',
 'stk080',
 'stk081',
 'stk082',
 'stk083',
 'stk084',
 'stk085',
 'stk086',
 'stk087',
 'stk088',
 'stk089',
 'stk090',
 'stk091',

## Notes
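- The `date` column is currently stored as `object` (strings) and should be converted to `datetime`.
- The `volume` column is currently `int64` and should be converted to float.
- `ticker_symbol` values run from stk001 to stk500 (500 unique tickers).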