In [None]:
# initial setup: load the raw stock table from disk and preview it

import pandas as pd

# No parse_dates is requested, so the 'Date' column is loaded as plain text.
# Per the preview printed below, the remaining columns are AAPL, GOOGL, MSFT.
stock_data = pd.read_csv(filepath_or_buffer='../data/raw/stock_data.csv')

# Show the first five rows as a sanity check that the file parsed as expected.
print(stock_data.head(n=5))

         Date       AAPL      GOOGL       MSFT
0  2015-01-02  24.261045  26.319653  39.933064
1  2015-01-05  23.577576  25.818159  39.565838
2  2015-01-06  23.579798  25.180983  38.985107
3  2015-01-07  23.910433  25.106924  39.480442
4  2015-01-08  24.829126  25.194403  40.641872


In [35]:
# accessing attributes of data
# Tour of the basic ways to read from `stock_data`: column labels, a single
# column, several columns, one row, row iteration, and per-ticker statistics.

# headers
print(stock_data.columns)

# individual tickers
# Bracket access is used instead of `stock_data.AAPL` because attribute access
# breaks for column names that clash with DataFrame attributes/methods.
# `.iloc[:5]` makes it explicit that the first five rows are taken by position.
print(stock_data['AAPL'].iloc[:5])

# multiple tickers
print(stock_data[['AAPL', 'GOOGL']].head(5))

# specific row
# Fourth row by position; printed as a Series of column -> value.
print(stock_data.iloc[3])

# loop through rows
# Filter to the rows whose index label is below 3 *before* iterating, so the
# Python-level loop touches three rows rather than walking the whole frame.
# The label-based condition is the same one the loop body used to apply.
first_rows = stock_data.loc[stock_data.index < 3]
for _, row in first_rows.iterrows():
    print(row['AAPL'], row['GOOGL'])

# stats
# Assumes the first column is the date column and every other column is a
# numeric ticker series (true for the columns printed above).
tickers = stock_data.columns[1:] # exclude date column
for ticker in tickers:
    series = stock_data[ticker]  # look the column up once, reuse for all four stats
    print(f"{ticker}: Mean={series.mean()}, Max={series.max()}, Min={series.min()}, Std={series.std()}")


Index(['Date', 'AAPL', 'GOOGL', 'MSFT'], dtype='object')
0    24.261045
1    23.577576
2    23.579798
3    23.910433
4    24.829126
Name: AAPL, dtype: float64
        AAPL      GOOGL
0  24.261045  26.319653
1  23.577576  25.818159
2  23.579798  25.180983
3  23.910433  25.106924
4  24.829126  25.194403
Date     2015-01-07
AAPL      23.910433
GOOGL     25.106924
MSFT      39.480442
Name: 3, dtype: object
24.261045455932617 26.319652557373047
23.57757568359375 25.818159103393555
23.579797744750977 25.18098258972168
AAPL: Mean=93.94992097875841, Max=258.1037292480469, Min=20.624053955078125, Std=65.50472800790635
GOOGL: Mean=82.24142878916003, Max=196.19625854492188, Min=24.7048397064209, Std=43.516326090935245
MSFT: Mean=181.0127967976993, Max=463.240966796875, Min=34.501617431640625, Std=123.18836003648337


In [36]:
# data quality checks
# Reports missing values and duplicate dates, then orders the frame by date.

# checking for missing / NaN values
# `DataFrame.isnull()` is an alias of `DataFrame.isna()`, so "missing" and
# "NaN" are the same test in pandas. The mask is therefore computed once; both
# messages are still printed so the cell's output is unchanged.
# `.any().any()` reduces columns first, then the whole frame, to a single flag
# (the earlier `any(counts) != 0` compared a bool to 0, which only worked by
# accident).
has_missing = stock_data.isna().any().any()
if has_missing:
    print("Missing values found.")
    print("NaN values found.")

# checking for duplicate dates
# `duplicated` flags every repeat of a date after its first occurrence.
if stock_data.duplicated(subset=['Date']).any():
    print("Duplicate dates found.")

# make sure data is sorted by date
# Reassign instead of `inplace=True` so the cell is safe to re-run and the
# frame shown by earlier cells is not mutated behind the reader's back.
# NOTE(review): 'Date' appears to be loaded as text (no parse_dates in the load
# cell), so this is a lexicographic sort; it matches chronological order only
# while dates stay zero-padded YYYY-MM-DD as in the preview. Confirm, or parse
# the column with pd.to_datetime before sorting.
stock_data = stock_data.sort_values(by='Date')
print(stock_data.head(7))

         Date       AAPL      GOOGL       MSFT
0  2015-01-02  24.261045  26.319653  39.933064
1  2015-01-05  23.577576  25.818159  39.565838
2  2015-01-06  23.579798  25.180983  38.985107
3  2015-01-07  23.910433  25.106924  39.480442
4  2015-01-08  24.829126  25.194403  40.641872
5  2015-01-09  24.855759  24.886745  40.300274
6  2015-01-12  24.243290  24.704840  39.796417
