- Identify the data types you are working with.
- Examine the distributions of your data, numerically and/or visually.
- Identify outliers.
- Identify missing data and look for patterns of missing data.
- Describe how your EDA will inform your modeling decisions and process.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import datetime as datetime


## Import data

In [63]:
tickers = pd.read_csv('../dataset/tickers.csv')
#name the header ticker

In [66]:
tickers

Unnamed: 0,ticker
0,ABIO
1,AGTC
2,AIM
3,AIRT
4,ALSK
...,...
1943,CAGZ
1944,PTSC
1945,JUHL
1946,OPST


In [68]:
tickers['ticker'].str.contains('WNEB').any()

True

### price_volume_sorted data

In [3]:
price_volume_sorted = pd.read_csv('../dataset/price_volume_sorted.csv',  parse_dates=['date'])
price_volume_sorted.head()

Unnamed: 0,ticker,date,open,high,low,close,volume,dividends,closeunadj,lastupdated
0,AAME,2015-01-02,3.99,4.03,3.98,4.03,11443.0,0.0,4.03,2018-06-13
1,AAME,2015-01-05,3.9,4.01,3.9,4.01,13727.0,0.0,4.01,2018-06-13
2,AAME,2015-01-06,3.95,3.95,3.75,3.92,9743.0,0.0,3.92,2018-06-13
3,AAME,2015-01-07,3.899,3.92,3.87,3.92,1486.0,0.0,3.92,2018-06-13
4,AAME,2015-01-08,3.92,3.95,3.915,3.95,2200.0,0.0,3.95,2018-06-13


In [4]:
price_volume_sorted.dtypes

ticker                 object
date           datetime64[ns]
open                  float64
high                  float64
low                   float64
close                 float64
volume                float64
dividends             float64
closeunadj            float64
lastupdated            object
dtype: object

In [5]:
price_volume_sorted.isnull().sum()

ticker         0
date           0
open           0
high           0
low            0
close          0
volume         0
dividends      0
closeunadj     0
lastupdated    0
dtype: int64

In [6]:
price_volume_sorted.head()

Unnamed: 0,ticker,date,open,high,low,close,volume,dividends,closeunadj,lastupdated
0,AAME,2015-01-02,3.99,4.03,3.98,4.03,11443.0,0.0,4.03,2018-06-13
1,AAME,2015-01-05,3.9,4.01,3.9,4.01,13727.0,0.0,4.01,2018-06-13
2,AAME,2015-01-06,3.95,3.95,3.75,3.92,9743.0,0.0,3.92,2018-06-13
3,AAME,2015-01-07,3.899,3.92,3.87,3.92,1486.0,0.0,3.92,2018-06-13
4,AAME,2015-01-08,3.92,3.95,3.915,3.95,2200.0,0.0,3.95,2018-06-13


In [7]:
price_volume_sorted.tail()

Unnamed: 0,ticker,date,open,high,low,close,volume,dividends,closeunadj,lastupdated
2397143,ZYXI,2020-12-14,13.1,13.59,13.1,13.34,446678.0,0.0,13.34,2020-12-14
2397144,ZYXI,2020-12-15,13.45,13.45,13.03,13.18,372095.0,0.0,13.18,2020-12-15
2397145,ZYXI,2020-12-16,13.24,13.95,13.24,13.87,560581.0,0.0,13.87,2020-12-16
2397146,ZYXI,2020-12-17,13.94,14.29,13.701,14.25,409889.0,0.0,14.25,2020-12-17
2397147,ZYXI,2020-12-18,14.25,14.39,13.73,14.0,551332.0,0.0,14.0,2020-12-18


## Basic EDA and Feature Engineering

### Price
- Since we have four values of stock price for each day, let's create a feature called Price which is the average of all these values.

In [8]:
df = price_volume_sorted
#store it as df

In [9]:
values = (df['high'] + df['low'] + df['open'] + df['close'])/4
df = df.assign(Price=values)

In [10]:
df.head()


Unnamed: 0,ticker,date,open,high,low,close,volume,dividends,closeunadj,lastupdated,Price
0,AAME,2015-01-02,3.99,4.03,3.98,4.03,11443.0,0.0,4.03,2018-06-13,4.0075
1,AAME,2015-01-05,3.9,4.01,3.9,4.01,13727.0,0.0,4.01,2018-06-13,3.955
2,AAME,2015-01-06,3.95,3.95,3.75,3.92,9743.0,0.0,3.92,2018-06-13,3.8925
3,AAME,2015-01-07,3.899,3.92,3.87,3.92,1486.0,0.0,3.92,2018-06-13,3.90225
4,AAME,2015-01-08,3.92,3.95,3.915,3.95,2200.0,0.0,3.95,2018-06-13,3.93375


In [11]:
df.describe().apply(lambda s: s.apply('{0:.5f}'.format))

Unnamed: 0,open,high,low,close,volume,dividends,closeunadj,Price
count,2397148.0,2397148.0,2397148.0,2397148.0,2397148.0,2397148.0,2397148.0,2397148.0
mean,89203.14106,94416.44721,84003.94079,87851.82232,219906.91451,0.00516,42.32264,88868.83785
std,8560189.38525,9156852.49674,8064928.76648,8443640.11737,2918537.65131,2.76311,2181.89773,8546078.74477
min,1e-05,0.0001,1e-05,0.0001,0.0,0.0,0.0,6e-05
25%,1.53,1.59,1.5,1.53,1751.0,0.0,0.86,1.535
50%,5.75,5.91,5.6,5.75,15782.0,0.0,3.56,5.75
75%,15.0,15.23,14.75,15.0,76277.0,0.0,11.37,15.0
max,1626238373.765,2049597950.406,1555678444.325,1673278326.725,955210864.0,3080.0,172250.0,1724509875.4935


In [12]:
np.percentile(df['Price'] , 47)

5.01625

Comments:
- 75% of the stocks have a price of under 15 USD, indicating that this segment of the stock market is mostly dominated by companies with low stock price.
- 25% of the stocks have a price of under 1.53 USD, which is below the price at which some academics would consider to be penny stocks (i.e. stocks less than 5 USD)
- roughly 47% of stocks in this universe would be considered as penny stocks as they are 5 USD or less


In [13]:
df.head()


Unnamed: 0,ticker,date,open,high,low,close,volume,dividends,closeunadj,lastupdated,Price
0,AAME,2015-01-02,3.99,4.03,3.98,4.03,11443.0,0.0,4.03,2018-06-13,4.0075
1,AAME,2015-01-05,3.9,4.01,3.9,4.01,13727.0,0.0,4.01,2018-06-13,3.955
2,AAME,2015-01-06,3.95,3.95,3.75,3.92,9743.0,0.0,3.92,2018-06-13,3.8925
3,AAME,2015-01-07,3.899,3.92,3.87,3.92,1486.0,0.0,3.92,2018-06-13,3.90225
4,AAME,2015-01-08,3.92,3.95,3.915,3.95,2200.0,0.0,3.95,2018-06-13,3.93375


In [27]:
len(df.ticker.unique())

1948

In [98]:
tickers.head(10)

Unnamed: 0,ticker
0,ABIO
1,AGTC
2,AIM
3,AIRT
4,ALSK
5,ALT
6,AVGR
7,BCDA
8,BEAT
9,BLIN


### Growth

In [70]:
 df.ticker.unique()

array(['AAME', 'AAOI', 'AAPC', ..., 'ZQKSQ', 'ZSAN', 'ZYXI'], dtype=object)

In [71]:
tickers['ticker'].to_numpy()

array(['ABIO', 'AGTC', 'AIM', ..., 'JUHL', 'OPST', 'IDAH'], dtype=object)

In [77]:
df.date.min()

Timestamp('2015-01-02 00:00:00')

In [None]:
why is BCAR in stock_names but not price_mapping?
#i think its because the starting date is different for some stocks

In [86]:
stock_names = tickers['ticker'].to_numpy()
#store the stock ticker as stock_names

day_prices = df[df.date == df.date.min()].Price
# find the day prices at the start of our time period which is 2015-01-02

price_mapping = {n : c for n, c in zip(stock_names, day_prices)}
# Map the price of each stock to that


In [115]:
if 'BCAR' in stock_names:
  print("yes")
else:
  print("no")

yes


In [100]:
price_mapping

{'ABIO': 4.0075,
 'AGTC': 11.0175,
 'AIM': 1.77,
 'AIRT': 1.6625,
 'ALSK': 12.54,
 'ALT': 3.3425,
 'AVGR': 0.28750000000000003,
 'BCDA': 127.071,
 'BEAT': 0.13,
 'BLIN': 63.542,
 'BMTM': 4.0275,
 'CASI': 0.755,
 'CBAT': 5065.55425,
 'CDZI': 17.2375,
 'CEI': 4.49,
 'CJJD': 21.2625,
 'CLIR': 6.7749999999999995,
 'CLRB': 12.325,
 'COHN': 3.0,
 'CPHC': 20.0425,
 'CRTD': 2.42525,
 'CVV': 4.38,
 'CYCC': 8.52275,
 'CYTK': 8.6475,
 'DSS': 0.63625,
 'ENZ': 11.350000000000001,
 'FDUS': 2.09,
 'FKWL': 6.0975,
 'FORD': 0.24,
 'GV': 4.435,
 'HALL': 23.977499999999996,
 'HBIO': 316.2,
 'IBIO': 49.129999999999995,
 'IGXT': 0.11,
 'INSE': 2.69,
 'MFIN': 58.089999999999996,
 'MKTY': 0.004,
 'MLR': 30.3175,
 'MNTX': 0.08,
 'MPB': 2.3875,
 'MVBF': 25.4,
 'MVC': 1.24,
 'NANX': 16.677500000000002,
 'NBSE': 0.004,
 'NLTX': 8.3475,
 'OPTT': 1.75,
 'PEIX': 6.2425,
 'PFBX': 21.2025,
 'PRCP': 12.4305,
 'PRKR': 10.3875,
 'PTE': 9.5075,
 'PW': 3.5250000000000004,
 'RAIL': 1.06,
 'RELV': 87.01024999999998,
 'RMNI'

In [107]:
df

Unnamed: 0,ticker,date,open,high,low,close,volume,dividends,closeunadj,lastupdated,Price
0,AAME,2015-01-02,3.990,4.03,3.980,4.03,11443.0,0.0,4.03,2018-06-13,4.00750
1,AAME,2015-01-05,3.900,4.01,3.900,4.01,13727.0,0.0,4.01,2018-06-13,3.95500
2,AAME,2015-01-06,3.950,3.95,3.750,3.92,9743.0,0.0,3.92,2018-06-13,3.89250
3,AAME,2015-01-07,3.899,3.92,3.870,3.92,1486.0,0.0,3.92,2018-06-13,3.90225
4,AAME,2015-01-08,3.920,3.95,3.915,3.95,2200.0,0.0,3.95,2018-06-13,3.93375
...,...,...,...,...,...,...,...,...,...,...,...
2397143,ZYXI,2020-12-14,13.100,13.59,13.100,13.34,446678.0,0.0,13.34,2020-12-14,13.28250
2397144,ZYXI,2020-12-15,13.450,13.45,13.030,13.18,372095.0,0.0,13.18,2020-12-15,13.27750
2397145,ZYXI,2020-12-16,13.240,13.95,13.240,13.87,560581.0,0.0,13.87,2020-12-16,13.57500
2397146,ZYXI,2020-12-17,13.940,14.29,13.701,14.25,409889.0,0.0,14.25,2020-12-17,14.04525


In [109]:
df[df['ticker'] =='BCAR']

Unnamed: 0,ticker,date,open,high,low,close,volume,dividends,closeunadj,lastupdated,Price
229311,BCAR,2015-01-02,0.140,0.140,0.130,0.140,44080.0,0.0,0.140,2018-06-13,0.1375
229312,BCAR,2015-01-05,0.140,0.150,0.135,0.135,101798.0,0.0,0.135,2018-06-13,0.1400
229313,BCAR,2015-01-06,0.140,0.140,0.135,0.135,42469.0,0.0,0.135,2018-06-13,0.1375
229314,BCAR,2015-01-07,0.140,0.150,0.140,0.140,606992.0,0.0,0.140,2018-06-13,0.1425
229315,BCAR,2015-01-08,0.140,0.140,0.140,0.140,69800.0,0.0,0.140,2018-06-13,0.1400
...,...,...,...,...,...,...,...,...,...,...,...
229453,BCAR,2015-07-28,0.135,0.135,0.135,0.135,0.0,0.0,0.135,2018-10-16,0.1350
229454,BCAR,2015-07-29,0.135,0.135,0.135,0.135,5000.0,0.0,0.135,2018-06-13,0.1350
229455,BCAR,2015-07-30,0.134,0.134,0.134,0.134,1500.0,0.0,0.134,2018-06-13,0.1340
229456,BCAR,2015-07-31,0.134,0.134,0.134,0.134,1150.0,0.0,0.134,2018-06-13,0.1340


In [126]:
df.head(100)


Unnamed: 0,ticker,date,open,high,low,close,volume,dividends,closeunadj,lastupdated,Price
0,AAME,2015-01-02,3.99,4.03,3.98,4.03,11443.0,0.0,4.03,2018-06-13,4.0075
1,AAME,2015-01-05,3.9,4.01,3.9,4.01,13727.0,0.0,4.01,2018-06-13,3.955
2,AAME,2015-01-06,3.95,3.95,3.75,3.92,9743.0,0.0,3.92,2018-06-13,3.8925
3,AAME,2015-01-07,3.899,3.92,3.87,3.92,1486.0,0.0,3.92,2018-06-13,3.90225
4,AAME,2015-01-08,3.92,3.95,3.915,3.95,2200.0,0.0,3.95,2018-06-13,3.93375
5,AAME,2015-01-09,3.95,4.01,3.94,4.0,3651.0,0.0,4.0,2018-06-13,3.975
6,AAME,2015-01-12,4.0,4.0,4.0,4.0,1027.0,0.0,4.0,2018-06-13,4.0
7,AAME,2015-01-13,3.91,4.0,3.91,4.0,1311.0,0.0,4.0,2018-06-13,3.955
8,AAME,2015-01-14,3.89,4.0,3.89,4.0,220.0,0.0,4.0,2018-06-13,3.945
9,AAME,2015-01-15,3.9,4.0,3.8,3.94,8475.0,0.0,3.94,2018-06-13,3.91


In [119]:
df.isnull().sum()

ticker         0
date           0
open           0
high           0
low            0
close          0
volume         0
dividends      0
closeunadj     0
lastupdated    0
Price          0
dtype: int64

In [116]:
if 'BCAR' in price_mapping:
  print("yes")
else:
  print("no")

no


In [104]:
base_mapping = np.array(list(map(lambda x : price_mapping[x], df['ticker'].values)))
df['Growth'] = df['Price'] / base_mapping - 1

#how do i make lambda pass if the ticker doesn't exist at the time i specify?

KeyError: 'BCAR'

## Time Series Analysis
- we want to find out the top 5 best and worst performing stocks during our chosen time period


In [14]:
sample_dates = pd.date_range(start='2015-01-02', end='2020-12-18', freq='B')


In [15]:
year_end_dates = sample_dates[sample_dates.is_year_end]
year_end_dates
# we take then end of the year for each period, except for the year 

DatetimeIndex(['2015-12-31', '2016-12-30', '2017-12-29', '2018-12-31',
               '2019-12-31'],
              dtype='datetime64[ns]', freq=None)

In [16]:
type(year_end_dates)

pandas.core.indexes.datetimes.DatetimeIndex

In [17]:
 need to add this into year_end_dates('2020-12-18')

SyntaxError: invalid syntax (<ipython-input-17-89c9475ab75f>, line 1)

In [None]:
worst_stocks = price_volume_sorted[price_volume_sorted.date == price_volume_sorted.date.max()].sort_values('Growth').head(5)


In [None]:
price_volume_sorte

In [None]:
price_volume_sorted.set_index('date', inplace=True)
