# Data Correction [stock splits]

While exploring the data, I've noticed that prices were not adjusted for stock split events. The stock splits will affect calculation of any ratios based on prices.

In this notebook, I will adjust the prices for stock splits and save the data to a new file ending with 'v1'.

In [1]:
# Importing the necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Read the two input datasets: the finance data and the stock split history

# locations of the source CSV files (relative to this notebook)
filepath_finance_data = '../Data/Datasets/updated_finance_data.csv'
filepath_split_data = '../Data/Datasets/split_history.csv'

# one DataFrame per file
finance_data = pd.read_csv(filepath_finance_data)
split_data = pd.read_csv(filepath_split_data)


In [3]:
# Tidy the finance data for inspection

# display option: show floats with thousands separators and 2 decimal places
pd.set_option('display.float_format', '{:,.2f}'.format)

# put the columns in alphabetical order, then order the rows by
# symbol (ascending) and date_x (ascending)
finance_data = (
    finance_data
    .reindex(columns=sorted(finance_data.columns))
    .sort_values(by=['symbol', 'date_x'], ascending=[True, True])
)

# preview the first 10 rows
finance_data.head(10)

Unnamed: 0,addTotalDebt,assetGrowth,assetTurnover,averageInventory,averagePayables,averageReceivables,beta,bookValuePerShare,bookValueperShareGrowth,calendarYear,...,threeYNetIncomeGrowthPerShare,threeYOperatingCFGrowthPerShare,threeYRevenueGrowthPerShare,threeYShareholdersEquityGrowthPerShare,totalDebtToCapitalization,updatedFromDate,weightedAverageSharesDilutedGrowth,weightedAverageSharesGrowth,workingCapital,zip
39,36403000000,0.13,0.28,2197000000.0,34098500000.0,17084500000.0,1.24,5.28,0.12,2015,...,0.54,1.14,0.8,0.53,0.23,2024-11-25,-0.02,-0.02,9792000000.0,95014
38,43871000000,-0.0,0.22,2339500000.0,30580000000.0,13807000000.0,1.24,5.57,0.05,2015,...,0.32,0.54,0.67,0.42,0.25,2024-11-25,-0.01,-0.01,9162000000.0,95014
37,54418000000,0.05,0.18,2219000000.0,24816500000.0,10637500000.0,1.24,5.48,-0.01,2015,...,0.38,0.68,0.62,0.29,0.3,2024-11-25,-0.01,-0.01,5668000000.0,95014
36,64328000000,0.06,0.18,2195500000.0,30982000000.0,13609500000.0,1.24,5.28,-0.04,2015,...,0.57,0.72,0.67,0.17,0.35,2024-11-25,-0.02,-0.01,8768000000.0,95014
35,62963000000,0.01,0.26,2400000000.0,34401000000.0,14901000000.0,1.24,5.77,0.09,2016,...,0.66,0.39,0.65,0.19,0.33,2024-11-25,-0.02,-0.02,127000000.0,95014
34,79872000000,0.04,0.17,2366000000.0,29205000000.0,12591000000.0,1.24,5.91,0.03,2016,...,0.31,0.11,0.38,0.15,0.38,2024-11-25,-0.01,-0.01,19327000000.0,95014
33,84935000000,0.0,0.14,2056000000.0,25708000000.0,11971500000.0,1.24,5.81,-0.02,2016,...,0.33,0.6,0.42,0.21,0.4,2024-11-25,-0.01,-0.01,22275000000.0,95014
32,87032000000,0.05,0.15,1981500000.0,31806000000.0,20506500000.0,1.24,5.97,0.03,2016,...,0.42,0.92,0.47,0.22,0.4,2024-11-25,-0.01,-0.01,27863000000.0,95014
31,87549000000,0.03,0.24,2422000000.0,37902000000.0,28638000000.0,1.24,6.25,0.05,2017,...,0.62,0.42,0.61,0.21,0.4,2024-11-25,-0.01,-0.01,19202000000.0,95014
30,98522000000,0.01,0.16,2811000000.0,33541500000.0,24294500000.0,1.24,6.41,0.03,2017,...,0.26,0.09,0.36,0.31,0.42,2024-11-25,-0.01,-0.01,28648000000.0,95014


In [4]:
# Keep only the stock splits dated on or after the cutoff; earlier splits are dropped.
# The cutoff is 2013-01-01 (the previous comment said 2014-01-01, but the filter
# and the displayed result both keep splits from 2013).
# NOTE: `date` is compared as text, which orders correctly only while the values
# are ISO 'YYYY-MM-DD' strings - TODO confirm for the full file.

SPLIT_CUTOFF_DATE = '2013-01-01'

# .copy() gives an independent DataFrame, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning
split_data = split_data[split_data['date'] >= SPLIT_CUTOFF_DATE].copy()

split_data

Unnamed: 0,date,label,numerator,denominator,symbol
0,2020-08-31,"August 31, 20",4.00,1.00,AAPL
1,2014-06-09,"June 09, 14",7.00,1.00,AAPL
5,2013-01-02,"January 02, 13",5000.00,2399.00,ABT
37,2014-10-01,"October 01, 14",1139.00,1000.00,ADP
50,2018-03-19,"March 19, 18",2.00,1.00,AFL
...,...,...,...,...,...
911,2013-12-02,"December 02, 13",313.00,250.00,TT
916,2021-06-17,"June 17, 21",10.00,1.00,TTD
937,2014-06-09,"June 09, 14",2.00,1.00,UNP
943,2015-03-19,"March 19, 15",4.00,1.00,V


In [5]:
# Compute the adjustment factor for each split and keep only the columns needed afterwards.
#
# adjustment_factor = denominator / numerator
# (e.g. numerator 4, denominator 1 gives 0.25)
#
# .assign returns a new DataFrame instead of writing a column into the filtered
# frame in place, which avoids pandas' SettingWithCopyWarning.
# NOTE: this cell replaces `split_data` with a 3-column frame, so re-running it
# without re-running the cells above fails (numerator/denominator are gone).

split_data = (
    split_data
    .assign(adjustment_factor=lambda splits: splits['denominator'] / splits['numerator'])
    .loc[:, ['symbol', 'date', 'adjustment_factor']]
)

split_data

Unnamed: 0,symbol,date,adjustment_factor
0,AAPL,2020-08-31,0.25
1,AAPL,2014-06-09,0.14
5,ABT,2013-01-02,0.48
37,ADP,2014-10-01,0.88
50,AFL,2018-03-19,0.50
...,...,...,...
911,TT,2013-12-02,0.80
916,TTD,2021-06-17,0.10
937,UNP,2014-06-09,0.50
943,V,2015-03-19,0.25
