# Data Preperation

### Goals
- Deal with missing data
- Index dates
- Transform data to nice numpy arrays

In [91]:
import pandas as pd
import datetime

In [92]:
pd.read_csv("raw_data_covid.csv").head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,2/26/22,2/27/22,2/28/22,3/1/22,3/2/22,3/3/22,3/4/22,3/5/22,3/6/22,3/7/22
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,173146,173395,173659,173879,174073,174214,174214,174331,174582,175000
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,271141,271527,271563,271702,271825,271825,272030,272030,272210,272250
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,264778,264855,264936,265010,265079,265130,265186,265227,265265,265297
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,37999,37999,37999,38165,38249,38342,38434,38434,38434,38620
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,98701,98701,98741,98746,98746,98746,98796,98796,98806,98806


In [93]:
pd.read_csv("raw_data_omxh.csv").head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2020-01-22,4365.709961,4377.720215,4343.870117,4352.100098,4352.100098,42163100.0
1,2020-01-23,4350.950195,4359.919922,4303.509766,4324.77002,4324.77002,41634900.0
2,2020-01-24,4323.339844,4370.490234,4323.339844,4344.259766,4344.259766,42872400.0
3,2020-01-27,4327.450195,4327.450195,4240.470215,4246.680176,4246.680176,49010300.0
4,2020-01-28,4251.25,4336.950195,4234.939941,4323.930176,4323.930176,46380700.0


In [94]:
# Read the files
df_covid = pd.read_csv("not_so_raw_data_covid.csv") #The files has been prepared a little with excel (Deleted other region cases, Transposed erc.)
df_omxh = pd.read_csv("raw_data_omxh.csv").drop(columns=["Open", "High", "Low", "Adj Close", "Volume"])


# Format the dates
df_covid = df_covid.apply(lambda x: [datetime.datetime.strptime(x[0],"%m/%d/%y").strftime("%Y-%m-%d"), x[1]] ,axis=1, result_type="expand")
df_covid = df_covid.rename(columns={0 : "Date", 1 : "Cases"})

# Join the data frames
df = df_covid.join(df_omxh.set_index("Date"), on="Date")

df

Unnamed: 0,Date,Cases,Close
0,2020-01-22,0,4352.100098
1,2020-01-23,0,4324.770020
2,2020-01-24,0,4344.259766
3,2020-01-25,0,
4,2020-01-26,0,
...,...,...,...
771,2022-03-03,664588,
772,2022-04-03,671773,
773,2022-05-03,671773,
774,2022-06-03,671773,


In [95]:
# Clear the points with missing data
df = df.dropna()

# Reset the index so we can use the indexes as time
df = df.reset_index()

# Drop the Date culumn
df = df.drop(columns="Date")

df

Unnamed: 0,index,Cases,Close
0,0,0,4352.100098
1,1,0,4324.770020
2,2,0,4344.259766
3,5,0,4246.680176
4,6,0,4323.930176
...,...,...,...
500,756,592765,5229.529785
501,757,600718,5151.890137
502,758,605046,5115.390137
503,769,657443,5595.779785


In [96]:
# Get the last week Close value and join it
last_wk_close = df.apply(lambda x: [ x[0] + 7, x[1], x[2] ], axis=1, result_type="expand")

# Join the data frames, drop old cases, drop missing, rename columns
df = df.join(last_wk_close.set_index(0), on="index").drop(columns=1).dropna().rename(columns={"index" : "Time", 2 : "old_Close"})

df.head(20)

Unnamed: 0,Time,Cases,Close,old_Close
5,7,1,4315.410156,4352.100098
6,8,1,4322.740234,4324.77002
7,9,1,4300.080078,4344.259766
8,12,1,4041.560059,4246.680176
9,13,1,3294.219971,4323.930176
10,15,1,4033.98999,4322.740234
11,16,1,3993.01001,4300.080078
13,19,1,4317.410156,4041.560059
14,20,2,4105.879883,3294.219971
16,22,2,4548.319824,4033.98999


In [97]:
# Export to .csv file
df.to_csv("cleaned_data.csv", index=False)