## First off, let's get all of our imports out of the way

In [1]:
import sklearn 
import pandas 
import numpy
from datetime import datetime
from datetime import timedelta
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import *
from sklearn.metrics import mean_absolute_error as mae

### Now, we can load all of our datasets and start a global set

In [2]:
# Covid data
def _read_data_rows(path):
    """Return the non-blank lines of the text file at `path`, without its first (header) line.

    The file is opened in a `with` block so the handle is closed promptly, and
    blank lines (e.g. the one produced by a trailing newline) are dropped so
    downstream parsing never sees an empty row.
    """
    with open(path, 'r') as handle:
        next(handle, None)  # discard the header line
        return [row.rstrip('\n') for row in handle if row.strip()]


covid_2021 = _read_data_rows("counties21.csv")
covid_2022 = _read_data_rows("counties22.csv")

# Supporting, predictive data
# The first three spreadsheet rows are skipped before the header is read.
poverty_data = pandas.read_excel("county_poverty.xls", skiprows=3)
# COUNTYFP is read as a string so it is not parsed as a number.
mask_data = pandas.read_csv(
    "county_masks.csv",
    dtype={'COUNTYFP': str, "NEVER": float, "RARELY": float, "SOMETIMES": float, "FREQUENTLY": float, "ALWAYS": float},
)
# NOTE(review): 'big5' is an unusual encoding for this file -- confirm it is intended.
pop_data = pandas.read_csv("county_pop.csv", encoding='big5')

This initializes dictionaries to start collecting all of our information for the final dataset 

In [3]:
# Time-varying records, filled in later as dataset[FIPS][date] -> dict of values
dataset = dict()
# Per-county attributes, filled in later as county_info[FIPS] -> dict of values
county_info = dict()

### Now let's fill the dictionaries with helpful data
First off are the COVID statistics themselves, including the average cases per 100k (`avg_per_100k`)

In [4]:
# CLEAN COVID DATA
# Each row is split on commas. Fields read below:
#   [0] -> date, [1] -> id whose last five characters are used as the FIPS code,
#   [5] -> stored as avg_cases, [6] -> stored as avg_per_100k.
for raw_row in covid_2021 + covid_2022:
    fields = raw_row.split(",")

    # A trailing newline in the CSV leaves an empty string behind after
    # split('\n'); such rows (and any truncated ones) cannot be indexed below,
    # so skip them instead of raising IndexError.
    if len(fields) < 7:
        continue

    date = fields[0]
    FIPS = fields[1][-5:]

    # Ensure the FIPS code has an entry in both lookup tables
    dataset.setdefault(FIPS, {})
    county_info.setdefault(FIPS, {})

    # Add the data
    avg_cases = float(fields[5])
    try:
        avg_per_100k = float(fields[6])
    except ValueError:
        # The per-100k field is empty or non-numeric for this row: fall back to zero
        avg_per_100k = 0.0
    dataset[FIPS][date] = {'FIPS': FIPS, 'date': date, 'avg_cases': avg_cases, 'avg_per_100k': avg_per_100k}
    

This will get adjacent-county data, mapping each FIPS code to the list of FIPS codes of its adjacent counties

In [5]:
# CLEAN ADJ DATA
# Maps a county FIPS code to the list of FIPS codes listed under it in the file.
adj_counties = {}
cur_key = None
with open("county_adjacency.txt", 'r', encoding="iso-8859-1") as adj_file:
    for line in adj_file:
        line = line.rstrip('\n')
        if not line:
            continue  # blank line: nothing to parse

        fields = line.split('\t')
        if line[0] != '\t':
            # Header row: the second tab-separated field is the county's own code.
            if len(fields) < 2:
                continue  # malformed header; keep appending to the previous county
            cur_key = fields[1].strip()
            adj_counties[cur_key] = []
            # Assumes the Census adjacency layout (county name, county code,
            # neighbour name, neighbour code), in which the header row already
            # carries the first neighbour -- TODO confirm against the file.
            # Without this the first neighbour of every county is dropped.
            if len(fields) >= 4 and fields[-1].strip():
                adj_counties[cur_key].append(fields[-1].strip())
        elif cur_key is not None:
            # Continuation row (starts with a tab): the last field is a neighbour's code.
            county_fips = fields[-1].strip()
            if county_fips:
                adj_counties[cur_key].append(county_fips)

Next up is population data, which will fill the county dataset dictionary since this does not change over time (relatively)

In [6]:
   
# CLEAN POP DATA
# Totals are accumulated in a local dict and only then written into
# county_info, so re-running this cell replaces the stored population instead
# of adding to it a second time.
pop_totals = {}
for state, county, pop in zip(pop_data["STATE"], pop_data['COUNTY'], pop_data['TOT_POP']):
    # Zero-pad to a 2-digit state code and 3-digit county code, then join them
    fips = f"{state:002}" + f"{county:003}"

    # Make sure its in the covid dataset, otherwise ignore it
    if fips not in dataset or fips not in county_info:
        continue

    # Make a running total of the population (stored as a float)
    pop_totals[fips] = pop_totals.get(fips, 0.0) + int(pop)

for fips, total in pop_totals.items():
    county_info[fips]['population'] = total

This next section finds mask usage data, which will go into the county-specific dictionary since it is also constant across the dates

In [7]:
    
# CLEAN MASKS
# Fold the five response columns of each row into three buckets:
# low = never + rarely, med = sometimes, high = frequently + always.
for _, row in mask_data.iterrows():
    fips, never, rare, some, freq, always = row

    # Only counties already present in both lookup tables are kept
    if fips not in dataset or fips not in county_info:
        continue

    county_info[fips].update(
        low_mask=never + rare,
        med_mask=some,
        high_mask=freq + always,
    )

Next, we'll find, for each county and date, the highest per-100k average cases among its adjacent counties

In [8]:
# FIND ADJ COUNTY HIGHEST 100k_avg
running_total = 0   # sum of every max_adj_100k value computed from real neighbour data
running_count = 0   # how many values went into running_total
for county in dataset:
    for date in dataset[county]:
        try:
            # Largest avg_per_100k among the adjacent counties on this date (floor of 0)
            max_100k = 0
            for neighbour in adj_counties[county]:
                rate = dataset[neighbour][date]['avg_per_100k']
                if rate > max_100k:
                    max_100k = rate
        except KeyError:
            # The county has no adjacency entry, or a neighbour has no record for
            # this date: fall back to the mean of the values computed so far.
            # The sum is divided by the number of values in it; dividing by the
            # county index would not give an average.
            max_100k = running_total / running_count if running_count else 0
        else:
            running_total += max_100k
            running_count += 1
        dataset[county][date]['max_adj_100k'] = max_100k

We will now fill poverty data

In [9]:
# Store each county's poverty estimate as a fraction of its population.
poverty_columns = zip(
    poverty_data["State FIPS Code"],
    poverty_data['County FIPS Code'],
    poverty_data['Poverty Estimate, All Ages'],
)
for state, county, poverty in poverty_columns:
    # Zero-pad to a 2-digit state code and 3-digit county code, then join them
    fips = f"{state:002}" + f"{county:003}"
    try:
        info = county_info[fips]
        info['poverty'] = poverty / info['population']
    except (KeyError, TypeError):
        # Unknown county, no population recorded, or an estimate that cannot be divided
        continue

### Here are some nice helper functions

In [10]:
# Shift a date string forward by a fortnight
def two_weeks_after(date):
    """Return the date 14 days after `date`; input and output are 'YYYY-MM-DD' strings."""
    iso_format = "%Y-%m-%d"
    shifted = datetime.strptime(date, iso_format) + timedelta(weeks=2)
    return shifted.strftime(iso_format)
def days_since_start(date):
    """Return the whole number of days from 2021-01-01 to `date` ('YYYY-MM-DD').

    The result is negative for dates before 2021-01-01.
    """
    parsed = datetime.strptime(date, "%Y-%m-%d")
    reference_day = datetime(2021, 1, 1)
    return (parsed - reference_day).days

### Create the Final Dataset

In [13]:
# County whose rows without a known 2-week outcome are set aside for prediction
PREDICT_FIPS = '51059'

# Row layout (both lists):
#   [2-week avg_cases, FIPS, days since 2021-01-01, avg_cases, avg_per_100k,
#    max_adj_100k, poverty, low_mask, med_mask, high_mask, population]
final_data = []   # rows whose 2-week outcome is present in the data
predicting = []   # PREDICT_FIPS rows with no outcome yet; column 0 is NaN
for c in dataset:
    for d in dataset[c]:
        record = dataset[c][d]
        FIPS = record['FIPS']

        # Add the actual 2-week covid numbers when that date exists
        future_date = two_weeks_after(d)
        has_outcome = future_date in dataset[c]
        if has_outcome:
            record['2_week_cases'] = dataset[c][future_date]['avg_cases']
        elif c != PREDICT_FIPS:
            # No outcome to learn from, and not the county being predicted
            continue

        # Date specific info
        features = [
            int(FIPS),
            days_since_start(record['date']),
            record['avg_cases'],
            record['avg_per_100k'],
            record['max_adj_100k'],
        ]

        # County Specific info -- looked up by this row's own FIPS code
        try:
            info = county_info[FIPS]
            features += [
                info['poverty'],
                info['low_mask'],
                info['med_mask'],
                info['high_mask'],
                info['population'],
            ]
        except KeyError:
            # County is missing poverty, mask, or population data: no usable row
            continue

        if has_outcome:
            final_data.append([record['2_week_cases']] + features)
        else:
            # Keep the same column layout; the unknown target is marked as NaN
            predicting.append([numpy.nan] + features)

np_arr = numpy.array(final_data, dtype=float)
        

In [14]:
# Write the assembled array to disk in NumPy's binary format
numpy.save(file="covid_clean_data", arr=np_arr)

### Now we start the Machine Learning portion!

Let's split our data up - 80% to train, 20% to test

In [None]:
# First make a train test split
SPLIT_SEED = 42        # fixed seed so a fresh run reproduces the same split
TRAIN_FRACTION = 0.8   # share of rows used for training

# Shuffle via a seeded permutation into a new array; np_arr itself is left
# untouched so re-running this cell gives the same result.
split_rng = numpy.random.default_rng(SPLIT_SEED)
shuffled = np_arr[split_rng.permutation(len(np_arr))]

# Column 0 is the target; the remaining columns are the features
size = int(len(shuffled) * TRAIN_FRACTION)
x_train, y_train = shuffled[:size, 1:], shuffled[:size, 0]
x_test, y_test = shuffled[size:, 1:], shuffled[size:, 0]

This is our model pipeline

In [None]:
# Model: degree-4 polynomial feature expansion -> standard scaling -> ridge regression
model_steps = (
    PolynomialFeatures(degree=4),
    StandardScaler(),
    Ridge(alpha=0.01),
)
pipe = make_pipeline(*model_steps)
# Fit the whole pipeline on the training split
pipe.fit(x_train, y_train)

Let's see how it did!

In [None]:
# Score the fitted pipeline on the held-out split
test_predictions = pipe.predict(x_test)
test_mae = mae(y_test, test_predictions)
print(f"model mean absolute error: {test_mae}")

Now let's get predictions for Fairfax County (where I'm from!)

In [2]:
# Rows in `predicting` use the training-row layout, so column 0 is the target
# slot and must be dropped before the features are handed to the model.
predicting_rows = numpy.array(predicting, dtype=float)
if predicting_rows.ndim == 2:
    # Use the already-fitted pipeline; calling fit() here would retrain it
    predicting_vals = pipe.predict(predicting_rows[:, 1:])
else:
    # No rows were set aside for prediction
    predicting_vals = numpy.array([])

print(f"fairfax county predictions: {predicting_vals}")

NameError: name 'pipe' is not defined