In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from mlxtend.plotting import plot_pca_correlation_graph
import numpy as np
import plotly.express as px
import requests

In [13]:
# read the csv file from local and skip the not necessary infomation in the file
Covid_Cases = pd.read_csv('./data/COVID-19 Cases.csv', skiprows= [0,1,2,3,4])
Covid_Cases = Covid_Cases.rename({'Data': 'Cases'}, axis=1)
Covid_Cases = Covid_Cases[Covid_Cases['DataFormat'] == 'Number' ]

Covid_Hospitalizations = pd.read_csv('./data/COVID-19 Hospitalizations.csv', 
                                     skiprows= [0,1,2,3,4])
Covid_Hospitalizations = Covid_Hospitalizations.rename({'Data': 'Hospitalizations'}, axis=1)
Covid_Hospitalizations = Covid_Hospitalizations[Covid_Hospitalizations['DataFormat'] == 'Number' ]

Covid_Deaths = pd.read_csv('./data/COVID-19 Deaths.csv', skiprows= [0,1,2,3,4])
Covid_Deaths = Covid_Deaths.rename({'Data': 'Deaths'}, axis=1)
Covid_Deaths = Covid_Deaths[Covid_Deaths['DataFormat'] == 'Number' ]

Covid = pd.merge(Covid_Cases, Covid_Hospitalizations,on = [ "TimeFrame","Location","Fips"] )

Covid = pd.merge(Covid, Covid_Deaths,on = [ "TimeFrame","Location","Fips"] )
Covid
# Local Area Unemployment Statistics from https://data.bls.gov/cgi-bin/surveymost

# Get the features
features = ['Cases','Hospitalizations','Deaths','labor force','employment',
           'unemployment']


In [14]:
Covid = Covid[Covid['Location'] != 'New York City' ]
Covid

Unnamed: 0,Location,TimeFrame,DataFormat_x,Cases,Fips,DataFormat_y,Hospitalizations,DataFormat,Deaths
13,Bronx,05/19,Number,43508.0,36005,Number,11420.0,Number,3409.0
14,Brooklyn,05/19,Number,52347.0,36047,Number,13620.0,Number,4894.0
15,Manhattan,05/19,Number,23790.0,36061,Number,7423.0,Number,2173.0
16,Queens,05/19,Number,58921.0,36081,Number,15931.0,Number,4801.0
17,Staten Island,05/19,Number,12988.0,36085,Number,2199.0,Number,770.0
...,...,...,...,...,...,...,...,...,...
223,Bronx,1/27,Number,99693.0,36005,Number,16768.0,Number,4415.0
224,Brooklyn,1/27,Number,144869.0,36047,Number,22329.0,Number,6555.0
225,Manhattan,1/27,Number,69173.0,36061,Number,11081.0,Number,2861.0
226,Queens,1/27,Number,147389.0,36081,Number,23622.0,Number,6762.0


In [24]:
# api_covid = 'https://data.cityofnewyork.us/resource/rc75-m7u3.csv'
# df_covid = pd.read_csv(api_covid)

df_covid = pd.read_csv('./data/COVID-19_Daily_Counts_of_Cases__Hospitalizations__and_Deaths.csv')
# Create df_covid

df_covid.head()


Unnamed: 0,DATE_OF_INTEREST,CASE_COUNT,HOSPITALIZED_COUNT,DEATH_COUNT,DEATH_COUNT_PROBABLE,CASE_COUNT_7DAY_AVG,HOSP_COUNT_7DAY_AVG,DEATH_COUNT_7DAY_AVG,BX_CASE_COUNT,BX_HOSPITALIZED_COUNT,...,QN_CASE_COUNT_7DAY_AVG,QN_HOSPITALIZED_COUNT_7DAY_AVG,QN_DEATH_COUNT_7DAY_AVG,SI_CASE_COUNT,SI_HOSPITALIZED_COUNT,SI_DEATH_COUNT,SI_CASE_COUNT_7DAY_AVG,SI_HOSPITALIZED_COUNT_7DAY_AVG,SI_DEATH_COUNT_7DAY_AVG,INCOMPLETE
0,02/29/2020,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,03/01/2020,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,03/02/2020,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,03/03/2020,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,03/04/2020,5,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Unemployment data
#API link from:
#https://labor.ny.gov/stats/LSLAUS.shtm
# api_labor = 'https://www.labor.ny.gov/stats/lausCSV.asp?PASS=1&geog=21093561'
# data_labor = requests.get(api_labor).text

# With downloaded labor file
try:
    with open('./data/lausCSV.csv', 'r') as f:
        data_labor = f.read()
except IOError:
    print('Error: Labor data file not found')
# Grab rows with dataset title, column names, and two years
select_file_rows = lambda lines: [lines[idx-2:idx+2] for idx, line in enumerate(lines) if line.startswith('2020')]

# Split row strings into lists, reverse the order (old to new), remove three columns (year, annual average, and trailing empty column)
split_rows_into_list_by_year = lambda rows: {
    variable[0].strip().title(): {l[0]: l[1:-2] for l in list(map(lambda x: x.strip().split(','), variable[-1:1:-1]))}
    for variable in rows
}

# Concat years into one list per variable all in one dict
concat_years_into_one_list = lambda variables: {
    variable_name: [value for lst in d.values() for value in lst]
    for variable_name, d in variables.items()
}

# Create dataframe from dict of variables
def create_df(variables):
    df = pd.DataFrame(
        variables,
        index=pd.date_range(start='2019-01-01', end='2020-12-31', freq='m')
    )
    # Remove NaN values from dataframe
    df = df.replace('', np.nan).dropna()
    
    # Fix one column's datatype
    df['Unemployment Rate'] = df['Unemployment Rate'].transform(
        lambda x: float(x.strip('%')) / 100 if type(x) is str and x else np.nan,
        axis=0
    )
    
    # Fix type of remaining columns
    types = {
        'Unemployment Rate': float,
        'Labor Force': int,
        'Employed': int,
        'Unemployed': int,
    }
    df = df.astype(types)
    
    return df

df_labor = create_df(
    concat_years_into_one_list(
        split_rows_into_list_by_year(
            select_file_rows(
                data_labor.split('\n')))))

# Note that all columns are object type

display(df_labor.head())



Unnamed: 0,Unemployment Rate,Labor Force,Employed,Unemployed
2019-01-31,0.048,9515500,9062800,452700
2019-02-28,0.045,9542000,9108400,433700
2019-03-31,0.042,9529100,9128600,400600
2019-04-30,0.036,9449400,9107200,342200
2019-05-31,0.036,9457200,9112100,345100


In [10]:
# Set up the target or y for LDA training 
y = Unemployment.loc[:,['Year']].values
# Target name
targets = [2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020]
for n in range(0,len(y)):
    if y[n] == targets[0]:
        y[n] = 0
    elif y[n] == targets[1]:
        y[n] = 1
    elif y[n] == targets[2]:
        y[n] = 2
    elif y[n] == targets[3]:
        y[n] = 3
    elif y[n] == targets[4]:
        y[n] = 4
    elif y[n] == targets[5]:
        y[n] = 5
    elif y[n] == targets[6]:
        y[n] = 6
    elif y[n] == targets[7]:
        y[n] = 7
    elif y[n] == targets[8]:
        y[n] = 8
    elif y[n] == targets[9]:
        y[n] = 9
    elif y[n] == targets[10]:
        y[n] = 10

# set to contiguous flattened array
y = y.ravel()
# set the data type to integer
y = y.astype('int')

NameError: name 'Unemployment' is not defined

In [11]:
# Separate the features from the dataset
X = Unemployment.loc[:,features].values

target_names = targets
# Principal component analysis
pca = PCA(n_components=2)
X_r = pca.fit(X).transform(X)
# Linear Discriminant Analysis
lda = LinearDiscriminantAnalysis(n_components=2)
X_r2 = lda.fit(X, y).transform(X)

# Percentage of variance explained for each components
print('explained variance ratio (first two components): %s'
      % str(pca.explained_variance_ratio_))
#print(pca.explained_variance_ratio_.cumsum())
plt.figure()
colors = ['r', 'g', 'b', 'y','m','crimson','khaki','azure' ,'black','grey','cyan']
lw = 2

for color, i, target_name in zip(colors, [0, 1, 2, 3, 4 ,5,6,7,8,9,10], target_names):
    plt.scatter(X_r[y == i, 0], X_r[y == i, 1], color=color, alpha=.8, lw=lw,
                label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('PCA of Unemployment dataset')

plt.figure()
for color, i, target_name in zip(colors, [0, 1, 2, 3, 4 ,5,6,7,8,9,10], target_names):
    plt.scatter(X_r2[y == i, 0], X_r2[y == i, 1], alpha=.8, color=color,
                label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('LDA of Unemployment dataset')

plt.show()




NameError: name 'Unemployment' is not defined

In [None]:





df = Unemployment


pca = PCA()
components = pca.fit_transform(df[features])
labels = {
    str(i): f"PC {i+1} ({var:.1f}%)"
    for i, var in enumerate(pca.explained_variance_ratio_ * 100)
}

fig = px.scatter_matrix(
    components,
    labels=labels,
    dimensions=range(4),
    color=df["Year"]
)
fig.update_traces(diagonal_visible=False)
fig.show()

In [None]:
X_norm = X / X.std(axis=0) # Normalizing the feature columns is recommended


figure, correlation_matrix = plot_pca_correlation_graph(X_norm, 
                                                        features,
                                                        dimensions=(1, 2),
                                                        figure_axis_size=10)


In [None]:
correlation_matrix

In [None]:
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output




In [None]:

df = Unemployment
app = dash.Dash(__name__)

app.layout = html.Div([
    dcc.Graph(id="graph"),
    html.P("Number of components:"),
    dcc.Slider(
        id='slider',
        min=2, max=5, value=3,
        marks={i: str(i) for i in range(2,6)})
])

In [None]:
# @app.callback(
#     Output("graph", "figure"), 
#     [Input("slider", "value")])
# def run_and_plot(n_components):



#     pca = PCA(n_components=n_components)
    
#     components = pca.fit_transform(df[features])
#     var = pca.explained_variance_ratio_.sum() * 100

#     labels = {str(i): f"PC {i+1}" 
#               for i in range(n_components)}
#     labels['color'] = 'Median Price'
   

#     fig = px.scatter_matrix(
#         components,
#         color=df["Year"],
#         dimensions=range(n_components),
#         labels=labels,
#         title=f'Total Explained Variance: {var:.2f}%')
#     fig.update_traces(diagonal_visible=False)
#     return fig
# app.run_server(debug= False)


In [None]:

Unemployment  = pd.read_csv('./data/SeriesReport-20210301115700_003b1e.csv',skiprows= 10)