# **Data Cleaning - Unemployment.xlsx - UnemploymentMedianIncome.csv**

# **Import Modules**

In [39]:
#### Import the libraries needed
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from pathlib import Path
import os
import glob

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
# Ignore statsmodels ConvergenceWarning specifically.
warnings.simplefilter('ignore', ConvergenceWarning)
# Blanket filter: ignores every warning category, so it also covers the
# ConvergenceWarning filter registered on the previous line.
warnings.filterwarnings('ignore')
# NOTE(review): two matplotlib backends are requested back to back; presumably
# only one of them ends up active — confirm which is intended and drop the other.
%matplotlib notebook
%matplotlib inline

# **Set Environment**

In [40]:
# plotly.io is used here only to choose the default figure renderer.
import plotly.io as pio
# NOTE(review): "vscode" presumably assumes the notebook is viewed in VS Code —
# confirm before running it in another front end.
pio.renderers.default = "vscode"

In [41]:
# Resolve the data folders relative to the directory the notebook is run from
working_directory = Path.cwd()
# raw inputs, intermediate (cleaned) outputs, and final outputs
raw_data_directory = working_directory.joinpath('data', 'raw')
processed_data_directory = working_directory.joinpath('data', 'processed')
final_data_directory = working_directory.joinpath('data', 'final')

In [42]:
# Display options: no row/column truncation (output scrolls instead),
# and cell text capped at 100 characters
for option_name, option_value in {
    'display.max_rows': None,
    'display.max_columns': None,
    'display.max_colwidth': 100,
}.items():
    pd.set_option(option_name, option_value)

In [43]:
# Start from the "fivethirtyeight" style sheet, then apply the overrides below
plt.style.use('fivethirtyeight')

plt.rcParams.update({
    # default figure size
    "figure.figsize": [10, 5],
    # suptitle
    "figure.titlesize": 22,
    "figure.titleweight": "bold",
    'text.color': '#333333',
    # axes title
    "axes.titlesize": 16,
    "axes.titleweight": "bold",
    "axes.titlelocation": "left",
    'axes.titlecolor': '#333333',
    # axis labels
    "axes.labelsize": 12,
    "axes.labelweight": "bold",
    'axes.labelcolor': '#333333',
    # spines: keep bottom/left, hide top/right
    "axes.spines.bottom": True,
    "axes.spines.left": True,
    "axes.spines.top": False,
    "axes.spines.right": False,
    # ticks
    'xtick.color': "#333333",
    'ytick.color': "#333333",
    # line width
    'lines.linewidth': 1,
})

# **Load Unemployment.xlsx - UnemploymentMedianIncome.csv**
* 3277 rows and 100 columns
* Aggregated by the country(1), state(52), and county level
* Remove ',' and convert from string to float data type
* Dataset includes Puerto Rico --> delete
* Create a variable 'is_state'
* Remove "County" and the state abbreviation Ex. Autauga County, AL --> Autauga
* Change the column name from "Area_Name" to "county"
* 149 rows contain at least one NaN
* Delete unemployment-related data before 2014
* Data Dictionary: UrbanInfluenceCodes2013.xls, Ruralurbancontinuumcodes2023

In [93]:
# Load the raw unemployment / median-income CSV from the raw data directory
file = os.path.join(raw_data_directory, 'Unemployment.xlsx - UnemploymentMedianIncome.csv')
# skiprows=4: the first four lines of the file are skipped before the header is read
# (presumably a descriptive preamble in the export — confirm against the raw file)
unemployment_df = pd.read_csv(file, skiprows=4)

In [94]:
# Dimensions of the raw frame as (rows, columns)
unemployment_df.shape

(3277, 100)

In [95]:
# First five rows of the raw frame
unemployment_df.head()

Unnamed: 0,FIPS_Code,State,Area_Name,Rural_Urban_Continuum_Code_2013,Urban_Influence_Code_2013,Metro_2013,Civilian_labor_force_2000,Employed_2000,Unemployed_2000,Unemployment_rate_2000,Civilian_labor_force_2001,Employed_2001,Unemployed_2001,Unemployment_rate_2001,Civilian_labor_force_2002,Employed_2002,Unemployed_2002,Unemployment_rate_2002,Civilian_labor_force_2003,Employed_2003,Unemployed_2003,Unemployment_rate_2003,Civilian_labor_force_2004,Employed_2004,Unemployed_2004,Unemployment_rate_2004,Civilian_labor_force_2005,Employed_2005,Unemployed_2005,Unemployment_rate_2005,Civilian_labor_force_2006,Employed_2006,Unemployed_2006,Unemployment_rate_2006,Civilian_labor_force_2007,Employed_2007,Unemployed_2007,Unemployment_rate_2007,Civilian_labor_force_2008,Employed_2008,Unemployed_2008,Unemployment_rate_2008,Civilian_labor_force_2009,Employed_2009,Unemployed_2009,Unemployment_rate_2009,Civilian_labor_force_2010,Employed_2010,Unemployed_2010,Unemployment_rate_2010,Civilian_labor_force_2011,Employed_2011,Unemployed_2011,Unemployment_rate_2011,Civilian_labor_force_2012,Employed_2012,Unemployed_2012,Unemployment_rate_2012,Civilian_labor_force_2013,Employed_2013,Unemployed_2013,Unemployment_rate_2013,Civilian_labor_force_2014,Employed_2014,Unemployed_2014,Unemployment_rate_2014,Civilian_labor_force_2015,Employed_2015,Unemployed_2015,Unemployment_rate_2015,Civilian_labor_force_2016,Employed_2016,Unemployed_2016,Unemployment_rate_2016,Civilian_labor_force_2017,Employed_2017,Unemployed_2017,Unemployment_rate_2017,Civilian_labor_force_2018,Employed_2018,Unemployed_2018,Unemployment_rate_2018,Civilian_labor_force_2019,Employed_2019,Unemployed_2019,Unemployment_rate_2019,Civilian_labor_force_2020,Employed_2020,Unemployed_2020,Unemployment_rate_2020,Civilian_labor_force_2021,Employed_2021,Unemployed_2021,Unemployment_rate_2021,Civilian_labor_force_2022,Employed_2022,Unemployed_2022,Unemployment_rate_2022,Median_Household_Income_2021,Med_HH_Income_Percent_of_State_Total_2021
0,0,US,United States,,,,142601576,136904853,5696723,4.0,143786537,136977996,6808541,4.7,144839298,136455783,8383515,5.8,145660094,136944522,8715572,6.0,146724795,138613904,8110891,5.5,148597241,141000912,7596329,5.1,150707773,143729350,6978423,4.6,152191050,145156133,7034917,4.6,153761037,144860349,8900688,5.8,153825454,139594699,14230755,9.3,154254521,139393814,14860707,9.6,154520678,140688861,13831817,9.0,155038121,142527201,12510920,8.1,155362278,143905037,11457241,7.4,155936159,146318952,9617207,6.2,156840649,148554918,8285731,5.3,158674951,150949349,7725602,4.9,160744592,153744181,7000411,4.4,162039448,155727509,6311939,3.9,163815888,157805898,6009990,3.7,161483724,148453335,13030389,8.1,162229903,153544980,8684923,5.4,164781642,158766998,6014644,3.7,69717,
1,1000,AL,Alabama,,,,2147173,2047731,99442,4.6,2128027,2017467,110560,5.2,2112621,1987633,124988,5.9,2128668,2001568,127100,6.0,2138306,2018783,119523,5.6,2140356,2045234,95122,4.4,2170007,2083207,86800,4.0,2180448,2092030,88418,4.1,2176854,2049579,127275,5.8,2156593,1938784,217809,10.1,2197028,1968824,228204,10.4,2202337,1991379,210958,9.6,2178508,2000848,177660,8.2,2172102,2012828,159274,7.3,2164715,2018705,146010,6.7,2152295,2020443,131852,6.1,2155729,2029157,126572,5.9,2203458,2103873,99585,4.5,2240109,2152270,87839,3.9,2272935,2200437,72498,3.2,2269672,2124409,145263,6.4,2259349,2183330,76019,3.4,2286028,2226670,59358,2.6,53990,100.0
2,1001,AL,"Autauga County, AL",2.0,2.0,1.0,21861,20971,890,4.1,22081,21166,915,4.1,22161,21096,1065,4.8,22695,21557,1138,5.0,23241,22146,1095,4.7,23887,22986,901,3.8,24425,23619,806,3.3,24434,23610,824,3.4,24687,23376,1311,5.3,24660,22464,2196,8.9,25749,23481,2268,8.8,25845,23688,2157,8.3,25762,23932,1830,7.1,25783,24155,1628,6.3,25639,24150,1489,5.8,25541,24206,1335,5.2,25710,24395,1315,5.1,26269,25224,1045,4.0,26471,25515,956,3.6,26696,25927,769,2.9,26425,25023,1402,5.3,26545,25809,736,2.8,26789,26181,608,2.3,66444,123.1
3,1003,AL,"Baldwin County, AL",3.0,2.0,1.0,69979,67370,2609,3.7,69569,66545,3024,4.3,69379,65881,3498,5.0,72598,69010,3588,4.9,74843,71061,3782,5.1,76608,73581,3027,4.0,79806,77263,2543,3.2,82829,80213,2616,3.2,83205,79222,3983,4.8,82314,74950,7364,8.9,83551,75283,8268,9.9,85077,77459,7618,9.0,84507,77973,6534,7.7,85206,79466,5740,6.7,86546,81265,5281,6.1,87493,82619,4874,5.6,89778,84972,4806,5.4,93152,89286,3866,4.2,96235,92734,3501,3.6,98965,96069,2896,2.9,98979,92893,6086,6.1,99953,97034,2919,2.9,102849,100432,2417,2.4,65658,121.6
4,1005,AL,"Barbour County, AL",6.0,6.0,0.0,11449,10812,637,5.6,11324,10468,856,7.6,11006,10154,852,7.7,11019,10241,778,7.1,10639,9884,755,7.1,10730,10114,616,5.7,10713,10110,603,5.6,10363,9698,665,6.4,10175,9249,926,9.1,9944,8635,1309,13.2,10219,8978,1241,12.1,9843,8716,1127,11.4,9377,8273,1104,11.8,9096,8152,944,10.4,8859,7930,929,10.5,8590,7823,767,8.9,8334,7638,696,8.4,8415,7914,501,6.0,8505,8068,437,5.1,8641,8294,347,4.0,8684,8017,667,7.7,8280,7821,459,5.5,8241,7906,335,4.1,38649,71.6


In [96]:
# Last five rows of the raw frame
unemployment_df.tail()

Unnamed: 0,FIPS_Code,State,Area_Name,Rural_Urban_Continuum_Code_2013,Urban_Influence_Code_2013,Metro_2013,Civilian_labor_force_2000,Employed_2000,Unemployed_2000,Unemployment_rate_2000,Civilian_labor_force_2001,Employed_2001,Unemployed_2001,Unemployment_rate_2001,Civilian_labor_force_2002,Employed_2002,Unemployed_2002,Unemployment_rate_2002,Civilian_labor_force_2003,Employed_2003,Unemployed_2003,Unemployment_rate_2003,Civilian_labor_force_2004,Employed_2004,Unemployed_2004,Unemployment_rate_2004,Civilian_labor_force_2005,Employed_2005,Unemployed_2005,Unemployment_rate_2005,Civilian_labor_force_2006,Employed_2006,Unemployed_2006,Unemployment_rate_2006,Civilian_labor_force_2007,Employed_2007,Unemployed_2007,Unemployment_rate_2007,Civilian_labor_force_2008,Employed_2008,Unemployed_2008,Unemployment_rate_2008,Civilian_labor_force_2009,Employed_2009,Unemployed_2009,Unemployment_rate_2009,Civilian_labor_force_2010,Employed_2010,Unemployed_2010,Unemployment_rate_2010,Civilian_labor_force_2011,Employed_2011,Unemployed_2011,Unemployment_rate_2011,Civilian_labor_force_2012,Employed_2012,Unemployed_2012,Unemployment_rate_2012,Civilian_labor_force_2013,Employed_2013,Unemployed_2013,Unemployment_rate_2013,Civilian_labor_force_2014,Employed_2014,Unemployed_2014,Unemployment_rate_2014,Civilian_labor_force_2015,Employed_2015,Unemployed_2015,Unemployment_rate_2015,Civilian_labor_force_2016,Employed_2016,Unemployed_2016,Unemployment_rate_2016,Civilian_labor_force_2017,Employed_2017,Unemployed_2017,Unemployment_rate_2017,Civilian_labor_force_2018,Employed_2018,Unemployed_2018,Unemployment_rate_2018,Civilian_labor_force_2019,Employed_2019,Unemployed_2019,Unemployment_rate_2019,Civilian_labor_force_2020,Employed_2020,Unemployed_2020,Unemployment_rate_2020,Civilian_labor_force_2021,Employed_2021,Unemployed_2021,Unemployment_rate_2021,Civilian_labor_force_2022,Employed_2022,Unemployed_2022,Unemployment_rate_2022,Median_Household_Income_2021,Med_HH_Income_Percent_of_State_Total_2021
3272,72145,PR,"Vega Baja Municipio, PR",1.0,1.0,1.0,19805,17607,2198,11.1,19554,17138,2416,12.4,20219,17614,2605,12.9,20789,18031,2758,13.3,20607,18218,2389,11.6,20930,18296,2634,12.6,21654,19071,2583,11.9,21115,18445,2670,12.6,20372,17693,2679,13.2,19955,16433,3522,17.6,16228,12856,3372,20.8,15293,12430,2863,18.7,14796,12435,2361,16.0,14601,12198,2403,16.5,14157,11869,2288,16.2,13820,11921,1899,13.7,13733,11829,1904,13.9,13414,11746,1668,12.4,13159,11787,1372,10.4,13174,11912,1262,9.6,,,,,14172,12748,1424,10.0,14232,13180,1052,7.4,,
3273,72147,PR,"Vieques Municipio, PR",7.0,12.0,0.0,2441,2120,321,13.2,2323,1999,324,13.9,2625,2159,466,17.8,2664,2234,430,16.1,2554,2237,317,12.4,3062,2725,337,11.0,3300,2992,308,9.3,3317,2955,362,10.9,3214,2811,403,12.5,3124,2425,699,22.4,3401,2767,634,18.6,3174,2671,503,15.8,3043,2651,392,12.9,3080,2589,491,15.9,3152,2710,442,14.0,3194,2832,362,11.3,3260,2914,346,10.6,3027,2589,438,14.5,2726,2276,450,16.5,2551,2368,183,7.2,,,,,2804,2469,335,11.9,2742,2554,188,6.9,,
3274,72149,PR,"Villalba Municipio, PR",2.0,2.0,1.0,7636,6648,988,12.9,7564,6546,1018,13.5,8100,6820,1280,15.8,8393,7134,1259,15.0,8622,7522,1100,12.8,9506,8112,1394,14.7,9681,8417,1264,13.1,9479,8223,1256,13.3,9324,8036,1288,13.8,9518,7628,1890,19.9,8841,6892,1949,22.0,8485,6681,1804,21.3,8308,6584,1724,20.8,8184,6416,1768,21.6,7920,6196,1724,21.8,7802,6262,1540,19.7,7815,6235,1580,20.2,7557,6090,1467,19.4,7335,6174,1161,15.8,7494,6309,1185,15.8,,,,,7697,6796,901,11.7,7790,6982,808,10.4,,
3275,72151,PR,"Yabucoa Municipio, PR",1.0,1.0,1.0,10613,9005,1608,15.2,10751,8746,2005,18.6,11187,8961,2226,19.9,11393,9162,2231,19.6,11170,9244,1926,17.2,11247,9272,1975,17.6,11562,9647,1915,16.6,11425,9348,2077,18.2,10904,8931,1973,18.1,10816,8302,2514,23.2,11003,8210,2793,25.4,10630,7924,2706,25.5,10302,7903,2399,23.3,10169,7742,2427,23.9,9642,7546,2096,21.7,9198,7593,1605,17.4,9081,7545,1536,16.9,9000,7512,1488,16.5,8791,7534,1257,14.3,8841,7686,1155,13.1,,,,,9156,8216,940,10.3,9278,8504,774,8.3,,
3276,72153,PR,"Yauco Municipio, PR",2.0,2.0,1.0,14664,12871,1793,12.2,14416,12475,1941,13.5,15116,13006,2110,14.0,15161,12974,2187,14.4,15446,13493,1953,12.6,16695,14380,2315,13.9,16747,14658,2089,12.5,16635,14451,2184,13.1,16555,14394,2161,13.1,15826,13318,2508,15.8,12098,9514,2584,21.4,11760,9263,2497,21.2,11531,9140,2391,20.7,11350,8959,2391,21.1,11031,8665,2366,21.4,10748,8717,2031,18.9,10735,8719,2016,18.8,10160,8409,1751,17.2,9748,8315,1433,14.7,9851,8409,1442,14.6,,,,,10205,9009,1196,11.7,10353,9282,1071,10.3,,


In [97]:
# Per-column non-null counts and dtypes
unemployment_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3277 entries, 0 to 3276
Data columns (total 100 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   FIPS_Code                                  3277 non-null   int64  
 1   State                                      3277 non-null   object 
 2   Area_Name                                  3277 non-null   object 
 3   Rural_Urban_Continuum_Code_2013            3219 non-null   float64
 4   Urban_Influence_Code_2013                  3219 non-null   float64
 5   Metro_2013                                 3224 non-null   float64
 6   Civilian_labor_force_2000                  3270 non-null   object 
 7   Employed_2000                              3270 non-null   object 
 8   Unemployed_2000                            3270 non-null   object 
 9   Unemployment_rate_2000                     3270 non-null   float64
 10  Civilian_labor_force_20

In [98]:
# Number of rows that contain at least one NaN
unemployment_df.isna().any(axis=1).sum()

149

In [99]:
# Number of NaN values in each column
unemployment_df.isna().sum()

FIPS_Code                                     0
State                                         0
Area_Name                                     0
Rural_Urban_Continuum_Code_2013              58
Urban_Influence_Code_2013                    58
Metro_2013                                   53
Civilian_labor_force_2000                     7
Employed_2000                                 7
Unemployed_2000                               7
Unemployment_rate_2000                        7
Civilian_labor_force_2001                     7
Employed_2001                                 7
Unemployed_2001                               7
Unemployment_rate_2001                        7
Civilian_labor_force_2002                     7
Employed_2002                                 7
Unemployed_2002                               7
Unemployment_rate_2002                        7
Civilian_labor_force_2003                     7
Employed_2003                                 7
Unemployed_2003                         

In [100]:
# Summary statistics of the numeric columns, transposed so each column is a row
unemployment_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
FIPS_Code,3277.0,31318.964297,16332.539506,0.0,19019.0,30017.0,46099.0,72153.0
Rural_Urban_Continuum_Code_2013,3219.0,4.93849,2.724553,1.0,2.0,6.0,7.0,9.0
Urban_Influence_Code_2013,3219.0,5.189811,3.506942,1.0,2.0,5.0,8.0,12.0
Metro_2013,3224.0,0.382754,0.486135,0.0,0.0,0.0,1.0,1.0
Unemployment_rate_2000,3270.0,4.540581,2.042105,1.3,3.2,4.1,5.3,17.3
Unemployment_rate_2001,3270.0,5.233119,2.228461,1.6,3.8,4.8,6.1,18.6
Unemployment_rate_2002,3270.0,5.981193,2.34497,1.6,4.5,5.6,6.9,19.9
Unemployment_rate_2003,3270.0,6.226391,2.329511,1.9,4.7,5.8,7.3,20.2
Unemployment_rate_2004,3270.0,5.879235,2.114957,1.6,4.5,5.5,6.7,20.2
Unemployment_rate_2005,3263.0,5.671713,2.224239,2.0,4.2,5.2,6.5,21.0


In [101]:
# Summary (count / unique / top / freq) of the object-dtype columns, transposed
unemployment_df.describe(include='object').T

Unnamed: 0,count,unique,top,freq
State,3277,53,TX,255
Area_Name,3277,3276,District of Columbia,2
Civilian_labor_force_2000,3270,3136,2986,3
Employed_2000,3270,3128,3902,3
Unemployed_2000,3270,1749,57,10
Civilian_labor_force_2001,3270,3127,16954,3
Employed_2001,3270,3118,2447,3
Unemployed_2001,3270,1891,196,11
Civilian_labor_force_2002,3270,3141,5677,4
Employed_2002,3270,3123,4353,3


## **Data Cleanup and Preparation**

* Delete row whose 'State' is US --> delete 1 row
* Delete rows whose 'State' is Puerto Rico (PR) --> delete 79 rows

In [102]:
# Drop the national aggregate (US, 1 row) and Puerto Rico (PR, 79 rows),
# then renumber the index from 0
excluded_states = ["PR", "US"]
unemployment_df = unemployment_df[~unemployment_df['State'].isin(excluded_states)].reset_index(drop=True)

* Delete unemployment-related columns before 2014

In [103]:
# Year suffixes 2000-2013 as strings
years_to_delete = list(map(str, range(2000, 2014)))
# Only columns from position 6 onward are considered, so the first six columns
# (including the *_2013 classification codes) are never selected for deletion
cols_to_delete = [
    column_name
    for column_name in unemployment_df.columns[6:]
    if column_name[-4:] in years_to_delete
]

In [104]:
# Drop the 56 unemployment-related columns for 2013 and earlier
unemployment_df = unemployment_df.drop(columns=cols_to_delete)

In [105]:
# Check the shape of the frame after dropping rows and columns
unemployment_df.shape

(3197, 44)

* Remove ',' and convert from string to float data type

In [106]:
# Object-dtype columns to be converted to float; the first two object columns
# ('State' and 'Area_Name') are skipped
object_columns = unemployment_df.select_dtypes(include=['object']).columns[2:]

In [107]:
# Strip the "," thousands separators from each string column, then cast to float
unemployment_df[object_columns] = (
    unemployment_df[object_columns]
    .apply(lambda column: column.str.replace(",", ""))
    .astype(float)
)

* Create a variable 'is_state' --> If 'Rural_Urban_Continuum_Code_2013', 'Urban_Influence_Code_2013', 'Metro_2013' are all null, it's a state
* Create 'county' by removing " County" and " Parish", and the state abbreviation from "Area_Name": Ex. Autauga County, AL --> Autauga
* "Area_Name" in Alaska contains many area names ('Census Area', 'Borough', Borough/municipality, 'city/Census Area', 'city/Borough') --> Keep as is
* Delete 'Area_Name'

In [108]:
# Strip the trailing ", <State>" part from each Area_Name
# (rows without that suffix are kept unchanged)
county = [
    area_name.split(f", {state_code}")[0]
    for state_code, area_name in zip(unemployment_df['State'], unemployment_df['Area_Name'])
]

In [109]:
# State flag: a row is a state when all three 2013 classification codes are null
cols = ['Rural_Urban_Continuum_Code_2013', 'Urban_Influence_Code_2013', 'Metro_2013']
is_state = unemployment_df[cols].isna().all(axis=1)
unemployment_df['is_state'] = is_state

# County name: the stripped area name without " County" / " Parish"
unemployment_df['county'] = (
    pd.Series(county, index=unemployment_df.index)
    .str.replace(" County", "")
    .str.replace(" Parish", "")
)

# 'Area_Name' is superseded by 'county'
unemployment_df = unemployment_df.drop(columns=['Area_Name'])

* Change the column order to start with 'FIPS_Code', 'State', 'county', and 'is_state'

In [110]:
# Change column order: identifiers first, then every remaining column.
# The slice skips the two leading columns ('FIPS_Code', 'State') and the two
# trailing ones ('is_state', 'county'), i.e. it runs from
# 'Rural_Urban_Continuum_Code_2013' to 'Med_HH_Income_Percent_of_State_Total_2021'
cols2 = unemployment_df.columns[2:-2]
cols1 = ['FIPS_Code', 'State', 'county', 'is_state', *cols2]
unemployment_df = unemployment_df[cols1]

## **Missing Value Analysis**

* 51 state rows
* State rows are all null in 'Rural_Urban_Continuum_Code_2013', 'Urban_Influence_Code_2013', 'Metro_2013'

In [111]:
# Report which columns are null on the state-level rows
state_unemployment_df = unemployment_df.loc[unemployment_df['is_state']]
num_state_rows = state_unemployment_df.shape[0]
has_null_by_column = state_unemployment_df.isna().any()
cols_w_missing_values = state_unemployment_df.columns[has_null_by_column].values
print(f"THe number of state rows: {num_state_rows}")
print(f"Columns have null: {cols_w_missing_values}", end="\n\n")
print("These columns have all null values:")
print(state_unemployment_df[cols_w_missing_values].isna().all(), end="\n\n")
# Verify the remaining columns are complete; the figure printed below is the
# number of those columns that contain at least one null
cols_wo_missing_values = list(set(state_unemployment_df.columns) - set(cols_w_missing_values))
num_missing_values = state_unemployment_df[cols_wo_missing_values].isna().any().sum()
print(f"Number of null in other columns: {num_missing_values}")

THe number of state rows: 51
Columns have null: ['Rural_Urban_Continuum_Code_2013' 'Urban_Influence_Code_2013'
 'Metro_2013']

These columns have all null values:
Rural_Urban_Continuum_Code_2013    True
Urban_Influence_Code_2013          True
Metro_2013                         True
dtype: bool

Number of null in other columns: 0


* 'Rural_Urban_Continuum_Code_2013', 'Urban_Influence_Code_2013', 'Metro_2013' --> -1.0

In [112]:
# State-level rows: set the three 2013 classification columns to -1.0
# on every state row that still contains a NaN
cols_rural_urban_metro = ['Rural_Urban_Continuum_Code_2013', 'Urban_Influence_Code_2013', 'Metro_2013']
idx = unemployment_df['is_state'] & unemployment_df.isna().any(axis=1)
unemployment_df.loc[idx, cols_rural_urban_metro] = -1.0

In [113]:
# Total number of NaN cells remaining on the state-level rows
unemployment_df[unemployment_df['is_state']].isna().sum().sum()

0

In [114]:
# County-level rows (not flagged as state) that still contain at least one NaN
county_rows_with_nan = ~unemployment_df['is_state'] & unemployment_df.isna().any(axis=1)
num_missing_rows = int(county_rows_with_nan.sum())
print(f"Number of missing rows: {num_missing_rows}")

unemployment_df[county_rows_with_nan]

Number of missing rows: 6


Unnamed: 0,FIPS_Code,State,county,is_state,Rural_Urban_Continuum_Code_2013,Urban_Influence_Code_2013,Metro_2013,Civilian_labor_force_2014,Employed_2014,Unemployed_2014,Unemployment_rate_2014,Civilian_labor_force_2015,Employed_2015,Unemployed_2015,Unemployment_rate_2015,Civilian_labor_force_2016,Employed_2016,Unemployed_2016,Unemployment_rate_2016,Civilian_labor_force_2017,Employed_2017,Unemployed_2017,Unemployment_rate_2017,Civilian_labor_force_2018,Employed_2018,Unemployed_2018,Unemployment_rate_2018,Civilian_labor_force_2019,Employed_2019,Unemployed_2019,Unemployment_rate_2019,Civilian_labor_force_2020,Employed_2020,Unemployed_2020,Unemployment_rate_2020,Civilian_labor_force_2021,Employed_2021,Unemployed_2021,Unemployment_rate_2021,Civilian_labor_force_2022,Employed_2022,Unemployed_2022,Unemployment_rate_2022,Median_Household_Income_2021,Med_HH_Income_Percent_of_State_Total_2021
74,2063,AK,Chugach Census Area,False,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,3256.0,2973.0,283.0,8.7,3179.0,3100.0,79.0,2.5,3387.0,3290.0,97.0,2.9,88029.0,112.2
75,2066,AK,Copper River Census Area,False,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,1293.0,1154.0,139.0,10.8,1246.0,1205.0,41.0,3.3,1405.0,1275.0,130.0,9.3,61288.0,78.1
93,2201,AK,Prince of Wales-Outer Ketchikan Census Area,False,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
96,2232,AK,Skagway-Hoonah-Angoon Census Area,False,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
98,2261,AK,Valdez-Cordova Census Area,False,9.0,11.0,0.0,4865.0,4404.0,461.0,9.5,4864.0,4456.0,408.0,8.4,4802.0,4406.0,396.0,8.2,4870.0,4518.0,352.0,7.2,4863.0,4533.0,330.0,6.8,4871.0,4533.0,338.0,6.9,,,,,,,,,,,,,,
100,2280,AK,Wrangell-Petersburg Census Area,False,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [115]:
# The two 2013 classification codes that are missing on some Alaska rows
cols_rural_urban = ['Rural_Urban_Continuum_Code_2013', 'Urban_Influence_Code_2013']
# Positional slice: relies on the column order set earlier, where the first
# seven columns are FIPS_Code, State, county, is_state and the three 2013 codes
cols_unemployment = unemployment_df.columns[7:] # columns after 'Metro_2013'

* 'Rural_Urban_Continuum_Code_2013', 'Urban_Influence_Code_2013' --> -1.0

In [116]:
# Alaska county-level rows with a missing rural/urban code: set both codes to -1.0
idx = (
    unemployment_df['State'].eq('AK')
    & ~unemployment_df['is_state']
    & unemployment_df[cols_rural_urban].isna().any(axis=1)
)
unemployment_df.loc[idx, cols_rural_urban] = -1.0

* Unemployment-related features will be imputed with 0

In [117]:
# Impute the missing unemployment / income values of Alaska county-level rows with 0.
# Only the NaN cells are filled: assigning 0 to the whole row would also wipe the
# values that were actually observed on partially-missing rows (e.g. the 2020-2022
# figures of Chugach Census Area or the 2014-2019 figures of Valdez-Cordova).
idx = (unemployment_df['State'] == 'AK') & ~(unemployment_df['is_state']) & (unemployment_df[cols_unemployment].isna().sum(axis=1)>0)
unemployment_df.loc[idx, cols_unemployment] = unemployment_df.loc[idx, cols_unemployment].fillna(0)

In [121]:
# Total number of NaN cells left in the whole frame
unemployment_df.isna().sum().sum()

0

## **Save the Dataframe as CSV**

In [91]:
# Write the cleaned frame to data/processed without the index column.
# The folder is created first so the save does not fail when it does not exist yet.
processed_data_directory.mkdir(parents=True, exist_ok=True)
file = os.path.join(processed_data_directory, 'cleaned_UnemploymentMedianIncome.csv')
unemployment_df.to_csv(file, index=False)

## **Verify cleaned_UnemploymentMedianIncome.csv**

In [92]:
# Read the saved CSV back and preview it.
# Note: this rebinds `unemployment_df` to the frame loaded from disk.
file = os.path.join(processed_data_directory, 'cleaned_UnemploymentMedianIncome.csv')
unemployment_df = pd.read_csv(file)
unemployment_df.head()

Unnamed: 0,FIPS_Code,State,county,is_state,Rural_Urban_Continuum_Code_2013,Urban_Influence_Code_2013,Metro_2013,Civilian_labor_force_2014,Employed_2014,Unemployed_2014,Unemployment_rate_2014,Civilian_labor_force_2015,Employed_2015,Unemployed_2015,Unemployment_rate_2015,Civilian_labor_force_2016,Employed_2016,Unemployed_2016,Unemployment_rate_2016,Civilian_labor_force_2017,Employed_2017,Unemployed_2017,Unemployment_rate_2017,Civilian_labor_force_2018,Employed_2018,Unemployed_2018,Unemployment_rate_2018,Civilian_labor_force_2019,Employed_2019,Unemployed_2019,Unemployment_rate_2019,Civilian_labor_force_2020,Employed_2020,Unemployed_2020,Unemployment_rate_2020,Civilian_labor_force_2021,Employed_2021,Unemployed_2021,Unemployment_rate_2021,Civilian_labor_force_2022,Employed_2022,Unemployed_2022,Unemployment_rate_2022,Median_Household_Income_2021,Med_HH_Income_Percent_of_State_Total_2021
0,1000,AL,Alabama,True,-1.0,-1.0,-1.0,2164715.0,2018705.0,146010.0,6.7,2152295.0,2020443.0,131852.0,6.1,2155729.0,2029157.0,126572.0,5.9,2203458.0,2103873.0,99585.0,4.5,2240109.0,2152270.0,87839.0,3.9,2272935.0,2200437.0,72498.0,3.2,2269672.0,2124409.0,145263.0,6.4,2259349.0,2183330.0,76019.0,3.4,2286028.0,2226670.0,59358.0,2.6,53990.0,100.0
1,1001,AL,Autauga,False,2.0,2.0,1.0,25639.0,24150.0,1489.0,5.8,25541.0,24206.0,1335.0,5.2,25710.0,24395.0,1315.0,5.1,26269.0,25224.0,1045.0,4.0,26471.0,25515.0,956.0,3.6,26696.0,25927.0,769.0,2.9,26425.0,25023.0,1402.0,5.3,26545.0,25809.0,736.0,2.8,26789.0,26181.0,608.0,2.3,66444.0,123.1
2,1003,AL,Baldwin,False,3.0,2.0,1.0,86546.0,81265.0,5281.0,6.1,87493.0,82619.0,4874.0,5.6,89778.0,84972.0,4806.0,5.4,93152.0,89286.0,3866.0,4.2,96235.0,92734.0,3501.0,3.6,98965.0,96069.0,2896.0,2.9,98979.0,92893.0,6086.0,6.1,99953.0,97034.0,2919.0,2.9,102849.0,100432.0,2417.0,2.4,65658.0,121.6
3,1005,AL,Barbour,False,6.0,6.0,0.0,8859.0,7930.0,929.0,10.5,8590.0,7823.0,767.0,8.9,8334.0,7638.0,696.0,8.4,8415.0,7914.0,501.0,6.0,8505.0,8068.0,437.0,5.1,8641.0,8294.0,347.0,4.0,8684.0,8017.0,667.0,7.7,8280.0,7821.0,459.0,5.5,8241.0,7906.0,335.0,4.1,38649.0,71.6
4,1007,AL,Bibb,False,1.0,1.0,1.0,8573.0,7959.0,614.0,7.2,8552.0,7983.0,569.0,6.7,8539.0,7986.0,553.0,6.5,8648.0,8261.0,387.0,4.5,8697.0,8351.0,346.0,4.0,8777.0,8491.0,286.0,3.3,8717.0,8085.0,632.0,7.3,8641.0,8347.0,294.0,3.4,8726.0,8507.0,219.0,2.5,48454.0,89.7
