In [1]:
# Dependencies
import pandas as pd


In [2]:
# Relative locations of the three CSV inputs used by this notebook
prison_file = "Resources/covid_prison_cases.csv"
states_file = "Resources/us-states.csv"
covid_tracking_file = (
    "Resources/3_covid_tracking_project_historical_testing_numbers_and_covid_deaths_by_state.csv"
)

In [3]:
# Load each CSV into its own DataFrame; all three files are decoded as UTF-8
# and read in the same order as before (prison, states, covid tracking)
prison_df, states_df, covid_tracking_df = (
    pd.read_csv(csv_path, encoding="UTF-8")
    for csv_path in (prison_file, states_file, covid_tracking_file)
)


# prison_df

In [4]:
# Show the column labels of the prison data to see which fields are available
prison_df.columns

Index(['name', 'abbreviation', 'staff_tests', 'staff_tests_with_multiples',
       'total_staff_cases', 'staff_recovered', 'total_staff_deaths',
       'prisoner_tests', 'prisoner_tests_with_multiples',
       'total_prisoner_cases', 'prisoners_recovered', 'total_prisoner_deaths',
       'as_of_date', 'notes'],
      dtype='object')

In [5]:
# Preview the first rows of the prison data as loaded from the CSV
prison_df.head()

Unnamed: 0,name,abbreviation,staff_tests,staff_tests_with_multiples,total_staff_cases,staff_recovered,total_staff_deaths,prisoner_tests,prisoner_tests_with_multiples,total_prisoner_cases,prisoners_recovered,total_prisoner_deaths,as_of_date,notes
0,Alabama,AL,,,,,,,,,,,03/16/2021,We are told vaccinations have begun for staff ...
1,Alaska,AK,,,303.0,,0.0,,27029.0,2382.0,,5.0,03/18/2021,Vaccine numbers as of 3/16.
2,Arizona,AZ,,,2714.0,2676.0,,43632.0,,12176.0,11907.0,51.0,03/16/2021,We have been told vaccinations have begun in t...
3,Arkansas,AR,,,,,4.0,,,11172.0,11122.0,52.0,03/16/2021,
4,California,CA,,,15970.0,15396.0,26.0,119887.0,,49176.0,48299.0,216.0,03/16/2021,Vaccine numbers as of 3/14


In [6]:
# Parse the 'as_of_date' values with pd.to_datetime and store the result
# in a new 'date' column (the original 'as_of_date' column is left in place here)
prison_df = prison_df.assign(date=pd.to_datetime(prison_df['as_of_date']))

In [7]:
# Remove columns not being used for analysis.
# The result is reassigned instead of using inplace=True, and errors="ignore"
# skips labels that are already absent, so re-running this cell no longer
# raises a KeyError after the columns have been dropped once.
prison_df = prison_df.drop(
    columns=[
        'abbreviation',
        'staff_tests',
        'staff_recovered',
        'prisoners_recovered',
        'staff_tests_with_multiples',
        'prisoner_tests',
        'prisoner_tests_with_multiples',
        'notes',
        'as_of_date',
    ],
    errors="ignore",
)


In [8]:
# Print one boolean per row: True marks a row that exactly repeats an earlier row
print(prison_df.duplicated(keep="first"))

0       False
1       False
2       False
3       False
4       False
        ...  
2647    False
2648    False
2649    False
2650    False
2651    False
Length: 2652, dtype: bool


In [9]:
# Keep only the first occurrence of each fully identical row
prison_df = prison_df.drop_duplicates()

In [10]:
# Display the cleaned prison data (long frames are shown truncated in the output)
prison_df

Unnamed: 0,name,total_staff_cases,total_staff_deaths,total_prisoner_cases,total_prisoner_deaths,date
0,Alabama,,,,,2021-03-16
1,Alaska,303.0,0.0,2382.0,5.0,2021-03-18
2,Arizona,2714.0,,12176.0,51.0,2021-03-16
3,Arkansas,,4.0,11172.0,52.0,2021-03-16
4,California,15970.0,26.0,49176.0,216.0,2021-03-16
...,...,...,...,...,...,...
2647,Washington,4.0,0.0,0.0,0.0,2020-03-26
2648,West Virginia,0.0,0.0,0.0,0.0,2020-03-26
2649,Wisconsin,5.0,0.0,0.0,0.0,2020-03-26
2650,Wyoming,0.0,0.0,0.0,0.0,2020-03-26


# states_df

In [11]:
# Show the column labels of the state-level data to see which fields are available
states_df.columns

Index(['date', 'state', 'fips', 'cases', 'deaths'], dtype='object')

In [12]:
# Discard every row that has at least one missing value, then renumber the
# index from 0 so the removed rows leave no gaps.
# NOTE(review): unlike prison_df and covid_tracking_df, no visible cell converts
# states_df['date'] with pd.to_datetime - confirm before joining on 'date'.
states_df = states_df.dropna().reset_index(drop=True)


In [13]:
# Preview the first rows of the state-level data after removing incomplete rows
states_df.head()

Unnamed: 0,date,state,fips,cases,deaths
0,2020-01-21,Washington,53,1,0
1,2020-01-22,Washington,53,1,0
2,2020-01-23,Washington,53,1,0
3,2020-01-24,Illinois,17,1,0
4,2020-01-24,Washington,53,1,0


In [14]:
# Print one boolean per row: True marks a row that exactly repeats an earlier row
print(states_df.duplicated(keep="first"))

0        False
1        False
2        False
3        False
4        False
         ...  
21129    False
21130    False
21131    False
21132    False
21133    False
Length: 21134, dtype: bool


In [15]:
# Keep only the first occurrence of each fully identical row
states_df = states_df.drop_duplicates()

# covid_tracking_df

In [16]:
# Show the column labels of the COVID tracking data to see which fields are available
covid_tracking_df.columns

Index(['state', 'date', 'total_population', 'cumulative_total_test_results',
       'cumulative_total_test_results_per_1000', 'total_test_results_increase',
       'cumulative_positive_tests', 'positive_tests_increase',
       'cumulative_negative_tests', 'cumulative_positivity_rate',
       'cumulative_deaths', 'cumulative_deaths_per_100_000', 'deaths_increase',
       'last_update_et', 'total_test_results_increase_7_day_rolling_avg',
       'positive_tests_increase_7_day_rolling_avg',
       'positivity_rate_7_day_rolling_avg'],
      dtype='object')

In [17]:
# Discard every row that has at least one missing value, then renumber the
# index from 0 so the removed rows leave no gaps.
# NOTE(review): dropna() looks at all columns, including the ones removed in
# the following cell, so a gap in an unused column also removes the row -
# confirm this is intended.
covid_tracking_df = covid_tracking_df.dropna().reset_index(drop=True)

In [18]:
# Remove columns not being used for analysis, then preview the result.
# The result is reassigned instead of using inplace=True, and errors="ignore"
# skips labels that are already absent, so re-running this cell no longer
# raises a KeyError after the columns have been dropped once.
covid_tracking_df = covid_tracking_df.drop(
    columns=[
        'cumulative_total_test_results',
        'cumulative_total_test_results_per_1000',
        'cumulative_positive_tests',
        'cumulative_negative_tests',
        'cumulative_positivity_rate',
        'cumulative_deaths',
        'cumulative_deaths_per_100_000',
        'last_update_et',
        'total_test_results_increase_7_day_rolling_avg',
        'positive_tests_increase_7_day_rolling_avg',
        'positivity_rate_7_day_rolling_avg',
    ],
    errors="ignore",
)
covid_tracking_df.head()

Unnamed: 0,state,date,total_population,total_test_results_increase,positive_tests_increase,deaths_increase
0,Alabama,2020-03-15 00:00:00,4887871,12,6,0
1,Alabama,2020-03-16 00:00:00,4887871,16,16,0
2,Alabama,2020-03-17 00:00:00,4887871,8,8,0
3,Alabama,2020-03-18 00:00:00,4887871,10,10,0
4,Alabama,2020-03-19 00:00:00,4887871,22,22,0


In [19]:
# Convert the 'date' column to pandas datetimes, replacing the column in place
covid_tracking_df = covid_tracking_df.assign(date=pd.to_datetime(covid_tracking_df['date']))

In [20]:
# Preview the first rows again to check the converted 'date' column
covid_tracking_df.head()

Unnamed: 0,state,date,total_population,total_test_results_increase,positive_tests_increase,deaths_increase
0,Alabama,2020-03-15,4887871,12,6,0
1,Alabama,2020-03-16,4887871,16,16,0
2,Alabama,2020-03-17,4887871,8,8,0
3,Alabama,2020-03-18,4887871,10,10,0
4,Alabama,2020-03-19,4887871,22,22,0


In [21]:
# Print one boolean per row: True marks a row that exactly repeats an earlier row
print(covid_tracking_df.duplicated(keep="first"))

0        False
1        False
2        False
3        False
4        False
         ...  
11062    False
11063    False
11064    False
11065    False
11066    False
Length: 11067, dtype: bool


In [22]:
# Keep only the first occurrence of each fully identical row
covid_tracking_df = covid_tracking_df.drop_duplicates()

In [23]:
# Display the cleaned COVID tracking data (long frames are shown truncated in the output)
covid_tracking_df

Unnamed: 0,state,date,total_population,total_test_results_increase,positive_tests_increase,deaths_increase
0,Alabama,2020-03-15,4887871,12,6,0
1,Alabama,2020-03-16,4887871,16,16,0
2,Alabama,2020-03-17,4887871,8,8,0
3,Alabama,2020-03-18,4887871,10,10,0
4,Alabama,2020-03-19,4887871,22,22,0
...,...,...,...,...,...,...
11062,Wyoming,2021-03-03,577737,2096,89,0
11063,Wyoming,2021-03-04,577737,2703,69,0
11064,Wyoming,2021-03-05,577737,3114,79,0
11065,Wyoming,2021-03-06,577737,0,0,0
