# Add regions to the congress dataframe

In [21]:
"""
Author: Yixian Zhou, ...
Date: Dec. 6
Description: 
Add regions to the DataFrame of Congress_YCOM_2019_Data.csv
Also some basic exploration
References: # source: https://www.nationalgeographic.org/maps/united-states-regions/
"""

'\nAuthor: Yixian Zhou, ...\nDate: Dec. 6\nDescription: \nAdd regions to the DataFrame of Congress_YCOM_2019_Data.csv\nAlso some basic exploration\n'

## Set up

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import csv
pd.set_option('display.max_columns', None)

In [2]:
is_modify = False

In [3]:
def print_with_format(l, num_ea_line = 5):
    l = list(l)
    for i in range(len(l) // num_ea_line):
        start = i * num_ea_line
        print(", ".join(l[start:start + num_ea_line]))
    if len(l) % num_ea_line != 0: ## there are some left
        print(", ".join(l[i * num_ea_line:]))      

In [4]:
## including DC
with open("state_abbr2full.json", "r") as json_file:
    abbr2full = json.load(json_file) 

with open("state_full2abbr.json", "r") as json_file:
    full2abbr = json.load(json_file)

with open("region_states.json", "r") as file:
    region_state = json.load(file)

In [7]:
if is_modify:
    ## modify the original dictionary because of debugging or change in reference
    state2region = {}
    for region, states in region_state.items():
        for this_state in states:
            state2region[this_state] = region


    with open("state2region.json", "w") as file:
        json.dump(state2region, file, indent=2)
else:
    ## we usually run this block
    with open("state2region.json", "r") as file:
        state2region = json.load(file)
    
    

In [5]:
congress_df = pd.read_csv("./dataset/Congress_YCOM_2019_Data.csv")
climate_2016 = pd.read_csv("./dataset/climate_dataset_cleaned_v1.0.csv")
# climate_2019 = pd.read_excel("./dataset/YCOM_2019_Data.xlsx")

In [6]:
event_df = pd.read_csv("./dataset/districteventList.csv")

In [12]:
event_df_clean = event_df[event_df['state'].isin(list(state2region.keys()))].copy()

In [8]:
ycom_data = []
with open("./dataset/YCOM_2019_Data.csv", "r") as csv_file:
    reader = csv.reader(csv_file)
    cols = reader.__next__()
    i = 0
    for row in reader:
        if i == 1845:
            print(row)
#         ycom_data.append(row)
        i += 1

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf1 in position 3612: invalid continuation byte

In [8]:
## get the columns they have in common as well as those uniuqe ones
common_cols = []
cong_only_cols = []
clim_only_cols = list(climate_2016.columns)
for col in congress_df.columns:
    if col in climate_2016.columns:
        common_cols.append(col)
        clim_only_cols.remove(col)
    else:
        cong_only_cols.append(col)
        

In [None]:
print_with_format(cong_only_cols)

## Manipulation

In [61]:
congress_df['state_abbr'] = congress_df['state_label'].apply(lambda x: full2abbr[x])
congress_df['region'] = congress_df['state_abbr'].apply(lambda x: state2region[x])

In [13]:
event_df_clean['region'] = event_df_clean['state'].apply(lambda x: state2region[x])

In [14]:
event_df_clean['state'].unique()

array(['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',
       'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
       'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM',
       'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
       'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY'], dtype=object)

In [83]:
event_df[event_df['state'] == 'PR']

Unnamed: 0.1,Unnamed: 0,state,STFIPS,districtId,event,vtec,published,updated
419347,419348,PR,72,7200,Coastal Flood,CF,2019-01-23 23:56:00,2019-01-23 23:56:00
419348,419349,PR,72,7200,Coastal Flood,CF,2019-01-24 00:42:00,2019-01-24 00:42:00
419349,419350,PR,72,7200,Coastal Flood,CF,2019-01-24 05:21:00,2019-01-24 05:21:00
419350,419351,PR,72,7200,Coastal Flood,CF,2019-01-24 11:39:00,2019-01-24 11:39:00
419351,419352,PR,72,7200,Coastal Flood,CF,2019-08-27 18:54:00,2019-08-27 18:54:00
...,...,...,...,...,...,...,...,...
420624,420625,PR,72,7200,Wind Advisory,WI,2018-12-29 11:39:00,2018-12-29 11:39:00
420625,420626,PR,72,7200,Wind Advisory,WI,2018-12-29 13:35:00,2018-12-29 13:35:00
420626,420627,PR,72,7200,Wind Advisory,WI,2018-12-29 15:54:00,2018-12-29 15:54:00
420627,420628,PR,72,7200,Wind Advisory,WI,2018-12-30 00:26:00,2018-12-30 00:26:00


# Write to file

In [64]:
congress_df.to_csv("./dataset/Congress_YCOM_2019_Data_abbr_region.csv", index=False)

In [15]:
event_df_clean.to_csv("./dataset/districteventList_region_clean.csv")

# Notes:
* It uses integers as *state_code*.
* *state_label* is full names of states
* there's no DC in the *state_label* while there is in the 2016 dataset.