In [280]:
import pandas as pd

In [281]:
# Load the raw export; the first 8 lines of the file are skipped, so the
# header is taken from the line that follows them.
cancer_data = pd.read_csv(
    "cancer_data.csv",
    skiprows=8,
)

In [282]:
# Lift pandas' row, column and column-width display limits so the frame
# below is rendered in full. These are global options and stay in effect
# for every later cell.
# reference: https://stackoverflow.com/questions/19124601/pretty-print-an-entire-pandas-series-dataframe
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
cancer_data

Unnamed: 0,State,FIPS,Met Healthy People Objective of ***?,"Age-Adjusted Incidence Rate([rate note]) - cases per 100,000",Lower 95% Confidence Interval,Upper 95% Confidence Interval,CI*Rank([rank note]),Lower CI (CI*Rank),Upper CI (CI*Rank),Average Annual Count,Recent Trend,Recent 5-Year Trend ([trend note]) in Incidence Rates,Lower 95% Confidence Interval.1,Upper 95% Confidence Interval.1
0,US (SEER+NPCR)(1),0.0,***,448.6,448.3,448.9,,,,1703249.0,falling,-0.9,-1.1,-0.7
1,Kentucky(7),21000.0,***,517.8,515.0,520.6,1.0,1.0,1.0,27645.0,stable,-1,-1.9,0
2,New Jersey(7),34000.0,***,486.7,484.8,488.6,2.0,2.0,4.0,52514.0,falling,-0.6,-0.8,-0.5
3,Iowa(7),19000.0,***,484.1,480.9,487.3,3.0,2.0,9.0,18583.0,stable,-0.2,-0.3,0
4,West Virginia(6),54000.0,***,483.5,479.5,487.6,4.0,2.0,10.0,12028.0,falling,-0.3,-0.4,-0.1
5,New York(7),36000.0,***,483.1,481.8,484.3,5.0,3.0,8.0,114167.0,falling,-0.7,-1,-0.5
6,Louisiana(7),22000.0,***,482.4,479.7,485.1,6.0,2.0,10.0,25875.0,falling,-0.5,-0.9,-0.2
7,Pennsylvania(6),42000.0,***,480.0,478.5,481.5,7.0,5.0,11.0,79777.0,falling,-1.1,-1.5,-0.6
8,Delaware(6),10000.0,***,479.6,474.0,485.3,8.0,2.0,13.0,5893.0,falling,-2.7,-4.6,-0.9
9,New Hampshire(6),33000.0,***,479.3,474.6,484.1,9.0,3.0,13.0,8436.0,falling,-0.8,-1,-0.6


In [283]:
# Discard the rows at positions 53 through 85: the tail of the export, which
# holds no data rows.
# reference: https://moonbooks.org/Articles/How-to-remove-one-or-multiple-rows-in-a-pandas-DataFrame-in-python-/
cancer_data = cancer_data.drop(index=cancer_data.index[53:86])

In [284]:
# Remove the trailing footnote marker, e.g. "(7)", from each State name.
# An end-anchored regex is used instead of slicing off a fixed three
# characters: names that carry no marker are left intact, and longer markers
# such as "(10)" or "(6,10)" are removed completely. Parenthesised text that
# contains letters, like "(SEER+NPCR)", is not matched and is preserved.
# reference: https://stackoverflow.com/questions/13682044/remove-unwanted-parts-from-strings-in-a-column/22238380
cancer_data['State'] = cancer_data['State'].str.replace(
    r'\s*\([\d,\s]+\)\s*$', '', regex=True
)

In [285]:
# Normalise the FIPS header: a column named " FIPS" (with a leading space)
# is renamed to "FIPS"; all other columns are left untouched.
cancer_data = cancer_data.rename({" FIPS": "FIPS"}, axis="columns")

In [286]:
# Shorten the FIPS code: divide by 1000 and store the result as a string
# with no decimal places (e.g. 21000.0 becomes "21").
# reference: https://stackoverflow.com/questions/54034755/how-to-edit-display-precision-for-only-one-dataframe-pandas
cancer_data['FIPS'] = cancer_data['FIPS'].div(1000).map(
    lambda fips_value: format(fips_value, ",.0f")
)

In [287]:
# Render the average annual count without a fractional part; the column
# holds strings afterwards (e.g. 27645.0 becomes "27645").
cancer_data['Average Annual Count'] = cancer_data['Average Annual Count'].map("{:.0f}".format)

In [288]:
# Display the frame to inspect the result of the cleaning steps above.
cancer_data

Unnamed: 0,State,FIPS,Met Healthy People Objective of ***?,"Age-Adjusted Incidence Rate([rate note]) - cases per 100,000",Lower 95% Confidence Interval,Upper 95% Confidence Interval,CI*Rank([rank note]),Lower CI (CI*Rank),Upper CI (CI*Rank),Average Annual Count,Recent Trend,Recent 5-Year Trend ([trend note]) in Incidence Rates,Lower 95% Confidence Interval.1,Upper 95% Confidence Interval.1
0,US (SEER+NPCR),0,***,448.6,448.3,448.9,,,,1703249,falling,-0.9,-1.1,-0.7
1,Kentucky,21,***,517.8,515.0,520.6,1.0,1.0,1.0,27645,stable,-1,-1.9,0
2,New Jersey,34,***,486.7,484.8,488.6,2.0,2.0,4.0,52514,falling,-0.6,-0.8,-0.5
3,Iowa,19,***,484.1,480.9,487.3,3.0,2.0,9.0,18583,stable,-0.2,-0.3,0
4,West Virginia,54,***,483.5,479.5,487.6,4.0,2.0,10.0,12028,falling,-0.3,-0.4,-0.1
5,New York,36,***,483.1,481.8,484.3,5.0,3.0,8.0,114167,falling,-0.7,-1,-0.5
6,Louisiana,22,***,482.4,479.7,485.1,6.0,2.0,10.0,25875,falling,-0.5,-0.9,-0.2
7,Pennsylvania,42,***,480.0,478.5,481.5,7.0,5.0,11.0,79777,falling,-1.1,-1.5,-0.6
8,Delaware,10,***,479.6,474.0,485.3,8.0,2.0,13.0,5893,falling,-2.7,-4.6,-0.9
9,New Hampshire,33,***,479.3,474.6,484.1,9.0,3.0,13.0,8436,falling,-0.8,-1,-0.6


In [289]:
# Persist the cleaned frame as cleaned_cancer_data.csv; the row index is
# not written to the file.
cancer_data.to_csv(
    path_or_buf='cleaned_cancer_data.csv',
    index=False,
)

In [290]:
from pyecharts import options as opts
from pyecharts.charts import Map
from pyecharts.datasets import register_url

In [291]:
# Build the map input: a list of [state, average annual count] pairs in
# cancer_data_list, plus the smallest and largest counts in low and high.
cancer_data0 = pd.read_csv("cleaned_cancer_data.csv")
cancer_data0 = cancer_data0[['State', 'Average Annual Count']]
# Drop the first row, which is the nationwide "US (SEER+NPCR)" total rather
# than an individual state. The label is taken from this frame's own index,
# so the cell no longer depends on the index of the separate cancer_data frame.
cancer_data0 = cancer_data0.drop(cancer_data0.index[0])
# Rows with a missing value in either column are excluded from the map data.
cancer_data_list = cancer_data0.dropna().values.tolist()
# Extract the counts once and reuse them for both bounds.
annual_counts = [count for _, count in cancer_data_list]
low, high = min(annual_counts), max(annual_counts)

In [292]:
# reference: https://pyecharts.readthedocs.io/projects/pyecharts-en/en/stable/en-us/documentation.html
# Draw the US map with Pyecharts, coloured by average annual count, and save
# it as the HTML file templates/map.html.
# NOTE(review): the name `map` shadows Python's builtin map() for the rest of
# the notebook; it is kept here so any later cell that refers to it still works.
map = (
    Map()
    # maptype is the map name pyecharts is asked for (presumably its
    # identifier for the United States map -- confirm against pyecharts docs).
    .add("", cancer_data_list, maptype="美国", is_map_symbol_show=False)
    # Hide the per-region text labels.
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(
        # The colour scale spans the lowest to the highest count.
        visualmap_opts=opts.VisualMapOpts(max_=high, min_=low),
        title_opts=opts.TitleOpts(title="United State cancer cases"),
    )
)
# A forward slash is used as the separator: "templates\map.html" contained the
# invalid escape sequence "\m" and only resolved to a sub-directory on Windows.
map.render(path="templates/map.html")

'd:\\yale\\Course\\BIS634\\homework\\homework5\\templates\\map.html'