In [5]:
import rpy2
import rpy2.robjects as ro
import pandas as pd
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr
base = importr('base')
utils = importr('utils')


pandas2ri.activate()

df = pd.read_csv('cancer_reg.csv', encoding='latin-1')

cancer = pandas2ri.py2rpy(df)

In [7]:
#Data PreProcessing for Visualization 1

testpropo = cancer
testpropo <- testpropo %>% mutate(Target_div_Income = TARGET_deathRate/medIncome)
testpropo1 = cbind(testpropo, str_match(testpropo$Geography,"(.+), (.+)")[ ,-1])
colnames(testpropo1)[37] ="State"
colnames(testpropo1)[36] = "County"
testpropo1[167,36] <- "Dona Ana County"
testpropo1[821,36] <- "La Salle Parish"
codes <- rep(NULL, length(testpropo1$County))

for (i in 1:length(testpropo1$avgAnnCount)){
 codes[i] = fips(state = testpropo1$State[i], county = testpropo1$County[i])
}

testpropo2 = cbind(testpropo1, fips = codes)
testpropo3 <- testpropo2 %>% mutate(Target_div_LogIncome = TARGET_deathRate/log(medIncome))
testpropolog = cbind(testpropo3, fips = codes)
graphdatalog = data.frame(fips = testpropolog$fips, values = testpropo3$Target_div_LogIncome)
newbieLOG <- graphdata %>% mutate(anomalies = ifelse(abs(scale(values)) > 1, values, 0))
newbieLOG <- newbieLOG[,c(1,3)]

Unnamed: 0,avgAnnCount,avgDeathsPerYear,TARGET_deathRate,incidenceRate,medIncome,popEst2015,povertyPercent,studyPerCap,binnedInc,MedianAge,...,PctPrivateCoverageAlone,PctEmpPrivCoverage,PctPublicCoverage,PctPublicCoverageAlone,PctWhite,PctBlack,PctAsian,PctOtherRace,PctMarriedHouseholds,BirthRate
0,1397.000000,469,164.9,489.800000,61898,260131,11.2,499.748204,"(61494.5, 125635]",39.3,...,,41.6,32.9,14.0,81.780529,2.594728,4.821857,1.843479,52.856076,6.118831
1,173.000000,70,161.3,411.600000,48127,43269,18.6,23.111234,"(48021.6, 51046.4]",33.0,...,53.8,43.6,31.1,15.3,89.228509,0.969102,2.246233,3.741352,45.372500,4.333096
2,102.000000,50,174.7,349.700000,49348,21026,14.6,47.560164,"(48021.6, 51046.4]",45.0,...,43.5,34.9,42.1,21.1,90.922190,0.739673,0.465898,2.747358,54.444868,3.729488
3,427.000000,202,194.8,430.400000,44243,75882,17.1,342.637253,"(42724.4, 45201]",42.8,...,40.3,35.0,45.3,25.0,91.744686,0.782626,1.161359,1.362643,51.021514,4.603841
4,57.000000,26,144.4,350.100000,49955,10321,12.5,0.000000,"(48021.6, 51046.4]",48.3,...,43.9,35.1,44.0,22.7,94.104024,0.270192,0.665830,0.492135,54.027460,6.796657
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3042,1962.667684,15,149.6,453.549422,46961,6343,12.4,0.000000,"(45201, 48021.6]",44.2,...,54.9,44.6,31.7,13.2,90.280811,3.837754,0.327613,1.700468,51.063830,7.773512
3043,1962.667684,43,150.1,453.549422,48609,37118,18.8,377.175494,"(48021.6, 51046.4]",30.4,...,53.3,48.6,28.8,17.7,75.706245,2.326771,4.044920,14.130288,52.007937,8.186470
3044,1962.667684,46,153.9,453.549422,51144,34536,15.0,1968.959926,"(51046.4, 54545.6]",30.9,...,52.6,47.8,26.6,16.8,87.961629,2.313188,1.316472,5.680705,55.153949,7.809192
3045,1962.667684,52,175.0,453.549422,50745,25609,13.3,0.000000,"(48021.6, 51046.4]",39.0,...,56.3,49.6,29.5,14.0,92.905681,1.176562,0.244632,2.131790,58.484232,7.582938


In [None]:
#Plotting !! Visualization 1
plot_usmap(data = newbieLOG, values = "anomalies") + 
  scale_fill_stepsn(breaks= -4:4, limits = c(-4,4),
                    colors=c("white","white", "white","red","dark red"),
                    guide = guide_colorsteps(even.steps = FALSE), name = " Anomalies") +  
  theme(panel.background = element_rect(color = "black")) + 
  theme(legend.position = "left") + labs(title = "Cancer Deaths to Median Income Anomalies",
       subtitle = "Anomalies are  standard deviations away from the mean of the ratio between \nCancer Deaths (per capita) to Median Income (on a log scale) for each \nU.S. County. \nAnomalies less than |1| are replaced with 0 for clarity.
       \nAnomalies larger than 2 represent counties with a high ratio, implying \nhigh cancer mortality and low income.")

In [8]:
#Visualization 2 !! Also in R
df_race <- reshape2::melt(cancer[, c("target_deathrate", "medincome", "pctwhite", "pctblack", "pctasian", "pctotherrace")], 
                          id.vars = c("target_deathrate", "medincome"))

df_race$variable_group <- ifelse(df_race$variable == "pctwhite", "White", 
                                 ifelse(df_race$variable == "pctblack", "Black", 
                                        ifelse(df_race$variable == "pctasian", "Asian", "Other")))
## Consistent color
ggplot(df_race, aes(x = value, y = target_deathrate, color = variable_group, size = medincome)) +
  geom_point(alpha = 0.7) +
  scale_color_manual(values = c("red", "blue", "green", "purple")) +
  scale_size_continuous(limits = c(25000, 75000)) +
  xlab("Percentage of population by race") +
  ylab("Target death rate") +
  ggtitle("Impact of race and income on target death rate") +
  facet_wrap(~variable, scales = "free_y") +
  theme_minimal()

SyntaxError: invalid syntax (1074441614.py, line 2)