In [1]:
from plotly.graph_objs import XAxis
import requests as rq
import bs4
import pandas as pd
from io import StringIO
import plotly.express as px

# Source pages: the current article is parsed for the GDP table, and a pinned
# revision (oldid) of the same article is parsed for the country -> region table.
main_url = 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)'
region_url = 'https://en.wikipedia.org/w/index.php?title=List_of_countries_by_GDP_(nominal)&oldid=1187446467'

# Download both pages. The timeout keeps the cell from hanging indefinitely,
# and raise_for_status() stops on an HTTP error instead of silently parsing
# an error page further down.
request_headers = {"User-Agent": "Chrome"}
main_page = rq.get(main_url, headers=request_headers, timeout=30)
main_page.raise_for_status()
region_page = rq.get(region_url, headers=request_headers, timeout=30)
region_page.raise_for_status()

# Parse the HTML and collect the candidate tables: only tables with the
# "wikitable" class from the main page, every <table> from the region page.
bs4mainpage = bs4.BeautifulSoup(main_page.text, 'html.parser')
tables = bs4mainpage.find_all('table', {'class': "wikitable"})
bs4regionpage = bs4.BeautifulSoup(region_page.text, 'html.parser')
r_tables = bs4regionpage.find_all('table')

# Use the first "wikitable" found on the main page as the GDP table.
gdp = pd.read_html(StringIO(str(tables[0])))[0]

# Turn every column label into a plain string, then name the first four
# columns by position.
gdp.columns = [str(col) for col in gdp.columns]
gdp = gdp.rename(columns={
    gdp.columns[0]: "Country",
    gdp.columns[1]: "IMF",
    gdp.columns[2]: "WorldBank",
    gdp.columns[3]: "UN"
})

# Remove bracketed text from the country names and trim whitespace.
# The pattern is non-greedy so each [...] span is removed on its own; a greedy
# ".*" would also delete any text sitting between two separate brackets.
gdp["Country"] = (
    gdp["Country"]
    .str.replace(r"\[.*?\]", "", regex=True)
    .str.strip()
)

# Clean the three value columns and convert them to numbers:
#   - drop thousands separators and em-dash placeholders (literal matches,
#     regex=False so the behaviour does not depend on the pandas default),
#   - drop bracketed and parenthesised annotations (non-greedy),
#   - anything that still cannot be parsed becomes NaN (errors="coerce").
for col in ["IMF", "WorldBank", "UN"]:
    gdp[col] = (
        gdp[col]
        .astype(str)
        .str.replace(",", "", regex=False)
        .str.replace("—", "", regex=False)
        .str.replace(r"\[.*?\]", "", regex=True)
        .str.replace(r"\(.*?\)", "", regex=True)
        .str.strip()
    )
    gdp[col] = pd.to_numeric(gdp[col], errors="coerce")

# Drop the "World" aggregate row. .copy() makes the filtered result an
# independent frame, so adding columns to it later does not raise
# pandas' SettingWithCopyWarning.
gdp = gdp[gdp["Country"] != "World"].copy()

# Use the third <table> of the pinned page revision as the
# country -> region lookup.
region = pd.read_html(StringIO(str(r_tables[2])))[0]

# Turn every column label into a plain string, then name the first two
# columns by position.
region.columns = [str(col) for col in region.columns]
region = region.rename(columns={
    region.columns[0]: "Country",
    region.columns[1]: "Region"
})

# Keep only the two lookup columns. .copy() makes the selection an independent
# frame, so the column assignment below does not raise pandas'
# SettingWithCopyWarning.
region = region[["Country", "Region"]].copy()

# Remove bracketed text from the country names and trim whitespace, so the
# names can be matched against the cleaned GDP table. Non-greedy on purpose:
# only the [...] spans themselves are removed.
region["Country"] = (
    region["Country"]
    .str.replace(r"\[.*?\]", "", regex=True)
    .str.strip()
)

# Build one GDP estimate per country in "newIMF": use the IMF value when
# present, otherwise the UN value, otherwise the World Bank value.
# assign() returns a new frame, so nothing is written into the filtered `gdp`
# frame (avoids pandas' SettingWithCopyWarning). The three source columns were
# already coerced to numeric above, so no further conversion is needed.
gdp = gdp.assign(
    newIMF=gdp["IMF"].fillna(gdp["UN"]).fillna(gdp["WorldBank"])
)

# Attach each country's region (a left join keeps every GDP row), then drop
# the rows that have no GDP estimate or no matching region.
gdp = pd.merge(gdp, region, on="Country", how="left")
gdp = gdp.dropna(subset=["newIMF", "Region"])

# Frame used for plotting: region, country and the combined estimate,
# with the estimate column renamed back to "IMF".
newGDP = (
    gdp[['Region', 'Country', 'newIMF']]
    .rename(columns={"newIMF": "IMF"})
)

# Stacked bar chart: one bar per region, one coloured segment per country.
fig = px.bar(newGDP, x="Region", y="IMF", color="Country",
             title="GDP by Country", barmode='stack')
fig.show()

# Save the chart as an HTML file. write_html() is the call that writes to
# disk; to_html() only returns the markup as a string (its first positional
# argument is `config`, not a file path), so it is not used here.
fig.write_html("stacked_bar.html")

In [2]:
import pandas as pd
import plotly.express as px
import numpy as np
import plotly.graph_objects as go

# Load the hierarchy lookup table: drop the 'Level5' column, rename the
# "modify*" columns to roi / level4..level1, and keep only those five columns.
url = "https://raw.githubusercontent.com/bcaffo/MRIcloudT1volumetrics/master/inst/extdata/multilevel_lookup_table.txt"
multilevel_lookup = pd.read_csv(url, sep="\t").drop(['Level5'], axis=1)
multilevel_lookup = multilevel_lookup.rename(columns={
    "modify"   : "roi",
    "modify.1" : "level4",
    "modify.2" : "level3",
    "modify.3" : "level2",
    "modify.4" : "level1"})
multilevel_lookup = multilevel_lookup[['roi', 'level4', 'level3',
                                       'level2', 'level1']]

# Load the subject data and keep a single subject's rows with type == 1 and
# level == 5. The subject number is stored as `subject_id` rather than `id`
# so the builtin id() is not shadowed for the rest of the session.
subject_id = 127
subjectData = pd.read_csv("https://raw.githubusercontent.com/smart-stats/ds4bio_book/main/book/assetts/kirby21AllLevels.csv")
subject_rows = (
    (subjectData["type"] == 1)
    & (subjectData["level"] == 5)
    & (subjectData["id"] == subject_id)
)
subjectData = subjectData.loc[subject_rows]
subjectData = subjectData[['roi', 'volume']]

# Merge the subject data with the multilevel data (matching on "roi").
subjectData = pd.merge(subjectData, multilevel_lookup, on="roi")

# Add a constant "ICV" column shared by every row, and `comp`: each row's
# volume as a fraction of the summed volume of all rows.
subjectData = subjectData.assign(icv="ICV")
subjectData = subjectData.assign(
    comp=subjectData["volume"] / subjectData["volume"].sum()
)

# Columns that make up the Sankey hierarchy, in the order they are linked:
# each column is connected to the one that follows it.
levels = ['icv', 'level1', 'level2', 'level3', 'level4', 'roi']

# Prefix every value with the name of its column (after casting to str), so
# the node name records which level it belongs to.
for level_name in levels:
    subjectData[level_name] = level_name + "_" + subjectData[level_name].astype(str)

# Unique node names across all level columns (flattened to 1-D), plus a
# name -> integer index lookup; the index is the node's position in `labels`.
connect_nodes = pd.unique(subjectData[levels].to_numpy().ravel())
labels = list(connect_nodes)
node_dict = dict(zip(labels, range(len(labels))))

# One edge table per pair of consecutive levels: sum `comp` for every
# (parent, child) combination and translate both names to node indices.
edge = []
for parent_level, child_level in zip(levels, levels[1:]):
    flows = (
        subjectData
        .groupby([parent_level, child_level])['comp']
        .sum()
        .reset_index()
    )
    flows['source'] = flows[parent_level].map(node_dict)
    flows['target'] = flows[child_level].map(node_dict)
    edge.append(flows[['source', 'target', 'comp']])

# Stack the per-level edge tables into one frame and fix the column types.
main_df = pd.concat(edge, ignore_index=True).astype(
    {'source': int, 'target': int, 'comp': float}
)

# Sankey diagram: nodes are labelled from `labels`, and each link's
# source/target/value comes from the matching column of `main_df`.
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=labels,
        color="blue"
    ),
    link=dict(
        source=main_df['source'],
        target=main_df['target'],
        value=main_df['comp']
    ))])

fig.update_layout(title_text="MRICloud data", font_size=10)
fig.show()

# Save the diagram as an HTML file. write_html() is the call that writes to
# disk; to_html() only returns the markup as a string (its first positional
# argument is `config`, not a file path), so it is not used here.
fig.write_html("sankey.html")