In [None]:
import warnings
warnings.filterwarnings('ignore')
from pathlib import Path
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt


In [None]:
# Load each CSV input: define its path, then read it into a DataFrame.

# TB rates
TB_Rates = Path('Resources/TB_Rates.csv')
tb_rates_df = pd.read_csv(TB_Rates)

# Human Development Index (HDI)
HDI = Path('Resources/HDI.csv')
hdi_df = pd.read_csv(HDI)

# Drug-resistant rates
Drug_Resistant_Rates = Path('Resources/Drug_Resistant_Rates.csv')
drug_resistant_df = pd.read_csv(Drug_Resistant_Rates)

In [None]:
# Preview the first rows of tb_rates_df (no cleaning is performed in this cell)
tb_rates_df.head()

In [None]:
# Preview the first rows of the HDI data
hdi_df.head()

In [None]:
# Preview the first rows of the drug-resistant rates data
drug_resistant_df.head()

In [None]:
# Rename the HDI country column to 'country' so it matches the column name
# used by the TB rates data.
hdi_df = hdi_df.rename({'Country_Name': 'country'}, axis='columns')

hdi_df.head()

In [None]:
#MERGE HDI AND TB_RATES DATAFRAMES

# Inner-join the HDI frame with the TB rates frame (infection and mortality) on the
# shared "country" column; rows whose country appears in only one frame are dropped.
hdi_mortality_df = hdi_df.merge(tb_rates_df, how="inner", on="country")

# Display DataFrame
hdi_mortality_df.head()


In [None]:
# Column glossary:
#   e_inc_100k  = Estimated incidence (all forms) per 100 000 population
#   e_mort_100k = Estimated mortality of TB cases (all forms) per 100 000 population
#
# NOTE: the label 'Mortailty Rate per 100k' is misspelled, but later cells look the
# column up by this exact text, so it is intentionally left unchanged here.
updated_hdi_mortality_df = (
    hdi_mortality_df
    .loc[:, ['country', 'Human Development Index (HDI) (Value)', 'year', 'e_inc_100k', 'e_mort_100k']]
    .rename(columns={
        'country': 'Country',
        'Human Development Index (HDI) (Value)': 'HDI',
        'year': 'Year',
        'e_inc_100k': 'Infection Rate per 100k',
        'e_mort_100k': 'Mortailty Rate per 100k',
    })
)

updated_hdi_mortality_df.head()



In [None]:
# Retrieve only the rows for year 2021.
# .copy() makes the result an independent DataFrame, so adding columns to it later
# does not raise pandas' SettingWithCopyWarning (which the blanket
# warnings.filterwarnings('ignore') in the import cell would otherwise hide).
only_year2021_df = updated_hdi_mortality_df.loc[updated_hdi_mortality_df['Year'] == 2021, :].copy()
only_year2021_df

In [None]:
#BIN DATA BY HDI CLASSIFICATION

# Group the countries by HDI using the widely accepted HDI classification thresholds:
#   Low:       HDI < 0.550
#   Medium:    0.550 <= HDI < 0.700
#   High:      0.700 <= HDI < 0.800
#   Very High: HDI >= 0.800

# Bin edges. Combined with right=False below, every bin is closed on the left and open
# on the right, so a value sitting exactly on a threshold (e.g. 0.550) falls into the
# higher class. The open-ended top edge keeps every value >= 0.800 in "Very High".
bins = [0, 0.550, 0.700, 0.800, float("inf")]

# Create the names for the four bins in which HDI data will be stored
hdi_classificatons = ["Low", "Medium", "High", "Very High"]

# Slice the data into the bins. .assign returns a new DataFrame, which avoids writing
# a column into a filtered slice (the pattern that triggers SettingWithCopyWarning).
only_year2021_df = only_year2021_df.assign(
    **{
        "HDI Classification": pd.cut(
            only_year2021_df["HDI"],
            bins,
            labels=hdi_classificatons,
            right=False,
        )
    }
)

# Display new DataFrame with added HDI classifications
only_year2021_df

In [None]:
# Summary statistics for the 2021 infection-rate column
only_year2021_df['Infection Rate per 100k'].describe()

In [None]:
# Summary statistics for the 2021 mortality-rate column
# (the column label is spelled 'Mortailty' where it is created, so it must be spelled the same here)
only_year2021_df['Mortailty Rate per 100k'].describe()

In [None]:
# BOX AND WHISKER PLOT (INFECTION RATE)

# One box per HDI classification, showing the spread of the infection rate.
# Despite its name, boxplot_df holds the plot's axes object (it is used to set the
# title and y-label below), not a DataFrame.
boxplot_df = only_year2021_df.boxplot(
    column='Infection Rate per 100k',
    by='HDI Classification',
    figsize=(10,10),
)
boxplot_df.set(
    title='TB Incidence Rate by HDI Classification',
    ylabel='TB Incidence Rate per 100k Population',
)
plt.show()

In [None]:
# BOX AND WHISKER PLOT (MORTALITY RATE)

# One box per HDI classification, showing the spread of the mortality rate.
# Despite its name, boxplot_df holds the plot's axes object (it is used to set the
# title and y-label below), not a DataFrame.
boxplot_df = only_year2021_df.boxplot(
    column='Mortailty Rate per 100k',
    by='HDI Classification',
    figsize=(10,10),
)
boxplot_df.set(
    title='Mortality Rate by HDI Classification',
    ylabel='Mortality Rate per 100k Population',
)
plt.show()

In [None]:
#SCATTER PLOT

# Data for the plot: HDI on the x-axis, infection rate on the y-axis.
# x_values / y_values are also read by the linear-regression cell.
x_values = only_year2021_df['HDI']
y_values = only_year2021_df['Infection Rate per 100k']

# Draw the scatter on a wide figure with rotated x tick labels
plt.figure(figsize=(30,10))
plt.xticks(rotation = 55)
plt.scatter(x=x_values, y=y_values)

# Title and axis labels
plt.title('TB Incidence Rate per 100k Population by HDI')
plt.xlabel('HDI')
plt.ylabel('TB Incidence Rate per 100k Population')
plt.show()



In [None]:
#LINEAR REGRESSION

# Fit a least-squares line of y_values on x_values and expose each statistic
# under its own name for the cells that follow.
regression = stats.linregress(x_values, y_values)
slope = regression.slope
intercept = regression.intercept
rvalue = regression.rvalue
pvalue = regression.pvalue
stderr = regression.stderr

# Predicted y for every x on the fitted line (slope * x + intercept)
regress_values = x_values.mul(slope).add(intercept)

In [None]:
#LINE EQUATION STRING

# Format the fitted line as text, with slope and intercept rounded to 2 decimals
line_eq = f"y = {round(slope, 2)!s}x + {round(intercept, 2)!s}"

line_eq

In [None]:
#SCATTER PLOT WITH LINEAR REGRESSION TEXT

# Re-draw the 2021 scatter and overlay the fitted regression line and its equation.
# Uses line_eq and regress_values computed in the preceding cells.

x_values = only_year2021_df['HDI']
y_values = only_year2021_df['Infection Rate per 100k']
plt.figure(figsize=(30,10))
plt.title('TB Incidence Rate per 100k Population by HDI')
plt.xticks(rotation = 55)
plt.xlabel('HDI')
plt.ylabel('TB Incidence Rate per 100k Population')
plt.scatter(x_values, y_values)

# Fitted regression line
plt.plot(x_values, regress_values, 'r-')

# Place the equation at a fixed spot near the top-left corner of the axes.
# 'axes fraction' coordinates run from (0, 0) at the lower-left to (1, 1) at the
# upper-right of the axes, so the label stays inside the plot regardless of the
# data range (the previous code fed the intercept, a data value, into
# 'axes points', which made the position depend on the intercept's magnitude).
plt.annotate(line_eq,
             xy=(0.05, 0.9),
             xycoords='axes fraction',
             fontsize=15,
             color='red')

# Render the figure and suppress the textual repr of the last plotting call
plt.show()



In [None]:
# Print the r-squared value (coefficient of determination): the square of the
# correlation coefficient `rvalue` returned by the linear regression.

print(f'The r-squared value is: {rvalue**2}')
