In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Read the dataset from disk into `df`, the DataFrame the later cells operate on.
df = pd.read_csv('stop_tb_data2.csv')

# Preview the leading five rows to get a feel for the columns and their values.
print(df.head(n=5))



In [None]:
# Per-column tally of missing entries.
missing_values_count = df.isna().sum()

# Share of missing entries in each column, expressed as a percentage
# (the mean of the boolean null mask, scaled by 100).
missing_values_percentage = df.isna().mean().mul(100)

# Show both summaries, one after the other.
print(missing_values_count, missing_values_percentage, sep='\n')


In [None]:
# Keep only the columns whose non-null count reaches half the row count,
# i.e. discard every column that is more than 50% missing.
df = df.dropna(axis='columns', thresh=0.5 * len(df))

# Impute the remaining gaps in the numeric columns with each column's own median.
# Non-numeric columns are left untouched by this step.
numeric_columns = df.select_dtypes(include=np.number).columns
df[numeric_columns] = df[numeric_columns].fillna(
    value=df[numeric_columns].median()
)


In [None]:
# Recompute the per-column missing-value summaries on the current state of `df`.
# Count of missing entries per column:
missing_values_count = df.isna().sum()

# Percentage of missing entries per column:
missing_values_percentage = 100 * df.isna().mean()

# Report the count first, then the percentage.
for summary in (missing_values_count, missing_values_percentage):
    print(summary)


In [None]:
# Display the first few rows of the dataframe
print(df.head())

# Display a summary of the dataframe to check data types and non-null counts.
# DataFrame.info() writes the summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()


In [None]:
# Remove the country / region identifier columns before computing correlations.
df_filtered = df.drop(['country_name', 'iso3_code', 'g_whoregion'], axis='columns')

# One-hot encode whatever categorical columns remain; drop_first=True omits the
# first level of each encoded column.
df_numeric = pd.get_dummies(df_filtered, drop_first=True)

# Pairwise correlation between all resulting columns.
correlation_matrix = df_numeric.corr()

# Render the matrix as an annotated heatmap on a large canvas.
# sns.heatmap draws on the current axes and returns them, so the title is set
# directly on the returned Axes.
plt.figure(figsize=(20, 15))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
).set_title('Correlation Matrix')
plt.show()


In [None]:
# 'tb_incidence' is the column name for TB incidence.
# Take its column of the correlation matrix and remove the self-correlation entry.
tb_incidence_corr = correlation_matrix['tb_incidence'].drop('tb_incidence')

# Ranking by absolute value (strongest first). This holds magnitudes only and is
# kept under its original name in case a later cell reads it.
sorted_corr = tb_incidence_corr.abs().sort_values(ascending=False)

# Signed view in the same strongest-first order, used for display so that the
# direction (positive / negative) of each relationship stays visible.
# Undefined correlations (NaN) are sorted last by sort_values; they are removed
# here so they do not fill the "Bottom" listing.
signed_sorted_corr = tb_incidence_corr.sort_values(key=abs, ascending=False).dropna()

# Display the top correlations (positive and negative)
print("Top Correlations with TB Incidence:")
print(signed_sorted_corr.head(20))  # 20 strongest by magnitude, sign preserved
print("\n")
print("Bottom Correlations with TB Incidence:")
print(signed_sorted_corr.tail(10))  # 10 weakest by magnitude, sign preserved
