In [None]:
import pandas as pd

# Load the homicide table and keep only the columns the per-country total needs.
# `drop(columns=...)` returns a new frame instead of mutating in place, so the
# cell produces the same result however many times it is re-run.
df = (
    pd.read_csv('Homicides.csv')
    .drop(columns=['Region', 'Subregion', 'Dimension', 'Category', 'Year', 'Unit of measurement', 'Source'])
)

# A notebook only renders the LAST expression of a cell, so a bare `df.head()`
# in the middle of the cell is computed and thrown away. Display it explicitly.
display(df.head())

# Sum of VALUE over every remaining row of each country. 'Year', 'Dimension',
# 'Category' and 'Unit of measurement' were dropped above, so rows that differed
# only in those fields are all added together.
# NOTE(review): if Homicides.csv mixes units or breakdown categories, filter
# before summing (the corruption cell keeps only "Counts") - confirm against the data.
country = df.groupby(by='Country')['VALUE'].sum()
country

In [None]:
# Religion
import numpy as np

# Load the religion table, discard columns that are not used, and keep the
# 2020 rows only. Built as one chain so no frame is mutated in place.
religion = (
    pd.read_csv('religion.csv')
    .drop(columns=['Region', 'Level', 'Countrycode'])
    .query('Year == 2020')
    .copy()
)

# These counts are handled as text containing thousands separators ("1,234,567").
# Strip the commas AND store the result as integers: previously the cleaned
# values were written back as strings, which forced every later use to re-cast.
# A value that is still not a whole number after stripping raises here, at the
# point where the bad data enters, rather than in a later cell.
for count_col in ('Population', 'Religiously_unaffiliated'):
    religion[count_col] = (
        religion[count_col]
        .astype(str)
        .str.replace(',', '', regex=False)
        .astype(int)
    )

# Share of the population that is NOT religiously unaffiliated (0..1).
religion['Religion Density'] = 1 - (religion['Religiously_unaffiliated'] / religion['Population'])
religion

In [None]:
# Sex
# Read the table, discard the descriptive columns, then collapse it to one
# total of VALUE per country (a Series indexed by Country).
sex_raw = pd.read_csv('Sex.csv')
sex_trimmed = sex_raw.drop(
    columns=['Iso3_code', 'Region', 'Subregion', 'Indicator', 'Dimension', 'Category']
)
sex = sex_trimmed.groupby('Country')['VALUE'].sum()
sex

In [None]:
# Corruption
# Keep only the rows measured as raw counts, then total VALUE per country
# (a Series indexed by Country).
corruption_counts = pd.read_csv('Corruption.csv').query('`Unit of measurement` == "Counts"')
corruption = corruption_counts.groupby('Country')['VALUE'].sum()
corruption

In [None]:
# Merging
# Inner-join the religion table with the three per-country totals. Each total
# is a Series named 'VALUE':
#   * the first join adds a plain 'VALUE' column (homicides);
#   * the second join collides on 'VALUE', so the suffixes turn the pair into
#     'VALUE_hom' / 'VALUE_sex';
#   * the third join adds a fresh 'VALUE' (corruption), renamed to 'VALUES_corr'.
merged_df = (
    religion
    .merge(country, how='inner', on=['Country'])
    .merge(sex, how='inner', on=['Country'], suffixes=('_hom', '_sex'))
    .merge(corruption, how='inner', on=['Country'])
    .rename(columns={'VALUE': 'VALUES_corr'})
)

# Each density is the integer count divided by the integer population, times 100.
population = merged_df['Population'].astype(int)
density_sources = (
    ('VALUE_hom', 'Homicide Density'),
    ('VALUE_sex', 'Sex Assault Density'),
    ('VALUES_corr', 'Corruption Density'),
)
for count_col, density_col in density_sources:
    merged_df[density_col] = merged_df[count_col].astype(int) / population * 100

merged_df.head()


In [None]:
import pandas as pd

# Inspection aid: re-read the two raw files untouched and list their column
# names, so the original headers can be checked.
relig = pd.read_csv("religion.csv")
corr = pd.read_csv("Corruption.csv")

for heading, frame in (("Religion columns:", relig), ("Corruption columns:", corr)):
    print(heading, frame.columns.tolist())

In [None]:
# Inspection aid: show the column index of whatever frame `df` currently names
# (the trimmed homicide table when the notebook is run top to bottom).
current_columns = df.columns
print(current_columns)

In [None]:
#Corruption vs religion plot
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# =======================
# Load data
# =======================
# NOTE(review): no cell in this notebook writes final_1.csv; it is presumably an
# export of `merged_df` - confirm the file is up to date before trusting the plot.
df = pd.read_csv("final_1.csv")


# =======================
# Clean numeric columns
# =======================
# Every column the figure reads must be numeric. The list used to name
# 'corruption_density', which is not the column plotted below, so the
# `if col in df.columns` guard silently skipped it and 'Corruption Density'
# was never coerced.
cols_to_clean = [
    'Christians', 'Muslims', 'Buddhists', 'Hindus', 'Jews',
    'Other_religions', 'Religiously_unaffiliated',
    'Population', 'Corruption Density'
]

for col in cols_to_clean:
    if col in df.columns:
        # Strip thousands separators; anything still unparseable becomes NaN.
        df[col] = pd.to_numeric(
            df[col].astype(str).str.replace(',', '', regex=False),
            errors='coerce'
        )

# =======================
# Compute new columns
# =======================
# "Religious" is defined here as everyone who is not religiously unaffiliated.
df_filtered = df.copy()
df_filtered['Non_Religious_Count'] = df_filtered['Religiously_unaffiliated']
df_filtered['Religious_Count'] = df_filtered['Population'] - df_filtered['Non_Religious_Count']

# =======================
# Sort countries by population
# =======================
df_sorted = df_filtered.sort_values(by='Population', ascending=True)

# =======================
# Create figure
# =======================
# Half an inch per country, with a floor so a short (or empty) table still
# yields a valid, readable figure instead of a zero-height one.
fig_height = max(len(df_sorted) * 0.5, 4)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, fig_height), sharey=True)

# =======================
# LEFT SIDE: Corruption Density
# =======================
ax1.barh(
    df_sorted['Country'],
    df_sorted['Corruption Density'],
    color='#FF0000',
    edgecolor='black',
    linewidth=0.5
)

# Reverse the x axis so the bars grow leftwards, away from the shared spine.
# Series.max() skips NaN; the built-in max() gives order-dependent results
# once a NaN is present.
max_density = df_sorted['Corruption Density'].max()
if pd.notna(max_density) and max_density > 0:
    ax1.set_xlim(max_density * 1.1, 0)
else:
    # Nothing positive to scale against: just flip the default axis.
    ax1.invert_xaxis()
ax1.set_xlabel('Corruption Density')
ax1.set_title('Corruption Density', fontsize=14, fontweight='bold', color='#FF0000')
ax1.grid(axis='x', linestyle='--', alpha=0.5)

# =======================
# RIGHT SIDE: Religious vs Non-Religious (Stacked)
# =======================

p1 = ax2.barh(
    df_sorted['Country'],
    df_sorted['Religious_Count'],
    color='#87CEFA',
    label='Religious',
    edgecolor='black',
    linewidth=0.5
)

# Second segment starts where the first one ends (left=...), forming the stack.
p2 = ax2.barh(
    df_sorted['Country'],
    df_sorted['Non_Religious_Count'],
    left=df_sorted['Religious_Count'],
    color='#E6F3FF',
    label='Non-Religious',
    edgecolor='black',
    linewidth=0.5
)

ax2.set_xlabel('Population')
ax2.set_title('Population Distribution', fontsize=14, fontweight='bold', color='#2e86de')
ax2.legend()
ax2.grid(axis='x', linestyle='--', alpha=0.5)
ax2.ticklabel_format(style='plain', axis='x')

fig.suptitle(
    'Comparison: Corruption Density vs. Religious Composition',
    fontsize=16,
    y=1.005
)

# Order matters: tight_layout() recomputes every subplot parameter, wspace
# included, so the zero gap between the two panels has to be applied AFTER it.
fig.tight_layout()
fig.subplots_adjust(wspace=0.0)
plt.show()


In [None]:
#Sex Assault vs religion plot
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")

# ==========================================
# Load data
# ==========================================
# NOTE(review): no cell in this notebook writes final_1.csv; it is presumably an
# export of `merged_df` - confirm the file is up to date before trusting the plot.
df = pd.read_csv("final_1.csv")

# ==========================================
# Clean numeric columns
# ==========================================
# Every column the figure reads must be numeric. The list used to contain only
# 'VALUE_sex_assault', so the plotted 'Sex Assault Density' column was never
# coerced. Both spellings of the raw count column are listed because the
# `if col in df.columns` guard skips whichever one is absent.
cols_to_clean = [
    'Christians', 'Muslims', 'Buddhists', 'Hindus', 'Jews',
    'Other_religions', 'Religiously_unaffiliated',
    'Population', 'VALUE_sex', 'VALUE_sex_assault', 'Sex Assault Density'
]

for col in cols_to_clean:
    if col in df.columns:
        # Strip thousands separators; anything still unparseable becomes NaN.
        df[col] = pd.to_numeric(
            df[col].astype(str).str.replace(',', '', regex=False),
            errors='coerce'
        )

# ==========================================
# Religious composition
# ==========================================
# "Religious" is defined here as everyone who is not religiously unaffiliated.
df_filtered = df.copy()
df_filtered['Non_Religious_Count'] = df_filtered['Religiously_unaffiliated']
df_filtered['Religious_Count'] = df_filtered['Population'] - df_filtered['Non_Religious_Count']

# ==========================================
# Sort
# ==========================================
df_sorted = df_filtered.sort_values(by='Population', ascending=True)

# ==========================================
# Create figure layout
# ==========================================
# Half an inch per country, with a floor so a short (or empty) table still
# yields a valid, readable figure instead of a zero-height one.
fig_height = max(len(df_sorted) * 0.5, 4)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, fig_height), sharey=True)

# ==========================================
# LEFT PANEL - Sex Assault Density
# ==========================================
sns.barplot(
    y=df_sorted['Country'],
    x=df_sorted['Sex Assault Density'],
    ax=ax1,
    color='#FF1493',  # Pink-ish to signal assault-related variable
    edgecolor='black'
)

# Reverse the x axis so the bars grow leftwards, away from the shared spine.
# Series.max() skips NaN; the built-in max() gives order-dependent results
# once a NaN is present.
max_density = df_sorted['Sex Assault Density'].max()
if pd.notna(max_density) and max_density > 0:
    ax1.set_xlim(max_density * 1.1, 0)
else:
    # Nothing positive to scale against: just flip the default axis.
    ax1.invert_xaxis()
ax1.set_xlabel('Sex Assault Density')
ax1.set_title('Sex Assault Density', fontsize=14, fontweight='bold', color='#FF1493')
ax1.grid(axis='x', linestyle='--', alpha=0.5)

# ==========================================
# RIGHT PANEL - Religious vs Non-religious
# ==========================================
ax2.barh(
    df_sorted['Country'],
    df_sorted['Religious_Count'],
    color='#87CEFA',
    label='Religious',
    edgecolor='black',
    linewidth=0.5
)

# Second segment starts where the first one ends (left=...), forming the stack.
ax2.barh(
    df_sorted['Country'],
    df_sorted['Non_Religious_Count'],
    left=df_sorted['Religious_Count'],
    color='#E6F3FF',
    label='Non-Religious',
    edgecolor='black',
    linewidth=0.5
)

ax2.set_xlabel('Population')
ax2.set_title('Population Distribution', fontsize=14, fontweight='bold', color='#2e86de')
ax2.legend()
ax2.grid(axis='x', linestyle='--', alpha=0.5)
ax2.ticklabel_format(style='plain', axis='x')

# ==========================================
# Final layout
# ==========================================
fig.suptitle(
    'Comparison: Sex Assault Density vs. Religious Composition',
    fontsize=16,
    y=1.005
)

# Order matters: tight_layout() recomputes every subplot parameter, wspace
# included, so the zero gap between the two panels has to be applied AFTER it.
fig.tight_layout()
fig.subplots_adjust(wspace=0.0)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# ==========================================
# 1. Load and Clean Data
# ==========================================
try:
    # Load the dataset
    df = pd.read_csv('final_1.csv')

    # Remove any leading/trailing spaces from column names
    df.columns = df.columns.str.strip()

    print("Columns found:", df.columns.tolist())

except FileNotFoundError:
    print("Error: 'final_1.csv' not found. Trying to use an existing DataFrame from the notebook (fallback).")
    # Fall back to the first non-empty DataFrame left behind by an earlier cell.
    # The order is the search priority; each name is listed once.
    fallback = None
    for varname in ['df_filtered', 'df_sorted', 'merged_df', 'df_clean', 'df']:
        candidate = globals().get(varname)
        if isinstance(candidate, pd.DataFrame) and not candidate.empty:
            fallback = candidate.copy()
            print(f"Using fallback dataframe from variable '{varname}'.")
            break
    if fallback is not None:
        df = fallback
    else:
        print("No suitable fallback dataframe found. Creating empty df.")
        df = pd.DataFrame()

# Columns to coerce to numeric (thousands separators removed first). Includes
# every count column the density step below may read, so a division never runs
# on text. Columns that are absent are skipped.
numeric_cols = [
    'Christians', 'Muslims', 'Buddhists', 'Hindus', 'Jews',
    'Other_religions', 'Religiously_unaffiliated', 'Population',
    'VALUE_hom', 'VALUE_sex', 'VALUE_sex_assault', 'VALUES_corr',
    'Homicide Density', 'Sex Assault Density', 'Corruption Density'
]

for col in numeric_cols:
    if col in df.columns:
        # Anything still unparseable after stripping commas becomes NaN.
        df[col] = pd.to_numeric(
            df[col].astype(str).str.replace(',', '', regex=False),
            errors='coerce'
        )

# ==========================================
# 2. Ensure Densities Exist (Safe Calculation)
# ==========================================
# density column -> (candidate count columns in priority order, message printed
# when the density can be neither found nor derived)
density_sources = {
    'Homicide Density': (
        ['VALUE_hom'],
        "⚠️ Warning: Could not calculate Homicide Density (Missing 'VALUE_hom'). Using available data.",
    ),
    'Sex Assault Density': (
        ['VALUE_sex', 'VALUE_sex_assault'],
        "⚠️ Warning: Could not calculate Sex Assault Density. Using available data.",
    ),
    'Corruption Density': (
        ['VALUES_corr'],
        "⚠️ Warning: Could not calculate Corruption Density. Using available data.",
    ),
}

for density_col, (count_candidates, warning) in density_sources.items():
    # Keep a density that is already present and has at least one value.
    if density_col in df.columns and not df[density_col].isnull().all():
        continue
    count_col = next((c for c in count_candidates if c in df.columns), None)
    if count_col is not None and 'Population' in df.columns:
        # Per-100 basis: count / population * 100
        df[density_col] = (df[count_col] / df['Population']) * 100
    else:
        print(warning)

# Rows are usable for the plots only when both critical columns are present.
# dropna(subset=...) raises KeyError for a missing column, which is exactly the
# situation on the warning / empty-frame paths above; in that case hand the
# later sections an empty frame so they report "not enough data" instead.
required_cols = ['Homicide Density', 'Population']
missing_required = [c for c in required_cols if c not in df.columns]
if missing_required:
    print(f"⚠️ Warning: Missing required column(s) {missing_required}; no rows available for plotting.")
    df_clean = df.iloc[0:0].copy()
else:
    df_clean = df.dropna(subset=required_cols).copy()

# ==========================================
# 3. Graphs 1 & 2: High vs Low Homicide Country Composition
# ==========================================

# Religion count columns. Declared outside the branch so the name is bound even
# when the comparison chart is skipped.
religion_cols = ['Christians', 'Muslims', 'Buddhists', 'Hindus', 'Jews', 'Other_religions', 'Religiously_unaffiliated']

if not df_clean.empty:
    # Rows holding the highest and the lowest homicide density
    high_idx = df_clean['Homicide Density'].idxmax()
    low_idx = df_clean['Homicide Density'].idxmin()
    high_hom_country = df_clean.loc[high_idx]
    low_hom_country = df_clean.loc[low_idx]

    # Select both rows straight from the frame. Going through
    # Series.to_frame().T instead would turn every column into object dtype,
    # which some pandas versions reject with "no numeric data to plot".
    comparison_countries = df_clean.loc[[high_idx, low_idx]].reset_index(drop=True)
    comparison_countries['Label'] = [
        f"High Homicide ({high_hom_country['Country']})",
        f"Low Homicide ({low_hom_country['Country']})"
    ]

    # Each religion as a percentage of the row's summed religion counts
    comp_data = comparison_countries.set_index('Label')[religion_cols]
    comp_data_pct = comp_data.div(comp_data.sum(axis=1), axis=0) * 100

    # Side-by-side 100% stacked horizontal bars
    fig, ax = plt.subplots(figsize=(12, 6))
    comp_data_pct.plot(kind='barh', stacked=True, ax=ax, colormap='tab10', edgecolor='black')

    ax.set_xlabel('Percentage of Population')
    ax.set_title('Religious Composition: High vs. Low Homicide Countries', fontsize=15, fontweight='bold')
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title='Religions')
    ax.set_xlim(0, 100)

    # Label each segment at its horizontal midpoint; segments of 3% or less are
    # left unlabeled.
    for row_pos in range(len(comp_data_pct)):
        shares = comp_data_pct.iloc[row_pos]
        segment_centers = shares.cumsum() - shares / 2
        for center, share in zip(segment_centers, shares):
            if share > 3:
                # Draw on the explicit axes rather than pyplot's "current" one.
                ax.text(center, row_pos, str(round(share, 1)) + '%',
                        va='center', ha='center', color='white', fontsize=9, fontweight='bold')

    fig.tight_layout()
    plt.show()
else:
    print("Not enough data to generate Homicide comparison graphs.")

# ==========================================
# 4. Graph 3: Crime Densities by Dominant Religion
# ==========================================

# Declared here so this section does not depend on a name that is only bound
# inside an earlier conditional branch.
religion_cols = ['Christians', 'Muslims', 'Buddhists', 'Hindus', 'Jews', 'Other_religions', 'Religiously_unaffiliated']

# One fixed colour per density series, so colours stay stable when a series is absent.
density_palette = {
    'Homicide Density': '#d62728',
    'Corruption Density': '#2ca02c',
    'Sex Assault Density': '#1f77b4',
}

# Work only with columns that actually exist: the steps that build df_clean
# tolerate missing sex-assault / corruption densities, so this one must too.
religion_present = [c for c in religion_cols if c in df_clean.columns]
densities_present = [c for c in density_palette if c in df_clean.columns]

# A row whose religion counts are all NaN has no dominant religion.
if religion_present:
    has_religion_data = df_clean[religion_present].notna().any(axis=1)
else:
    has_religion_data = pd.Series(False, index=df_clean.index)

if densities_present and has_religion_data.any():
    # Dominant religion = the religion column with the largest count in the row.
    # Rows without religion data stay NaN and are dropped by groupby.
    df_clean.loc[has_religion_data, 'Dominant_Religion'] = (
        df_clean.loc[has_religion_data, religion_present].idxmax(axis=1)
    )

    # Mean of each available density per dominant religion
    crime_means = df_clean.groupby('Dominant_Religion')[densities_present].mean()
    crime_means = crime_means.reset_index()

    # Long format for the grouped bar chart
    crime_melted = crime_means.melt(id_vars='Dominant_Religion',
                                    var_name='Crime Type',
                                    value_name='Average Density (%)')

    # Style must be set before the axes are created for the grid to apply.
    sns.set_style("whitegrid")
    fig, ax = plt.subplots(figsize=(14, 8))

    chart = sns.barplot(
        data=crime_melted,
        x='Dominant_Religion',
        y='Average Density (%)',
        hue='Crime Type',
        hue_order=densities_present,
        palette={name: density_palette[name] for name in densities_present},
        edgecolor='black',
        ax=ax
    )

    ax.set_title('Average Crime Densities by Dominant Religion', fontsize=16, fontweight='bold')
    ax.set_xlabel('Dominant Religion of Country', fontsize=12)
    # The densities are count / population * 100, i.e. a percentage, not a
    # normalized score.
    ax.set_ylabel('Average Density (% of population)', fontsize=12)
    ax.tick_params(axis='x', rotation=45)
    ax.legend(title='Crime Type')

    # Print each bar's value above it
    for container in chart.containers:
        chart.bar_label(container, fmt='%.2f', padding=3)

    fig.tight_layout()
    plt.show()
else:
    print("Not enough data to generate the dominant-religion crime chart.")