# Question 1

In [None]:
!pip install dowhy
!pip install graphviz
!pip install efficient-apriori
!pip install pydot

In [None]:
import numpy as np
import pandas as pd
from dowhy import CausalModel

In [None]:
import kagglehub
import os

# Load the 2018 Stack Overflow developer survey.
# A local copy next to the notebook is used when present; otherwise the dataset
# is fetched through kagglehub so the notebook also runs in a fresh environment.
csv_file_path = './survey_results_public.csv'

if not os.path.exists(csv_file_path):
    PATH = kagglehub.dataset_download("stackoverflow/stack-overflow-2018-developer-survey")
    csv_file_path = os.path.join(PATH, 'survey_results_public.csv')

df = pd.read_csv(csv_file_path)
df.head()



In [None]:
# -------- Data Preparation --------
# Columns that appear as nodes in the causal DAG.
dag_variables = [
    'RaceEthnicity',
    'Gender',
    'Age',
    'DevType',
    'FormalEducation',
    'UndergradMajor',
    'ConvertedSalary',
]

df_dag = df[dag_variables].copy()

# Numeric columns: replace missing values with the column mean.
numeric_columns = df_dag.select_dtypes(include=['float64', 'int64']).columns
for col in numeric_columns:
    column_mean = df_dag[col].mean()
    df_dag[col] = df_dag[col].fillna(column_mean)

# Object (text) columns: replace missing values with the most frequent value.
object_columns = df_dag.select_dtypes(include=['object']).columns
for col in object_columns:
    most_frequent = df_dag[col].mode()[0]
    df_dag[col] = df_dag[col].fillna(most_frequent)

# binary treatment vars
# Each frame is a copy of df_dag in which FormalEducation becomes 1 for the
# named degree and 0 for every other answer.
masters_label = "Master’s degree (MA, MS, M.Eng., MBA, etc.)"
masters_df = df_dag.assign(
    FormalEducation=df_dag['FormalEducation'].eq(masters_label).astype(int)
)

bachelors_label = "Bachelor’s degree (BA, BS, B.Eng., etc.)"
bachelors_df = df_dag.assign(
    FormalEducation=df_dag['FormalEducation'].eq(bachelors_label).astype(int)
)

In [None]:
def compute_causal_effect(treatment, outcome, df, causal_graph):

    model = CausalModel(
        data=df,
        graph=causal_graph.replace("\n", " "),
        treatment= treatment,
        outcome=outcome)

#     model.view_model()

    identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)
    print(identified_estimand)

    estimands = model.identify_effect()

    causal_estimate_reg = model.estimate_effect(estimands,
                                                    method_name="backdoor.linear_regression",
                                                    test_significance=True)
    return causal_estimate_reg.value, causal_estimate_reg.test_stat_significance()['p_value']

In [None]:
# Causal DAG in DOT notation: one "name;" entry per node, followed by one
# "cause -> effect;" entry per directed edge.
dag_nodes = [
    'RaceEthnicity',
    'Gender',
    'Age',
    'DevType',
    'FormalEducation',
    'UndergradMajor',
    'ConvertedSalary',
]
dag_edges = [
    ('RaceEthnicity', 'ConvertedSalary'),
    ('Gender', 'DevType'),
    ('Gender', 'UndergradMajor'),
    ('UndergradMajor', 'DevType'),
    ('UndergradMajor', 'FormalEducation'),
    ('UndergradMajor', 'ConvertedSalary'),
    ('Age', 'DevType'),
    ('FormalEducation', 'DevType'),
    ('FormalEducation', 'ConvertedSalary'),
    ('RaceEthnicity', 'FormalEducation'),
    ('RaceEthnicity', 'DevType'),
    ('Gender', 'FormalEducation'),
    ('Age', 'FormalEducation'),
    ('DevType', 'ConvertedSalary'),
    ('Gender', 'ConvertedSalary'),
    ('Age', 'ConvertedSalary'),
]
DAG = [f'{node};' for node in dag_nodes] + [f'{cause} -> {effect};' for cause, effect in dag_edges]

# Wrap the entries in a "digraph { ... }" block, one entry per line.
graph_header = """
                    digraph {
                """
causal_graph = graph_header + "".join(f"{entry}\n" for entry in DAG) + "}"


# Add your code here to compute the required causal effects.

# Example:
# ATE, p_val = compute_causal_effect('Gender', 'ConvertedSalary', df, causal_graph)
# print('ATE: ', ATE, 'P-val: ',p_val)


In [None]:
# (a) ATE for "Master's degree" on ConvertedSalary
# masters_df carries FormalEducation as a 0/1 indicator for the Master's answer.
master_result = compute_causal_effect(
    treatment='FormalEducation',
    outcome='ConvertedSalary',
    df=masters_df,
    causal_graph=causal_graph,
)
ATE_master, p_val_master = master_result
print("ATE (Master's degree): {}, P-value: {}".format(ATE_master, p_val_master))

# (b) ATE for "Bachelor's degree" on ConvertedSalary
# bachelors_df carries FormalEducation as a 0/1 indicator for the Bachelor's answer.
bachelor_result = compute_causal_effect(
    treatment='FormalEducation',
    outcome='ConvertedSalary',
    df=bachelors_df,
    causal_graph=causal_graph,
)
ATE_bachelor, p_val_bachelor = bachelor_result
print("ATE (Bachelor's degree): {}, P-value: {}".format(ATE_bachelor, p_val_bachelor))

In [None]:
#q5
# Mean ConvertedSalary for every (Age, Gender, RaceEthnicity) combination,
# ordered from the lowest mean to the highest.
demographic_keys = ['Age', 'Gender', 'RaceEthnicity']
salary_by_group = df_dag.groupby(demographic_keys)['ConvertedSalary']
group_analysis = salary_by_group.mean().sort_values()
print(group_analysis)

In [None]:
# Define treatment group
# A row belongs to the group (1) only when all three conditions hold, else 0.
in_age_band = df_dag['Age'].eq('18 - 24 years old')
is_female = df_dag['Gender'].eq('Female')
has_target_ethnicity = df_dag['RaceEthnicity'].eq(
    'Black or of African descent;Native American, Pacific Islander, or Indigenous Australian;White or of European descent'
)
df_dag['Treatment_Group'] = (in_age_band & is_female & has_target_ethnicity).astype(int)

# Alternative group that was considered but not used:
#   Age == 'Under 18 years old', Gender == 'Male', RaceEthnicity == 'East Asian'

In [None]:
# Modify the DAG
# Treatment_Group is added as a node. The direct edges out of RaceEthnicity,
# Gender and Age are disabled (left as comments below); those three variables
# now point only at Treatment_Group, which in turn points at the other nodes.
DAG = [
    'RaceEthnicity;',
    'Gender;',
    'Age;',
    'DevType;',
    'FormalEducation;',
    'UndergradMajor;',
    'ConvertedSalary;',
    'Treatment_Group;',  # Added
    # 'RaceEthnicity -> ConvertedSalary;',
    # 'Gender -> DevType;',
    # 'Gender -> UndergradMajor;',
    'UndergradMajor -> DevType;',
    'UndergradMajor -> FormalEducation;',
    'UndergradMajor -> ConvertedSalary;',
    # 'Age -> DevType;',
    'FormalEducation -> DevType;',
    'FormalEducation -> ConvertedSalary;',
    # 'RaceEthnicity -> FormalEducation;',
    # 'RaceEthnicity -> DevType;',
    # 'Gender -> FormalEducation;',
    # 'Age -> FormalEducation;',
    'DevType -> ConvertedSalary;',
    # 'Gender -> ConvertedSalary;',
    # 'Age -> ConvertedSalary;',
    'RaceEthnicity -> Treatment_Group;',  # Added from here
    'Gender -> Treatment_Group;',
    'Age -> Treatment_Group;',
    # Each edge is its own list element: without the trailing commas Python
    # silently concatenates adjacent string literals into a single entry.
    'Treatment_Group -> DevType;',
    'Treatment_Group -> FormalEducation;',
    'Treatment_Group -> UndergradMajor;',
    'Treatment_Group -> ConvertedSalary;',
]

# Wrap the entries in a "digraph { ... }" block, one entry per line.
causal_graph = """
                    digraph {
                """
for line in DAG:
    causal_graph = causal_graph + line + "\n"
causal_graph = causal_graph + "}"

In [None]:
# Effect of Treatment_Group membership on ConvertedSalary under the modified DAG.
ATE, p_val = compute_causal_effect(
    treatment='Treatment_Group',
    outcome='ConvertedSalary',
    df=df_dag,
    causal_graph=causal_graph,
)
print("ATE (Bias against Treatment Group): {}, P-value: {}".format(ATE, p_val))


# Question 3

In [None]:
!pip install efficient-apriori


In [None]:
from efficient_apriori import apriori
import pandas as pd

# -------- Data Preparation --------
# NOTE: several column names and category values in this CSV are referenced
# with a leading space (e.g. ' loan_amount', ' Approved').
df = pd.read_csv('./loan_approval_dataset.csv')

# Numeric columns with gaps: fill with the column median.
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())

# Object (text) columns with gaps: fill with the most frequent value.
for col in df.select_dtypes(include=['object']).columns:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].mode()[0])

# Discretise the loan amount into four labelled ranges.
# include_lowest=True closes the first interval on the left, so an amount of
# exactly 300000 lands in '300K-7.7M' instead of becoming NaN.
# (300000 is presumably the smallest amount in the data - confirm.)
df[' loan_amount'] = pd.cut(
    df[' loan_amount'],
    bins=[300000, 7700000, 14500000, 21500000, 39500000],
    labels=['300K-7.7M', '7.7M-14.5M', '14.5M-21.5M', '21.5M-39.5M'],
    include_lowest=True,
)

# Binary encodings: 1 when the raw value equals the named category, else 0.
df[' loan_status'] = (df[' loan_status'] == ' Approved').astype(int)
df[' self_employed'] = (df[' self_employed'] == ' Yes').astype(int)
df[' education'] = (df[' education'] == ' Graduate').astype(int)

# -------- Convert to Transactions (q1)--------
def dataframe_to_transactions(df):
    """Turn every row of ``df`` into a list of ``"column:value"`` strings.

    Returns one list per row, in row order; each list holds one item per
    column, in column order.
    """
    column_names = df.columns
    return [
        [f"{name}:{record[name]}" for name in column_names]
        for _, record in df.iterrows()
    ]

transactions = dataframe_to_transactions(df)

# -------- Execute Apriori (q2)--------
min_support = 0.15
min_confidence = 0.5

itemsets, rules = apriori(
    transactions,
    min_support=min_support,
    min_confidence=min_confidence,
)

# Keep only the rules whose lift exceeds 1.05.
overrepresented_rules = list(filter(lambda candidate: candidate.lift > 1.05, rules))

print("\nFrequent Itemsets:")
# Prints each key/value pair of the itemsets mapping returned by apriori.
for key, value in itemsets.items():
    print("{}: {}".format(key, value))

print("\nOver-represented Subpopulations:")
if not overrepresented_rules:
    print("No significantly over-represented subpopulations found.")
else:
    print(*overrepresented_rules, sep="\n")


In [None]:
from efficient_apriori import apriori

# -------- Execute Apriori (q3)--------
min_support = 0.05
min_confidence = 0.6

itemsets, rules = apriori(
    transactions,
    min_support=min_support,
    min_confidence=min_confidence,
)

# Keep the rules whose string form mentions 'loan_term'.
loan_term_rules = []
for candidate in rules:
    if 'loan_term' in str(candidate):
        loan_term_rules.append(candidate)

print("\nLoan Term Related Association Rules:")
if not loan_term_rules:
    print("No rules involving loan_term were found.")
else:
    print(*loan_term_rules, sep="\n")



# Question 4

In [None]:
import matplotlib.pyplot as plt

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# -------- Data Preparation --------
# NOTE: column names and category values in this CSV are referenced with a
# leading space (e.g. ' loan_amount', ' Approved').
df = pd.read_csv('./loan_approval_dataset.csv')

# Numeric columns with gaps: fill with the column median.
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())

# Object (text) columns with gaps: fill with the most frequent value.
for col in df.select_dtypes(include=['object']).columns:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].mode()[0])

# Discretise the loan amount into four labelled ranges.
# include_lowest=True closes the first interval on the left, so an amount of
# exactly 300000 lands in '300K-7.7M' instead of becoming NaN.
# (300000 is presumably the smallest amount in the data - confirm.)
df[' loan_amount'] = pd.cut(
    df[' loan_amount'],
    bins=[300000, 7700000, 14500000, 21500000, 39500000],
    labels=['300K-7.7M', '7.7M-14.5M', '14.5M-21.5M', '21.5M-39.5M'],
    include_lowest=True,
)
# 1 when the loan was approved, 0 otherwise.
df[' loan_status'] = (df[' loan_status'] == ' Approved').astype(int)

# -------- Calculate average loan status for each loan_amount bin --------
# ' loan_amount' is categorical (output of pd.cut). observed=False is spelled
# out so every bin is kept in the result, even an empty one, and the grouping
# does not depend on the library's default for categorical groupers.
loan_amount_avg_status = df.groupby(' loan_amount', observed=False)[' loan_status'].mean()

# -------- Plot the results (q2) --------
# ' loan_status' is 0/1, so each bar is the share of approved loans in the bin.
fig, ax = plt.subplots(figsize=(8, 6))
loan_amount_avg_status.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Average Loan Status by Loan Amount', fontsize=16)
ax.set_xlabel('Loan Amount Range', fontsize=14)
ax.set_ylabel('Average Loan Status', fontsize=14)
ax.grid(axis='y')
plt.xticks(rotation=45, fontsize=12)
fig.tight_layout()
plt.show()


In [None]:
from scipy.stats import entropy
import numpy as np
import matplotlib.pyplot as plt


# -------- Overall distribution of loan statuses ---
overall_distribution = df[' loan_status'].value_counts(normalize=True)

# -------- Subpopulations based on loan_amount + loan_status distributions ---
# For every loan-amount bin, compare the bin's loan_status distribution with
# the overall one via entropy(p, q), i.e. KL(bin || overall).
divergence_results = {}

# dropna(): a missing bin (an amount outside the cut edges) matches no rows
# under ==, which would give an empty subpopulation and a NaN divergence.
for loan_bin in df[' loan_amount'].dropna().unique():
    subpop = df[df[' loan_amount'] == loan_bin]
    # Align on the overall index so both distributions list the same statuses
    # in the same order; statuses absent from the bin get probability 0.
    subpop_distribution = (
        subpop[' loan_status']
        .value_counts(normalize=True)
        .reindex(overall_distribution.index, fill_value=0)
    )
    divergence_results[loan_bin] = entropy(subpop_distribution, overall_distribution)

# DataFrame as it was easier for plotting
divergence_df = pd.DataFrame({
    'Loan Amount Range': list(divergence_results.keys()),
    'KL Divergence': list(divergence_results.values()),
}).sort_values(by='KL Divergence', ascending=False)


print("Most Divergent Subpopulation:")
if not divergence_df.empty:
    # Rows are sorted by divergence, largest first.
    most_divergent = divergence_df.iloc[0]
    print(f"Loan Amount Range: {most_divergent['Loan Amount Range']}, KL Divergence: {most_divergent['KL Divergence']:.4f}")
else:
    print("No significant divergent subpopulations found.")


# -------- Plot the results (q3) --------
# Cast the bin labels to str before passing them to the bar chart.
divergence_df['Loan Amount Range'] = divergence_df['Loan Amount Range'].astype(str)

fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(
    divergence_df['Loan Amount Range'],
    divergence_df['KL Divergence'],
    color='salmon',
)
ax.set_title('KL Divergence by Loan Amount Range', fontsize=16)
ax.set_xlabel('Loan Amount Range', fontsize=14)
ax.set_ylabel('KL Divergence', fontsize=14)
ax.grid(axis='y')
plt.xticks(rotation=45, fontsize=12)
fig.tight_layout()
plt.show()


