In [1]:
import pandas as pd
import statsmodels.api as sm
import numpy as np

# Step 1: Open and read the CSV file
def read_csv_file(file_path):
    """Load a CSV file into a DataFrame, reporting common failures.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame or None
        The parsed frame, or ``None`` when the file does not exist or
        contains no data. In both failure cases a message is printed.
        Any other exception raised by ``pd.read_csv`` propagates.
    """
    try:
        frame = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
    except pd.errors.EmptyDataError:
        print(f"Error: File '{file_path}' is empty.")
    else:
        return frame
    # Reached only after one of the handled errors above.
    return None

# Step 2: Prepare the data for regression
def prepare_data(df):
    """Split a frame into a design matrix and a response by column position.

    The column at index 1 is taken as the target, every column from
    index 2 onwards is a predictor, and the column at index 0 is not used.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; indexing ``df.columns[1]`` requires at least two columns.

    Returns
    -------
    tuple
        ``(X, y, target_column, feature_columns)`` where ``X`` is the
        predictor frame after ``sm.add_constant`` and ``y`` is the target
        column. ``target_column`` and ``feature_columns`` are the
        corresponding labels taken from ``df.columns``.
    """
    all_columns = df.columns
    target_column, feature_columns = all_columns[1], all_columns[2:]

    y = df[target_column]
    # add_constant supplies the intercept term for the regression.
    X = sm.add_constant(df[feature_columns])

    return X, y, target_column, feature_columns

# Step 3: Perform stepwise backwards regression with Negative Binomial
def stepwise_backwards_regression(X, y, threshold=0.05):
    """Backward-eliminate predictors from a Negative Binomial GLM by p-value.

    Starting from every column of ``X``, the model is fitted and the
    predictor with the largest p-value is dropped while that p-value
    exceeds ``threshold``. Each removal is printed. Elimination stops when
    no remaining predictor exceeds the threshold or when a single column
    is left.

    Parameters
    ----------
    X : pandas.DataFrame
        Design matrix. The first column is never considered for removal;
        it is assumed to be the intercept (as produced by ``prepare_data``).
    y : array-like
        Response variable passed to ``sm.GLM``.
    threshold : float, default 0.05
        A predictor is removed only while its p-value is strictly greater
        than this value.

    Returns
    -------
    tuple
        ``(final_model, selected_features)``: the fitted results for the
        surviving columns and the list of those column names, in their
        original order.

    Notes
    -----
    NOTE(review): ``NegativeBinomial()`` is constructed with its default
    settings, so no dispersion parameter is estimated here; confirm that
    this is intended for the data being modelled.
    """
    selected_features = list(X.columns)

    def _fit(columns):
        # One place for the model specification, so every fit is identical.
        return sm.GLM(y, X[columns], family=sm.families.NegativeBinomial()).fit()

    # Fit once up front and refit only after a removal. The model in hand is
    # therefore always the fit for `selected_features`, so no extra fit is
    # needed when the loop ends.
    model = _fit(selected_features)

    while len(selected_features) > 1:  # Keep at least the first (constant) column
        # Positional slice: skip the first coefficient, assumed to be the intercept.
        p_values = model.pvalues.iloc[1:]
        max_p_value = p_values.max()

        # Written as `not (>)` so a NaN maximum also stops the elimination.
        if not max_p_value > threshold:
            break

        feature_to_remove = p_values.idxmax()
        print(f"Removing predictor {feature_to_remove} with p-value {max_p_value}")
        selected_features.remove(feature_to_remove)
        model = _fit(selected_features)

    return model, selected_features

# Step 4: Print the results
def print_results(results, target_column, selected_features):
    """Print the target name, the retained features, and the model summary.

    Parameters
    ----------
    results : object
        Fitted model results; only its ``summary()`` method is used.
    target_column : str
        Name of the response column.
    selected_features : iterable of str
        Names of the columns kept in the final model, joined with ", ".
    """
    print(f"Target column: {target_column}")

    feature_list = ', '.join(selected_features)
    print(f"Selected features: {feature_list}")

    print("\nRegression Results:")
    summary_table = results.summary()
    print(summary_table)

# Main function
def main(file_path=None):
    """Run the pipeline: load the CSV, normalise headers, fit, and report.

    Parameters
    ----------
    file_path : str or path-like, optional
        CSV file to analyse. When omitted, the previously hard-coded
        location is used so existing ``main()`` calls behave as before.

    Returns
    -------
    None
        Results are printed. If the file cannot be loaded,
        ``read_csv_file`` reports the problem and nothing else is done.
    """
    if file_path is None:
        # Machine-specific default kept only for backward compatibility;
        # pass `file_path` explicitly to run against any other dataset.
        file_path = r"/Users/pushtikanani/Downloads/population stratifaction (Sheet3).csv"

    # Read the CSV file
    df = read_csv_file(file_path)
    if df is None:
        return

    # Clean the column names: trim whitespace, spaces -> underscores, lower-case
    df.columns = df.columns.str.strip().str.replace(' ', '_').str.lower()

    # Prepare the data (the returned feature labels are not needed here)
    X, y, target_column, _ = prepare_data(df)

    # Perform stepwise backwards regression
    final_model, selected_features = stepwise_backwards_regression(X, y)

    # Print the results
    print_results(final_model, target_column, selected_features)

# Entry point: run the pipeline only when this code is executed directly
# (module name "__main__"), not when it is imported from elsewhere.
if __name__ == "__main__":
    main()


Removing predictor x11_=_other_race_portion_of_fentanyl_deaths_per_zipcode_(3.9%) with p-value 0.9934395141566654
Removing predictor x4_=_fentanyl_deaths_(35-50) with p-value 0.9934817229995863
Removing predictor x2_=_fentanyl_deaths_(18-25) with p-value 0.9678583509791486
Removing predictor x7_=_white_portion_of_fentanyl_deaths_per_zipcode_(47%) with p-value 0.9383999323607128
Removing predictor x5_=_fentanyl_deaths_(>50) with p-value 0.8492492721210433
Removing predictor x9_=_asian_portion_of_fentanyl_deaths_per_zipcode_(4.1%) with p-value 0.7924035394410931
Removing predictor x6_=_educational_attainment_of_18_and_above_for_bachelor_and_above_degrees with p-value 0.8391483875174672
Removing predictor x8_=_black_portion_of_fentanyl_deaths_per_zipcode_(15.5%) with p-value 0.7807235190945669
Removing predictor x1_=_fentanyl_deaths_(0<18) with p-value 0.7368166618178225
Removing predictor x3_=_fentanyl_deaths_(26-34) with p-value 0.6513040356456961
Target column: y_=_stratification_(depe

