In [1]:
#Question to answer: What is the demographic factors associated with higher rates of Gender-Based Violence?
#Using linear regression model which is best for predicting the rate of gender-based violence incidents.

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score

#Source for learning code to this week: https://www.geeksforgeeks.org/python-linear-regression-using-sklearn/

In [2]:
# Load Incident Data based on target objective
citizen_caw = pd.read_csv('citizen_caw rates.csv')
admin_long = pd.read_csv('admin_long data.csv')
admin_wide = pd.read_csv('admin_wide data.csv')

# Load Demographic Data based on target objective
survey_endline = pd.read_csv('user survey_endline data.csv')
citizen_full = pd.read_csv('citizen_full data.csv ')
#citizen_caw = pd.read_csv('citizen_caw rates.csv')

# Merge data on common field (ps_code or dist_id)
merge_1 = pd.merge(citizen_caw, survey_endline, on='ps_code', how='inner')
merge_2 = pd.merge(citizen_caw, admin_wide, on='ps_code', how='outer')
merge_3 = pd.merge(merge_2, admin_long, on='ps_code', how='outer')
final_data = pd.merge(merge_3, merge_1, on='ps_code', how='inner')

# Display the first few rows of the final merged dataset
print(final_data.head())

   Unnamed: 0_x_x  ps_code  population_x_x urban_x_x  dist_urban_x_x  \
0               0   1001.0          129345     Rural             1.0   
1               0   1001.0          129345     Rural             1.0   
2               0   1001.0          129345     Rural             1.0   
3               0   1001.0          129345     Rural             1.0   
4               0   1001.0          129345     Rural             1.0   

   strat_pca_x_x treatment_x_x       group_x_x  dist_id_x_x  \
0      -0.234041     Treatment  women officers       1000.0   
1      -0.234041     Treatment  women officers       1000.0   
2      -0.234041     Treatment  women officers       1000.0   
3      -0.234041     Treatment  women officers       1000.0   
4      -0.234041     Treatment  women officers       1000.0   

   implement_quality_x_x  ...  dist_urban_y_y  strat_pca_y_y  treatment_y_y  \
0                    9.0  ...             1.0      -0.234041      Treatment   
1                    9.0  ... 

In [3]:
# Eliminate missing data
final_data.ffill(inplace=True)

In [4]:
# Normalize merged data
scaler = StandardScaler()  
numerical_columns = final_data.select_dtypes(include=['float64', 'int64']).columns
final_data[numerical_columns] = scaler.fit_transform(final_data[numerical_columns])

In [5]:
# Encode categorical variables
encoder = LabelEncoder()
categorical_columns = final_data.select_dtypes(include=['object']).columns
for col in categorical_columns:
    final_data[col] = encoder.fit_transform(final_data[col])

In [6]:
# Define incident rate
#e_caw_rate: This represents the endline rate of Crimes Against Women (CAW) incidents
y = final_data['e_caw_rate'] 

# Demographics columns except the target
X = final_data.drop(columns=['e_caw_rate'])  

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Remove duplication in the data
print("Original Columns:", final_data.columns)

# Remove duplicates in the data
columns_to_drop = [col for col in final_data.columns if '_x' in col or '_y' in col]

# Drop identified duplicate columns
cleaned_data = final_data.drop(columns=columns_to_drop)

# Check cleaned columns
print("Cleaned Columns:", cleaned_data.columns)

Original Columns: Index(['Unnamed: 0_x_x', 'ps_code', 'population_x_x', 'urban_x_x',
       'dist_urban_x_x', 'strat_pca_x_x', 'treatment_x_x', 'group_x_x',
       'dist_id_x_x', 'implement_quality_x_x', 'training_score_x_x',
       'comm_outreach_strength_x_x', 'regular_whd_x_x', 'women_whd_x_x',
       'b_caw_rate_x', 'e_caw_rate_x', 'Unnamed: 0_y_x', 'urban_y_x',
       'dist_urban_y_x', 'strat_pca_y_x', 'treatment_y_x', 'dist_id_y_x',
       'regular_whd_y_x', 'women_whd_y_x', 'e_caw_rate_y', 'month_x',
       'b_fir_overall_count', 'b_fir_caw_count', 'b_fir_bywomen_count',
       'b_arrest_count', 'b_dir_count', 'b_ncr_count', 'b_dial100_count',
       'e_fir_overall_count', 'e_fir_caw_count', 'e_fir_bywomen_count',
       'e_arrest_count', 'e_dir_count', 'e_ncr_count', 'e_dial100_count',
       'implement_quality_y_x', 'training_score_y_x',
       'comm_outreach_strength_y_x', 'Unnamed: 0', 'month_y',
       'fir_overall_count', 'fir_caw_count', 'fir_bywomen_count',
       'arres

In [8]:
# Define the target and features
y = cleaned_data['e_caw_rate']  # Target variable: incident rate
X = cleaned_data.drop(columns=['e_caw_rate'])  # Features: All columns except the target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Regression with Linear Regression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Evaluate the regression model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared (R²) Score: {r2}")

Mean Squared Error: 0.8475144479648719
R-squared (R²) Score: 0.14492631633813435


In [9]:
# Access model coefficients
coefficients = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': regressor.coef_
})

# Display coefficients sorted by importance
coefficients = coefficients.sort_values(by='Coefficient', ascending=False)
print(coefficients)

                   Feature   Coefficient
27                 dist_id  3.811612e+00
28       implement_quality  2.869338e-01
23                   urban  2.425455e-01
14         e_dial100_count  1.839876e-01
13             e_ncr_count  1.336669e-01
11          e_arrest_count  8.891905e-02
2          b_fir_caw_count  8.358816e-02
22           dial100_count  6.268610e-02
19            arrest_count  5.712257e-02
21               ncr_count  4.428362e-02
17           fir_caw_count  4.383356e-02
40              fclitysats  4.310445e-02
8      e_fir_overall_count  4.165154e-02
36               visitsats  2.791694e-02
16       fir_overall_count  1.508604e-02
39              resolution  1.226072e-02
9          e_fir_caw_count  1.154691e-02
6              b_ncr_count  5.042729e-03
7          b_dial100_count  3.231898e-03
38                 respect  2.843185e-03
32               women_whd  2.219533e-03
34               consented -2.739475e-14
33                     uid -4.771770e-05
35              