# **Lab 5: Causal Inference & Prediction Modeling in Python**
# RPAD 676: Data Science for the Public Good

## Author:
## Date:

In [44]:
!pip install pandas numpy matplotlib seaborn statsmodels scikit-learn causalml linearmodels

In [43]:
# Load dataset
# pandas is imported in this cell because it sits above the notebook's import
# cell; without it a fresh "Restart & Run All" stops here with a NameError on `pd`.
import pandas as pd

file_path = "crime_data.csv"  # resolved relative to the notebook's working directory
data = pd.read_csv(file_path)

## Exploratory Data Analysis

In [22]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm

In [44]:
# Display basic info (info() prints its report itself)
data.info()

# Summary statistics. Printed explicitly: a notebook only auto-displays the
# LAST expression of a cell, so a bare `data.describe()` here would be discarded.
print(data.describe())

# Check for missing values (count of nulls per column)
print(data.isnull().sum())

In [42]:
# Drop categorical columns (City and Nighttime_Lighting_Index)
df = data.drop(columns=['City', 'Nighttime_Lighting_Index'])

# Verify data types
print(df.dtypes)

# Correlate numeric columns only. At this point the frame has not been coerced
# to numeric yet, and DataFrame.corr() raises on non-numeric columns in
# pandas >= 2.0 (older versions silently skipped them, which this reproduces).
corr_matrix = df.select_dtypes(include='number').corr()

# Visualize correlation matrix
plt.figure(figsize=(12, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt='.2f')
plt.title("Correlation Matrix")
plt.show()

In [42]:
# Force every column to a numeric dtype. With errors='coerce', values that
# cannot be parsed as numbers become NaN instead of raising.
df = df.apply(lambda column: pd.to_numeric(column, errors='coerce'))

# Report how many missing values each column now holds
missing_per_column = df.isnull().sum()
print(missing_per_column)

## Simple Linear Regression

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [42]:
# Simple Linear Regression: one predictor explaining one outcome.
# NOTE(review): 'independentvariable' / 'dependentvariable' look like template
# placeholders - replace them with real column names from df.
X = sm.add_constant(df[['independentvariable']])  # prepend the intercept column
y = df['dependentvariable']
model = sm.OLS(endog=y, exog=X).fit()

In [None]:
# Print a heading followed by the fitted model's summary table
print("\nSimple Linear Regression Results:", model.summary(), sep="\n")

In [None]:
# Scatter plot of outcome against predictor with a fitted regression line
plt.figure(figsize=(8, 6))
ax = sns.regplot(
    x=df['independentvariable'],
    y=df['dependentvariable'],
    scatter_kws=dict(alpha=0.5),   # semi-transparent points
    line_kws=dict(color='red'),    # red fitted line
)
# NOTE(review): the label and title strings below look like template
# placeholders - replace them with descriptive text.
ax.set_xlabel("X Label")
ax.set_ylabel("Y Label")
ax.set_title("Plot Title")
plt.show()

In [42]:
# Print a heading followed by the fitted model's summary table.
# NOTE(review): this cell prints the same summary as the cell placed before the
# regression plot - confirm whether the repetition is intentional.
print("\nSimple Linear Regression Results:", model.summary(), sep="\n")

## Multiple Regression

In [None]:
# Multiple Linear Regression: several predictors explaining one outcome.
# NOTE(review): both entries below are the same placeholder name, so the same
# column would be selected twice - replace them with distinct real column names.
predictor_names = ['independentvariable', 'independentvariable']
X = sm.add_constant(df[predictor_names])  # prepend the intercept column
y = df['dependentvariable']
model = sm.OLS(endog=y, exog=X).fit()

In [None]:
# Print a heading followed by the fitted model's summary table
print("\nMultiple Linear Regression Results:", model.summary(), sep="\n")

## Prediction Algorithms

In [None]:
# Single-feature prediction: violent crime rate from the unemployment rate
feature_names = ['Unemployment_Rate']
X = df[feature_names]
y = df['Violent_Crime_Rate']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit on the training rows (fit() returns the estimator itself).
# NOTE(review): this rebinds `model`, which earlier cells bind to a statsmodels
# OLS result - re-running those summary cells after this one will fail.
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows
y_pred = model.predict(X_test)

# Report test-set error and goodness of fit
test_mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {test_mse:.2f}")
test_r2 = r2_score(y_test, y_pred)
print(f"R² Score: {test_r2:.2f}")

In [44]:
# Multi-feature prediction: total crime rate from four socio-economic columns
feature_names = [
    'Unemployment_Rate',
    'Poverty_Rate',
    'Education_Level',
    'Gun_Ownership_Rate',
]
X = df[feature_names]
y = df['Total_Crime_Rate']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit on the training rows (fit() returns the estimator itself)
multi_model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows
y_pred_multi = multi_model.predict(X_test)

# Report test-set error and goodness of fit
multi_mse = mean_squared_error(y_test, y_pred_multi)
print(f"Mean Squared Error: {multi_mse:.2f}")
multi_r2 = r2_score(y_test, y_pred_multi)
print(f"R² Score: {multi_r2:.2f}")