In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error

In [3]:
csv_air = pd.read_csv('Air_Quality_Index.csv')
csv_road = pd.read_csv('Road_Condition_Compressed.csv')
csv_crime = pd.read_csv('crime_filtered.csv')

In [4]:
air_cols_to_read = ['region','air_quality_PM2.5','air_quality_PM10']
df_air_raw = pd.read_csv('Air_Quality_Index.csv', usecols=air_cols_to_read)
df_air_raw['region'] = df_air_raw['region'].str.upper().str.strip()
name_corrections = {    
    'ANDAMAN AND NICOBAR ISLANDS' : 'A & N ISLANDS',
    'DADRA AND NAGAR HAVELI' : 'D & N HAVELI',
    'DAMAN AND DIU' : 'DAMAN & DIU'
}
df_air_raw['region'] = df_air_raw['region'].replace(name_corrections)
df_air = df_air_raw.groupby('region').mean().reset_index()
df_air['region'] = df_air['region'].str.upper()
print(df_air)


               region  air_quality_PM2.5  air_quality_PM10
0       A & N ISLANDS           3.400000          6.000000
1      ANDHRA PRADESH          19.714706         22.944118
2   ARUNACHAL PRADESH           2.500000          2.850000
3               ASSAM          31.047826         34.691304
4               BIHAR          43.379310         51.458621
5          CHANDIGARH         140.200000        159.900000
6        CHHATTISGARH          21.532258         25.396774
7        D & N HAVELI           9.900000         17.600000
8         DAMAN & DIU          12.150000         23.300000
9               DELHI          30.900000        116.800000
10                GOA           4.900000          6.100000
11            GUJARAT          12.221739         23.756522
12            HARYANA          46.630000        135.230000
13   HIMACHAL PRADESH          24.680000         28.640000
14  JAMMU AND KASHMIR          29.015000         37.005000
15          JHARKHAND          32.705556         41.1944

In [5]:
csv_road.rename(columns={'State/ UT': 'region'}, inplace=True)
csv_road['total_accidents'] = csv_road['Surfaced Roads-Accident - 2014'] + csv_road['Rutted/Pot holes-Accident - 2014']
df_road = csv_road[['region', 'total_accidents']]
df_road['region'] = df_road['region'].str.upper().str.strip()
print(df_road)


               region  total_accidents
0      ANDHRA PRADESH          14128.0
1   ARUNACHAL PRADESH            109.0
2               ASSAM           4965.0
3               BIHAR           6484.0
4        CHHATTISGARH           7362.0
5                 GOA           3810.0
6             GUJARAT          18402.0
7             HARYANA           7972.0
8    HIMACHAL PRADESH            818.0
9     JAMMU & KASHMIR           2395.0
10          JHARKHAND           3308.0
11          KARNATAKA          32056.0
12             KERALA          33448.0
13     MADHYA PRADESH          41665.0
14        MAHARASHTRA          49385.0
15            MANIPUR            229.0
16          MEGHALAYA            542.0
17            MIZORAM             13.0
18           NAGALAND            249.0
19             ORISSA           6456.0
20             PUNJAB           5728.0
21          RAJASTHAN          24296.0
22             SIKKIM             44.0
23         TAMIL NADU          49992.0
24          TELANGANA    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_road['region'] = df_road['region'].str.upper().str.strip()


In [6]:
csv_crime['state_ut'] = csv_crime['state_ut'].str.upper()
name_corrections = {
    'UTTARAKHAND': 'UTTARAKHAND',
    'UTTARPRADESH': 'UTTAR PRADESH'
}
csv_crime['state_ut'] = csv_crime['state_ut'].replace(name_corrections)
crime_cols = [
    '01_District_wise_crimes_committed_IPC_2001_2012_total_ipc_crimes',
    '01_District_wise_crimes_committed_IPC_2013_total_ipc_crimes'
]
csv_crime[crime_cols] = csv_crime[crime_cols].fillna(0)
df_maxes = csv_crime.groupby('state_ut')[crime_cols].max().reset_index()
df_maxes['total_crimes'] = df_maxes[crime_cols[0]] + df_maxes[crime_cols[1]]
df_crime_agg = df_maxes[['state_ut', 'total_crimes']]
df_crime_agg.rename(columns={'state_ut': 'region'}, inplace=True)
df_crime_agg['region'] = df_crime_agg['region'].str.title()
df_crime_agg['region'] = df_crime_agg['region'].str.upper().str.strip()
print(df_crime_agg)



               region  total_crimes
0       A & N ISLANDS        1592.0
1      ANDHRA PRADESH      410537.0
2   ARUNACHAL PRADESH        5215.0
3               ASSAM      164868.0
4               BIHAR      314069.0
5          CHANDIGARH        8008.0
6        CHHATTISGARH      114088.0
7        D & N HAVELI         756.0
8         DAMAN & DIU         534.0
9            DELHI UT      138147.0
10                GOA        7920.0
11            GUJARAT      287556.0
12            HARYANA      134578.0
13   HIMACHAL PRADESH       28062.0
14    JAMMU & KASHMIR       49998.0
15          JHARKHAND       89154.0
16          KARNATAKA      279011.0
17             KERALA      348471.0
18        LAKSHADWEEP         174.0
19     MADHYA PRADESH      447788.0
20        MAHARASHTRA      442553.0
21            MANIPUR        6915.0
22          MEGHALAYA        6014.0
23            MIZORAM        5165.0
24           NAGALAND        2450.0
25             ODISHA      139675.0
26         PUDUCHERRY       

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_crime_agg.rename(columns={'state_ut': 'region'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_crime_agg['region'] = df_crime_agg['region'].str.title()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_crime_agg['region'] = df_crime_agg['region'].str.upper().str.strip()


In [7]:
master_df = pd.merge(df_air, df_road, on="region")
master_df = pd.merge(master_df, df_crime_agg, on="region")

print("\n" + "="*50)
print("✅ --- FINAL MERGED DATASET ---")
print(master_df.head())


✅ --- FINAL MERGED DATASET ---
              region  air_quality_PM2.5  air_quality_PM10  total_accidents  \
0      A & N ISLANDS           3.400000          6.000000            218.0   
1     ANDHRA PRADESH          19.714706         22.944118          14128.0   
2  ARUNACHAL PRADESH           2.500000          2.850000            109.0   
3              ASSAM          31.047826         34.691304           4965.0   
4              BIHAR          43.379310         51.458621           6484.0   

   total_crimes  
0        1592.0  
1      410537.0  
2        5215.0  
3      164868.0  
4      314069.0  


In [8]:
#Define Weights
weights = {
    'air_quality': 0.4,
    'total_accidents': 0.3,
    'total_crimes': 0.3
}

In [9]:
master_df['N_PM2_5'] = (master_df['air_quality_PM2.5'] / 30).clip(upper=1)
master_df['N_PM10'] = (master_df['air_quality_PM10'] / 50).clip(upper=1)
master_df['PollutionIndex'] = 0.6 * master_df['N_PM2_5'] + 0.4 * master_df['N_PM10']
master_df['air_safety_score'] = (1 - master_df['PollutionIndex']) * 100

#Road Condition Score
total_accidents_sum = master_df['total_accidents'].sum()
master_df['road_score'] = (1 - (master_df['total_accidents'] / total_accidents_sum)) * 100

#Crime Score
total_crimes_sum = master_df['total_crimes'].sum()
master_df['crime_score'] = (1 - (master_df['total_crimes'] / total_crimes_sum)) * 100
print(master_df.head())

              region  air_quality_PM2.5  air_quality_PM10  total_accidents  \
0      A & N ISLANDS           3.400000          6.000000            218.0   
1     ANDHRA PRADESH          19.714706         22.944118          14128.0   
2  ARUNACHAL PRADESH           2.500000          2.850000            109.0   
3              ASSAM          31.047826         34.691304           4965.0   
4              BIHAR          43.379310         51.458621           6484.0   

   total_crimes   N_PM2_5    N_PM10  PollutionIndex  air_safety_score  \
0        1592.0  0.113333  0.120000        0.116000         88.400000   
1      410537.0  0.657157  0.458882        0.577847         42.215294   
2        5215.0  0.083333  0.057000        0.072800         92.720000   
3      164868.0  1.000000  0.693826        0.877530         12.246957   
4      314069.0  1.000000  1.000000        1.000000          0.000000   

   road_score  crime_score  
0   99.934650    99.966413  
1   95.764861    91.338759  
2   9

In [10]:
master_df['risk_score'] = (
    (master_df['air_safety_score'] * weights['air_quality']) +
    (master_df['road_score'] * weights['total_accidents']) +
    (master_df['crime_score'] * weights['total_crimes'])
)
print(master_df.head())

              region  air_quality_PM2.5  air_quality_PM10  total_accidents  \
0      A & N ISLANDS           3.400000          6.000000            218.0   
1     ANDHRA PRADESH          19.714706         22.944118          14128.0   
2  ARUNACHAL PRADESH           2.500000          2.850000            109.0   
3              ASSAM          31.047826         34.691304           4965.0   
4              BIHAR          43.379310         51.458621           6484.0   

   total_crimes   N_PM2_5    N_PM10  PollutionIndex  air_safety_score  \
0        1592.0  0.113333  0.120000        0.116000         88.400000   
1      410537.0  0.657157  0.458882        0.577847         42.215294   
2        5215.0  0.083333  0.057000        0.072800         92.720000   
3      164868.0  1.000000  0.693826        0.877530         12.246957   
4      314069.0  1.000000  1.000000        1.000000          0.000000   

   road_score  crime_score  risk_score  
0   99.934650    99.966413   95.330319  
1   95.764

In [11]:
features = ['air_quality_PM2.5','air_quality_PM10', 'total_accidents', 'total_crimes']
target = 'risk_score'
X = master_df[features]
y = master_df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

print("--- Model Test Results ---")

y_pred = model.predict(X_test)


r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"R-squared (R²) Score: {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")

--- Model Test Results ---
R-squared (R²) Score: 0.93
Mean Absolute Error (MAE): 3.31


In [12]:
head_data = master_df.head()

X_head = head_data[features]
y_actual_head = head_data[target]

y_predicted_head = model.predict(X_head)

residuals = y_actual_head - y_predicted_head

results_df = pd.DataFrame({
    'region': head_data['region'],
    'Actual Score': y_actual_head,
    'Predicted Score': y_predicted_head,
    'Residual': residuals
})

print("--- Residuals for the First 5 States ---")
print(results_df.round(2))

--- Residuals for the First 5 States ---
              region  Actual Score  Predicted Score  Residual
0      A & N ISLANDS         95.33            87.62      7.71
1     ANDHRA PRADESH         73.02            67.90      5.12
2  ARUNACHAL PRADESH         97.05            88.00      9.05
3              ASSAM         63.41            74.58    -11.17
4              BIHAR         57.43            63.77     -6.34
