In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import pandas as pd


# Fixed seed shared by the split and the forest so the reported metrics are
# reproducible under Restart Kernel -> Run All. Without it every run draws a
# different split and a different forest, and the numbers quoted in the
# commentary drift away from the displayed output.
RANDOM_STATE = 42

df = pd.read_csv('../data/cleaned/cleaned_missing_data.csv')

# Turn the state name into an integer code so it can be fed to the model.
# The codes carry no real ordering; that is tolerable for tree-based splits
# but would be misleading for a linear or distance-based model.
le = LabelEncoder()
df['state_encoded'] = le.fit_transform(df['state_name'])

# Features and target. TODO: add more features.
# NOTE(review): 'is_high_minor' may be derived from 'total_minors' (possibly
# together with 'total_missing'); if so, keeping 'total_minors' in X leaks the
# target and inflates the scores -- confirm how the flag is built upstream.
X = df[['total_minors', 'total_females', 'state_encoded']]
y = df['is_high_minor']  # Or trafficking proxy

# Stratify on the target: the positive class is only about 10% of the rows,
# so an unstratified 20% hold-out can end up with a noticeably different
# class ratio from one split to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

clf = RandomForestClassifier(random_state=RANDOM_STATE)
clf.fit(X_train, y_train)

# classification_report returns a preformatted string, so print() is the
# right way to render it (a bare expression would show the escaped repr).
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.97      0.99      0.98       628
        True       0.89      0.75      0.82        68

    accuracy                           0.97       696
   macro avg       0.93      0.87      0.90       696
weighted avg       0.97      0.97      0.97       696



| Metric        | What it means                                                | False (class 0)                                            | True (class 1)                                     |
| ------------- | ------------------------------------------------------------ | ---------------------------------------------------------- | -------------------------------------------------- |
| **Precision** | Of all cases predicted as this class, how many were correct? | 0.97 → Very few false positives for non-high-risk          | 0.89 → 89% of predicted high-risk were correct     |
| **Recall**    | Of all actual cases in this class, how many did we catch?    | 0.99 → Almost all non-high-risk cases were correctly found | 0.75 → Model missed 25% of actual high-risk cases  |
| **F1-score**  | Balance of precision and recall                              | 0.98 → Excellent for non-high-risk                         | 0.82 → Decent, but needs improvement for high-risk |
| **Support**   | Number of actual cases in that class                         | 628 non-high-risk                                          | 68 high-risk                                       |


Overall scores
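Accuracy (0.97) → 97% of predictions were correct overall. Because about 90% of the test cases are non-high-risk, this figure is dominated by the majority class and should be read alongside the high-risk recall.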

Macro avg (0.93 precision, 0.87 recall) → Unweighted mean of the per-class scores, so both classes count equally regardless of size.

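Weighted avg (0.97 precision, 0.97 recall) → Mean of the per-class scores, with each class weighted by its support (number of actual cases). Weighted recall is always equal to accuracy; weighted precision is not in general.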

In [3]:
# Preview the first five rows of the frame, including the 'state_encoded'
# column added during feature preparation.
df.head(n=5)

Unnamed: 0,id,year,state_name,state_code,district_name,district_code,registration_circles,male_below_5_years,male_5_to_14_years,male_14_to_18_years,...,trangender_18_to_30_years,trangender_30_to_45_years,trangender_45_to_60_years,transgender_60_years_and_above,total_missing,total_minors,is_high_minor,total_females,is_high_female,state_encoded
0,0,2017,Andhra Pradesh,28,Anantapur,502,Anantapur,1,26.0,34.0,...,0.0,0.0,0.0,0.0,795.0,297.0,False,581.0,True,1
1,1,2017,Andhra Pradesh,28,Chittoor,503,Chittoor,0,15.0,12.0,...,0.0,0.0,0.0,0.0,320.0,121.0,False,233.0,True,1
2,2,2017,Andhra Pradesh,28,Y.S.R.,504,Cuddapah,6,18.0,11.0,...,2.0,0.0,0.0,0.0,330.0,95.0,False,209.0,True,1
3,3,2017,Andhra Pradesh,28,East Godavari,505,East Godavari,0,0.0,49.0,...,0.0,0.0,0.0,0.0,668.0,167.0,False,397.0,True,1
4,4,2017,Andhra Pradesh,28,Anantapur,502,Guntakal Railway,0,0.0,1.0,...,0.0,0.0,0.0,0.0,12.0,1.0,False,3.0,False,1
