In [23]:
# Initial imports.
import pandas as pd
import matplotlib.pyplot as plt
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [24]:
# Read the cleaned OECD CSV from the Resources folder and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='Resources/converted_oecd_cleaned.csv')
df.head(n=5)

Unnamed: 0,Country,Country_Index,OECD,Child_marriage_Law,Child_marriage_Practice,Household_responsibilities_Law,Divorce_Law,Violence_against_women_Law,Female_genital_mutilation_Law,Reproductive_autonomy_Law,Secure_access_to_land_assets_Law,Access_to_non-land_assets_Law,Secure_access_to_formal_financial services_Law,Workplace_rights_Law,Citizenship_rights_Law,Political_voice_Law,Political_voice_Practice,Freedom_of_movement_Law,Access_to_justice_Law
0,Australia,0,1,0.5,0,0.5,0.0,0.75,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.5,28.7,0.0,0.0
1,Austria,1,1,0.5,1,0.5,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,34.4,0.0,0.0
2,Belgium,2,1,0.5,1,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38.0,0.0,0.0
3,Canada,3,1,0.75,1,0.5,0.0,0.25,0.25,0.0,0.5,0.0,0.0,0.0,0.0,0.5,27.0,0.0,0.0
4,Chile,4,1,0.5,1,0.75,0.25,0.75,1.0,0.5,0.75,1.0,0.0,1.0,0.0,0.0,22.6,0.0,0.0


In [25]:
# Remove Political_voice_Practice so it is not carried into the feature set.
# NOTE(review): in the preview this column holds values such as 28.7 while the
# other score columns stay within 0-1 - presumably why it is excluded; confirm.
df = df.drop(columns=["Political_voice_Practice"])

In [26]:
# Country and Country_Index label the row rather than score it, so both are
# removed in a single call before modelling.
df = df.drop(columns=["Country", "Country_Index"])
df

Unnamed: 0,OECD,Child_marriage_Law,Child_marriage_Practice,Household_responsibilities_Law,Divorce_Law,Violence_against_women_Law,Female_genital_mutilation_Law,Reproductive_autonomy_Law,Secure_access_to_land_assets_Law,Access_to_non-land_assets_Law,Secure_access_to_formal_financial services_Law,Workplace_rights_Law,Citizenship_rights_Law,Political_voice_Law,Freedom_of_movement_Law,Access_to_justice_Law
0,1,0.50,0,0.50,0.00,0.75,0.00,0.00,0.00,0.00,0.00,0.25,0.00,0.50,0.00,0.00
1,1,0.50,1,0.50,0.00,0.25,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.50,0.00,0.00
2,1,0.50,1,0.50,0.00,0.50,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,1,0.75,1,0.50,0.00,0.25,0.25,0.00,0.50,0.00,0.00,0.00,0.00,0.50,0.00,0.00
4,1,0.50,1,0.75,0.25,0.75,1.00,0.50,0.75,1.00,0.00,1.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,0,0.50,1,0.50,0.75,0.25,1.00,0.75,0.00,0.00,0.00,0.75,0.00,0.00,0.00,0.00
176,0,0.25,1,0.25,0.25,0.75,1.00,0.00,0.25,0.25,0.25,1.00,0.00,0.00,0.00,0.00
177,0,0.75,1,1.00,1.00,0.75,1.00,0.75,0.25,0.00,0.00,1.00,1.00,0.50,1.00,0.75
178,0,0.50,1,0.50,0.25,0.50,0.75,0.75,0.25,0.00,0.25,0.75,0.00,0.50,0.75,0.75


In [27]:
# Feature matrix: every remaining column except the target.
# DataFrame.drop returns a new frame, so `df` itself is left untouched.
X = df.drop(columns=["Child_marriage_Practice"])
X.head(n=5)

Unnamed: 0,OECD,Child_marriage_Law,Household_responsibilities_Law,Divorce_Law,Violence_against_women_Law,Female_genital_mutilation_Law,Reproductive_autonomy_Law,Secure_access_to_land_assets_Law,Access_to_non-land_assets_Law,Secure_access_to_formal_financial services_Law,Workplace_rights_Law,Citizenship_rights_Law,Political_voice_Law,Freedom_of_movement_Law,Access_to_justice_Law
0,1,0.5,0.5,0.0,0.75,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.5,0.0,0.0
1,1,0.5,0.5,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0
2,1,0.5,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0.75,0.5,0.0,0.25,0.25,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.0,0.0
4,1,0.5,0.75,0.25,0.75,1.0,0.5,0.75,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [28]:
# Define the target set.
# Series.ravel() is deprecated as of pandas 2.2; Series.to_numpy() yields the
# same 1-D NumPy array of labels without emitting a FutureWarning.
y = df["Child_marriage_Practice"].to_numpy()
y[:5]

array([0, 1, 1, 1, 1], dtype=int64)

In [29]:
# Hold out a test set. test_size=0.25 and shuffle=True are scikit-learn's
# defaults, written out here so the split proportions are visible to the reader.
# NOTE(review): the split is not stratified, and the recorded evaluation shows a
# test set of 7 class-0 vs 38 class-1 rows - consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=78,
)

In [30]:
# Standardise the features. The scaler is fitted on the training rows only, so
# its statistics come from the training split; fit() returns the scaler itself,
# which means `scaler` and `X_scaler` refer to the same fitted object.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the training-set scaling to both splits.
X_train_scaled, X_test_scaled = map(X_scaler.transform, (X_train, X_test))

In [31]:
# Random forest of 128 trees; the fixed random_state keeps runs repeatable.
rf_model = RandomForestClassifier(
    n_estimators=128,
    random_state=78,
)

In [32]:
# Train the forest on the scaled training split. The result is assigned back
# to rf_model, as before, which also keeps the cell from echoing a repr.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [33]:
# Predict class labels for the held-out (scaled) test rows.
predictions = rf_model.predict(X=X_test_scaled)

In [34]:
# Calculating the confusion matrix.
# Pin the class order explicitly: without `labels`, confusion_matrix sizes its
# output from the classes that actually occur in y_test/predictions, so a split
# containing a single class would produce a 1x1 matrix and break the 2x2
# row/column labels used below.
class_labels = [0, 1]
cm = confusion_matrix(y_test, predictions, labels=class_labels)

# Create a DataFrame from the confusion matrix (rows = actual, columns = predicted).
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,0,7
Actual 1,1,37


In [35]:
# Fraction of test rows whose predicted label matches the true label.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [36]:
# Summarise the evaluation: confusion matrix, accuracy, then per-class metrics.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
# Heading and report are printed on consecutive lines via sep="\n".
print("Classification Report", classification_report(y_test, predictions), sep="\n")

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,0,7
Actual 1,1,37


Accuracy Score : 0.8222222222222222
Classification Report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         7
           1       0.84      0.97      0.90        38

    accuracy                           0.82        45
   macro avg       0.42      0.49      0.45        45
weighted avg       0.71      0.82      0.76        45

