In [50]:
# Imports
import os
import warnings
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

warnings.simplefilter(action='ignore', category=DeprecationWarning)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


In [51]:
def column_select(df, columns_to_select):
    selected_columns = []
    for column_name in columns_to_select:
        # Check if the column exists in the DataFrame
        if column_name in df.columns:
            # If the column exists, select it and add it to the list of selected columns
            selected_columns.append(column_name)
        else:
            # If the column does not exist, issue a warning
            print(f"Column '{column_name}' does not exist in the DataFrame.")

    # Select the columns from the DataFrame
    return df[selected_columns]


In [52]:
taluker = ["hc31", "hc61", "hv270", "v025"]                                                                             # 2014
# ha40 is bmi which is used as target feature
islam = ["v012", "v025", "hv270", "v106", "v404", "v714","v213", "v201", "v701", "v113", "v116","v130", 'v161', 'ha40']        # 2014

# dropped ha66, always NaN




In [53]:
BDHS_14_child = pd.read_stata("./data/2014/BDKR72FL.DTA")
BDHS_14_household = pd.read_stata("./data/2014/BDHR72FL.DTA")
BDHS_14_individual = pd.read_stata("./data/2014/BDIR72FL.DTA")
BDHS_14_hh_member = pd.read_stata("./data/2014/BDPR72FL.DTA")
BDHS_14 = pd.concat([BDHS_14_child,BDHS_14_household, BDHS_14_individual, BDHS_14_hh_member], axis =1)


In [54]:
# Define the bins and labels
bins = [0, 1850, 2490, float('inf')]
labels = ['underweight', 'normal', 'obese']

age_bins = [15, 20, 30, 40, 50]
age_labels = ['15-19', '20-29', '30-39', '40-49']

wealth_bins = [-float('inf'), 'poorer', 'middle', 'richer', float('inf')]
wealth_labels = ['poor', 'middle', 'rich']

children_bins = [-1, 0, 2, 4, 20]
children_labels = ['None', '1-2', '3-4', '5 or more']



In [55]:
islam_df = column_select(BDHS_14, islam)
islam_df = islam_df.loc[:, ~islam_df.columns.duplicated()]
islam_df['ha40'] = pd.to_numeric(islam_df['ha40'], errors='coerce')
islam_df['label'] = pd.cut(islam_df['ha40'], bins=bins, labels=labels, right=False)
islam_df['v012'] = pd.cut(islam_df['v012'], bins=age_bins, labels=age_labels, right=False)
#islam_df['hv270'] = pd.cut(islam_df['hv270'], bins=wealth_bins, labels=wealth_labels, right=False)
islam_df['v201'] = pd.cut(islam_df['v201'], bins=children_bins, labels=children_labels, right=False)


threshold = int(0.3 * len(islam_df.columns))
islam_df.dropna(thresh=threshold, inplace=True)

islam_df = islam_df[islam_df['label'].notna()]
islam_df = islam_df.drop('ha40', axis=1)
print(islam_df)

       v012   v025    hv270          v106 v404 v714          v213       v201  \
1     20-29  rural   middle  no education   no  yes  no or unsure        3-4   
7     20-29  rural  poorest       primary  yes   no  no or unsure        3-4   
10    20-29  rural   poorer     secondary  yes   no  no or unsure        3-4   
11    20-29  rural  poorest     secondary  yes   no  no or unsure        3-4   
21    40-49  rural  poorest  no education   no  yes  no or unsure  5 or more   
26    20-29  rural   poorer     secondary   no   no           yes        1-2   
29    30-39  rural  poorest  no education  yes   no  no or unsure  5 or more   
35    40-49  rural   middle     secondary   no   no  no or unsure  5 or more   
38    20-29  rural   poorer        higher   no   no  no or unsure        1-2   
42    30-39  rural  poorest       primary   no   no  no or unsure        3-4   
44    20-29  rural   poorer       primary   no   no  no or unsure        3-4   
47    20-29  rural   poorer  no educatio

In [56]:
islam_df.dtypes

v012     category
v025     category
hv270    category
v106     category
v404     category
v714     category
v213     category
v201     category
v701     category
v113     category
v116     category
v130     category
v161     category
label    category
dtype: object

In [57]:
islam_df['v113'].unique()

['tube well or borehole', 'river/dam/lake/ponds/stream/canal/irrigation ..., 'not a dejure resident', 'public tap/standpipe', 'rainwater', ..., 'piped into dwelling', 'piped to yard/plot', 'bottled water', 'unprotected well', 'unprotected spring']
Length: 12
Categories (15, object): ['piped into dwelling' < 'piped to yard/plot' < 'public tap/standpipe' < 'tube well or borehole' ... 'cart with small tank' < 'bottled water' < 'other' < 'not a dejure resident']

In [58]:

# Define the parameter grid to search over
param_grid = {
    'n_estimators': [500],  # Number of trees in the forest
    'max_depth': [ 8,9, 10, 11,12, None],       # Maximum depth of the tree
    #'min_samples_split': [2, 5, 10],   # Minimum number of samples required to split an internal node
    #'min_samples_leaf': [ 2, 4,8, 16]      # Minimum number of samples required to be at a leaf node
}

In [59]:
from sklearn.feature_extraction import FeatureHasher


y = islam_df['label']
X = islam_df.drop(['label'], axis=1)

X = X.astype(str)

# Get the indices of the categorical columns
categorical_indices = [i for i, col in enumerate(X.columns) if X[col].dtype.name == 'object']
#print(categorical_indices)

# Define the transformer for one-hot encoding
#ct = ColumnTransformer([('encoder', OneHotEncoder(), categorical_indices)])

# Fit and transform the data
#X_encoded = ct.fit_transform(X)


hasher = FeatureHasher(n_features=10, input_type='string')
X_encoded = hasher.fit_transform(X.values)


# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Further split train set into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2







# Initialize the RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the GridSearchCV instance to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

# Predict on the test set
y_pred = grid_search.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print("Accuracy:", accuracy)

Best Parameters: {'max_depth': 8, 'n_estimators': 500}
Best Score: 0.6081979635330335
Accuracy: 0.6081871345029239
