In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.impute import SimpleImputer

# Load the dataset without automatic categorical conversion
df = pd.read_stata('C:/dhs data/datasets/ZMIR71DT/ZMIR71FL.DTA', convert_categoricals=False)

# Combine b5_01 to b5_20 columns into a binary 'child_alive' column
b5_columns = [f'b5_{i:02d}' for i in range(1, 21)]
df['child_alive'] = df[b5_columns].apply(lambda x: x.any(), axis=1).astype(int)

# Select only numeric columns for correlation
numeric_df = df.select_dtypes(include=['number'])

# Check top correlations with 'child_alive'
correlations = numeric_df.corr()['child_alive'].sort_values(ascending=False).head(10)
print("Top Correlations with 'child_alive':\n", correlations)

# Select the top correlated features for the model (excluding 'child_alive')
top_features = ['s1205', 'v220', 'v219', 'v218', 'v502', 'v224', 'v201', 'v535', 'v525']

# Define the feature matrix X and the target variable y
X = df[top_features]
y = df['child_alive']

# Handle missing values by filling them with the median of each column
imputer = SimpleImputer(strategy='median')
X = imputer.fit_transform(X)  # Impute the missing values in X

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. Logistic Regression Model
lr_model = LogisticRegression(max_iter=1000)  # Increased max_iter to avoid convergence issues
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Evaluate Logistic Regression
print("\nLogistic Regression Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Precision:", precision_score(y_test, y_pred_lr))
print("Recall:", recall_score(y_test, y_pred_lr))
print("AUC:", roc_auc_score(y_test, y_pred_lr))

# 2. Random Forest Model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Evaluate Random Forest
print("\nRandom Forest Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Precision:", precision_score(y_test, y_pred_rf))
print("Recall:", recall_score(y_test, y_pred_rf))
print("AUC:", roc_auc_score(y_test, y_pred_rf))

# Optionally, you can add more models if needed


  df['child_alive'] = df[b5_columns].apply(lambda x: x.any(), axis=1).astype(int)


Top Correlations with 'child_alive':
 child_alive    1.000000
s1205          0.973329
v220           0.672344
v219           0.622415
v218           0.621654
v502           0.617962
v224           0.603842
v201           0.603842
v535           0.589213
v525           0.571609
Name: child_alive, dtype: float64

Logistic Regression Results:
Accuracy: 0.999269272926562
Precision: 0.9990234375
Recall: 1.0
AUC: 0.9985528219971057

Random Forest Results:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
AUC: 1.0
