In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv('merged_dataset2022.csv')  # Replace with the actual file path if needed
# Drop the standardized score column
df = df.drop(columns=['mhi5_std_score_2022'])

# Define target and features
target_col = 'mhi5_class_2022'
X = df.drop(columns=[target_col])
y = df[target_col]

print('-------')

print(y.isna().sum())

df = df.dropna(subset=[target_col])  # Remove NaNs from df
y = df[target_col]  # Update y to match cleaned df

print(y.isna().sum())


-------
1
0


In [19]:

# Stratified train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Training set size:", X_train.shape[0])
print("Test set size:", X_test.shape[0])
print("Number of features before encoding:", X_train.shape[1])

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Identify numeric and categorical columns
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Remove the target column from lists if present (already dropped in X, but just in case)
if target_col in numeric_cols:
    numeric_cols.remove(target_col)
if target_col in categorical_cols:
    categorical_cols.remove(target_col)

# Define the column transformer for scaling and encoding
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse=False), categorical_cols)
    ]
)

# Fit the transformer on the training data and transform both train and test sets
X_train_enc = preprocessor.fit_transform(X_train)
X_test_enc = preprocessor.transform(X_test)

# Get the feature names after transformation (for interpretation)
num_features = numeric_cols  # scaled numeric features retain original names
# OneHotEncoder can provide new names for each category level
cat_encoder = preprocessor.named_transformers_['cat']
cat_features = cat_encoder.get_feature_names_out(categorical_cols)
# Combine numeric and encoded categorical feature names
feature_names = list(num_features) + list(cat_features)

print("Training set shape after encoding:", X_train_enc.shape)
print("Test set shape after encoding:", X_test_enc.shape)
print("Example feature names after encoding:", feature_names[:5], "...")





ValueError: Found input variables with inconsistent numbers of samples: [995, 994]

In [None]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Train a Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_enc, y_train)

# Get feature importances
importances = rf.feature_importances_
# Get indices of features sorted by importance (descending)
indices = np.argsort(importances)[::-1]

# Select top 20 features (or fewer if total features < 20)
top_n = 20 if len(feature_names) >= 20 else len(feature_names)
top_indices = indices[:top_n]
selected_feats_rf = [feature_names[i] for i in top_indices]

# Save the selected features to a CSV file
pd.DataFrame(selected_feats_rf, columns=["feature"]).to_csv("selected_features_rf.csv", index=False)

# Display the top selected features
print("Top {} features selected by Random Forest:".format(len(selected_feats_rf)))
for feat in selected_feats_rf:
    print("-", feat)





In [None]:
from sklearn.linear_model import LogisticRegression

# Train a Logistic Regression with L1 penalty (LASSO)
lasso = LogisticRegression(penalty='l1', solver='liblinear', C=0.1, max_iter=1000, random_state=42)
lasso.fit(X_train_enc, y_train)

# Get the coefficients of the model
coeffs = lasso.coef_.flatten()  # flatten to 1D array
# Identify features with non-zero coefficients
selected_feats_lasso = [feature_names[i] for i, coef in enumerate(coeffs) if coef != 0.0]

# Save the selected features to a CSV file
pd.DataFrame(selected_feats_lasso, columns=["feature"]).to_csv("selected_features_lasso.csv", index=False)

# Display the selected features
print("Features selected by LASSO (non-zero coefficients):")
for feat in selected_feats_lasso:
    print("-", feat)
print(f"Total features selected by LASSO: {len(selected_feats_lasso)}")