In [None]:
# Cell 1: Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

In [None]:
# Cell 2: Load Data
# Read the training features, the training labels and the unlabeled final
# test features from CSV files located in the current working directory.
X_train_full = pd.read_csv("X_train_income.csv")
Y_train_full = pd.read_csv("Y_train_income.csv")
X_test_final = pd.read_csv("X_test_income.csv")

# Report the dimensions of each frame, one per line.
shape_lines = [
    f"X_train shape: {X_train_full.shape}",
    f"Y_train shape: {Y_train_full.shape}",
    f"X_test shape: {X_test_final.shape}",
]
print("Data loaded successfully!", *shape_lines, sep="\n")

In [None]:
# Cell 3: Explore Data (Optional)
# Quick look at the raw training data: a sample of rows, the dtype pandas
# inferred for every column, and how often each value of the first label
# column occurs.
print("\nFirst few rows of X_train:", X_train_full.head(), sep="\n")
print("\nColumn types:", X_train_full.dtypes, sep="\n")
print("\nTarget distribution:", Y_train_full.iloc[:, 0].value_counts(), sep="\n")

In [None]:
# Cell 4: Preprocessing
# Builds `data` (features + target, fully numeric) and `label_encoders`
# (one fitted LabelEncoder per categorical feature, reused later for the
# final test set).

# Attach the first label column to a copy of the features as 'income'.
data = X_train_full.assign(income=Y_train_full.iloc[:, 0].values)

print(f"Original data shape: {data.shape}")

# STEP 1: '?' is a placeholder for a missing value -> make it a real NA
data = data.replace(to_replace='?', value=pd.NA)

# STEP 2: Split columns by dtype; the target is kept out of the categorical
# list because it is encoded on its own in STEP 5.
numerical_cols = list(data.select_dtypes(include=['int64', 'float64']).columns)
categorical_cols = [
    name
    for name in data.select_dtypes(include=['object']).columns
    if name != 'income'
]

print(f"Numerical columns ({len(numerical_cols)}): {numerical_cols}")
print(f"Categorical columns ({len(categorical_cols)}): {categorical_cols}")

# STEP 3: Integer-encode every categorical feature. Missing entries become
# the literal category 'missing' so they receive their own code.
label_encoders = {}
for feature in categorical_cols:
    encoder = LabelEncoder()
    as_text = data[feature].fillna('missing').astype(str)
    data[feature] = encoder.fit_transform(as_text)
    label_encoders[feature] = encoder
    print(f"Encoded {feature}")

# STEP 4: Impute numerical gaps with the column median (only columns that
# actually contain missing values are touched or reported).
for feature in numerical_cols:
    column = data[feature]
    if column.isna().any():
        fill_value = column.median()
        data[feature] = column.fillna(fill_value)
        print(f"Filled {feature} missing values with median: {fill_value}")

# STEP 5: Encode the target when it is stored as text
if data['income'].dtype == 'object':
    le_target = LabelEncoder()
    data['income'] = le_target.fit_transform(data['income'].astype(str))
    print(f"Encoded income. Classes: {le_target.classes_}")

# STEP 6: Sanity check - no object-typed column should be left
object_cols_remaining = list(data.select_dtypes(include=['object']).columns)
if object_cols_remaining:
    print(f"WARNING: Still have object columns: {object_cols_remaining}")
else:
    print("All columns are now numeric!")

print(f"\nFinal data types:\n{data.dtypes.value_counts()}")

In [None]:
# Cell 5: Train/Test Split
# Separate features from the target and carve out a stratified 20% hold-out
# set used for evaluation in Cell 7.
# (train_test_split comes from the import cell at the top of the notebook;
# it is deliberately not re-imported here.)
X = data.drop('income', axis=1)
y = data['income']

# Verify X is all numeric before splitting
print(f"X shape: {X.shape}")
print(f"X dtypes:\n{X.dtypes.value_counts()}")

# Check for any non-numeric data (computed once, reused for the report)
non_numeric_cols = X.select_dtypes(include=['object']).columns.tolist()
if non_numeric_cols:
    print("ERROR: X still contains non-numeric columns")
    print(non_numeric_cols)
else:
    print("X is all numeric")

# stratify=y keeps the class proportions identical in both partitions;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print(f"\nTraining set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

In [None]:
# Cell 6: Train Random Forest
# Hyperparameters are collected in one place and unpacked into the estimator.
rf_params = {
    "n_estimators": 100,
    "random_state": 42,
    "max_depth": 10,
    "n_jobs": -1,  # Use all CPU cores
}
rf_model = RandomForestClassifier(**rf_params)

# Fit on the training partition produced by the split in Cell 5.
print("Training Random Forest...")
rf_model.fit(X_train, y_train)
print("Training complete!")

In [None]:
# Cell 7: Evaluate Model
# Predict on the hold-out partition and summarise the per-class metrics.
y_pred_rf = rf_model.predict(X_test)
report = classification_report(y_test, y_pred_rf)

print("Classification Report:", report, sep="\n")

# Persist the same text report next to the notebook.
with open("RF_report.txt", "w") as report_file:
    report_file.write(report)
print("Report saved to RF_report.txt")

In [None]:
# Cell 8: Preprocess Final Test Set
# Apply to the final test features the SAME transformations the training
# features went through in Cell 4, so the model sees consistently encoded
# inputs:
#   1. '?' is treated as a missing value;
#   2. missing categorical values become the category 'missing' and every
#      value is compared as a string, exactly as the encoders were fitted;
#   3. missing numerical values are filled with the TRAINING medians, not
#      with statistics computed from the test set itself.
# Categories never seen during training are encoded as -1.
X_test_final_processed = X_test_final.replace('?', pd.NA)

for col, le in label_encoders.items():
    if col not in X_test_final_processed.columns:
        continue
    # Build the label -> code lookup once per column instead of calling
    # le.transform() for every single row.
    class_to_code = {label: code for code, label in enumerate(le.classes_)}
    as_text = X_test_final_processed[col].fillna('missing').astype(str)
    # Series.map leaves unknown labels as NaN; those become the -1 sentinel.
    X_test_final_processed[col] = (
        as_text.map(class_to_code).fillna(-1).astype(int)
    )

# Medians of the columns Cell 4 treats as numerical, taken from the training
# features. Columns absent from this Series are left untouched by fillna.
train_medians = X_train_full.select_dtypes(
    include=['int64', 'float64']
).median()
X_test_final_processed = X_test_final_processed.fillna(train_medians)

print("Final test set preprocessed!")

In [None]:
# Cell 9: Make and Save Predictions
# Score the preprocessed final test set and write one prediction per line
# (no header, no index column) to y_predict_RF.csv.
# NOTE(review): predictions are written exactly as the model emits them. If
# Cell 4 label-encoded 'income', these are integer codes rather than the
# original labels - confirm which form the consumer of this file expects.
predictions = rf_model.predict(X_test_final_processed)

prediction_frame = pd.DataFrame(predictions)
prediction_frame.to_csv('y_predict_RF.csv', header=False, index=False)

print("Predictions saved to y_predict_RF.csv")
print(f"Number of predictions: {len(predictions)}")
print(f"Prediction distribution: {pd.Series(predictions).value_counts()}")