In [None]:
# Cell 1: Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

In [None]:
# Cell 2: Load Data
# Read the training features, the training labels and the unlabeled
# final test features from CSV files in the working directory.
X_train_full, Y_train_full, X_test_final = (
    pd.read_csv(csv_path)
    for csv_path in (
        "X_train_income.csv",
        "Y_train_income.csv",
        "X_test_income.csv",
    )
)

# Quick sanity check on what was loaded.
print("Data loaded successfully!")
print(f"X_train shape: {X_train_full.shape}")
print(f"Y_train shape: {Y_train_full.shape}")
print(f"X_test shape: {X_test_final.shape}")

In [None]:
# Cell 3: Preprocessing (Same as RF)
#
# Produces, for the later cells:
#   data            - training features + 'income' target, all label-encoded / imputed
#   categorical_cols, numerical_cols - feature column names by kind
#   label_encoders  - {column name: fitted LabelEncoder} for the categorical features
#   le_target       - fitted LabelEncoder for the target (only when the target is object-typed)

# Work on a fresh frame: the features plus the first column of Y as 'income'.
data = X_train_full.assign(income=Y_train_full.iloc[:, 0].values)

print(f"Original data shape: {data.shape}")
print(f"Original dtypes:\n{data.dtypes}\n")

# Step 1: Missing values come first (before any encoding).
# '?' entries are turned into NA so the steps below treat them as missing.
data = data.replace('?', pd.NA)

# Step 2: Split the columns into categorical (object) and numerical.
# The target is left out of the categorical list; it is encoded in Step 5.
categorical_cols = [
    name
    for name in data.select_dtypes(include=['object']).columns
    if name != 'income'
]
numerical_cols = list(data.select_dtypes(include=['int64', 'float64']).columns)

print(f"Categorical columns: {categorical_cols}")
print(f"Numerical columns: {numerical_cols}\n")

# Step 3: Label-encode every categorical feature.
# Missing entries become their own 'missing' category, and everything is
# cast to str so the encoder sees a single, uniform type.
label_encoders = {}
for col in categorical_cols:
    print(f"Encoding {col}...")
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].fillna('missing').astype(str))
    label_encoders[col] = le  # kept so the final test set can be encoded later

# Step 4: Median-impute numerical columns that actually have gaps.
for col in numerical_cols:
    if not data[col].isna().any():
        continue
    median_val = data[col].median()
    data[col] = data[col].fillna(median_val)
    print(f"Filled {col} with median: {median_val}")

# Step 5: Encode the target, but only when it is stored as strings/objects.
print("\nEncoding target variable 'income'...")
if data['income'].dtype == 'object':
    le_target = LabelEncoder()
    data['income'] = le_target.fit_transform(data['income'].astype(str))
    print(f"Income classes: {le_target.classes_}")

# Step 6: Confirm nothing non-numeric is left.
leftover_object_cols = data.select_dtypes(include=['object']).columns.tolist()
print(f"\nFinal dtypes:\n{data.dtypes}")
print(f"\nAny remaining object columns? {leftover_object_cols}")

print("\nPreprocessing complete!")


In [None]:
# Cell 4: Train/Test Split
# Separate the target from the features.
y = data['income']
X = data.drop(columns=['income'])

# Hold out 20% of the rows for validation. The split is stratified on the
# target so both parts keep the same class balance, and seeded so it is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

In [None]:
# Cell 5: Train Decision Tree (ID3 = entropy)
# criterion='entropy' selects splits by information gain, the measure ID3 uses.
# NOTE: scikit-learn's tree implementation is CART-based, so this is an
# ID3-style criterion rather than a literal ID3 implementation.
# max_depth=10 caps the depth of the tree; random_state=42 makes the fit
# reproducible.
print("Training Decision Tree (ID3)...")
dt_model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)  # fit() returns the fitted estimator itself
print("Training complete!")

In [None]:
# Cell 6: Evaluate Model
# Score the fitted tree on the held-out split from Cell 4.
y_pred_dt = dt_model.predict(X_test)
report = classification_report(y_test, y_pred_dt)

print("Classification Report:", report, sep="\n")

# Persist the same text report next to the notebook.
with open("ID3_report.txt", "w") as report_file:
    report_file.write(report)
print("Report saved to ID3_report.txt")

In [None]:

# Cell 7: Preprocess Final Test Set
#
# The final test set must go through the SAME transformations as the training
# data (Cell 3), otherwise an encoded value means something different at
# prediction time than it did during fitting:
#   1. '?' and NaN in categorical columns become the 'missing' category,
#   2. categorical values are cast to str before the label lookup,
#   3. categories never seen during training get the sentinel code -1,
#   4. numeric gaps are filled with the TRAINING medians, not the test set's own.
X_test_final_processed = X_test_final.copy()

UNSEEN_CODE = -1  # code assigned to categories the encoder was not fitted on

for col, le in label_encoders.items():
    if col not in X_test_final_processed.columns:
        continue

    raw = X_test_final_processed[col]
    as_text = raw.astype(str)
    # Same notion of "missing" as in training: real NaN/NA or the '?' marker.
    is_missing = raw.isna() | as_text.eq('?')
    cleaned = as_text.mask(is_missing, 'missing')

    # LabelEncoder.transform(x) is the position of x in le.classes_, so one
    # dict lookup per column replaces a transform() call per row.
    code_by_label = {str(label): code for code, label in enumerate(le.classes_)}
    X_test_final_processed[col] = (
        cleaned.map(code_by_label).fillna(UNSEEN_CODE).astype(int)
    )

# `data` is the fully numeric training frame built in Cell 3. Its medians are
# the training medians, because median imputation leaves a column's median
# unchanged.
train_medians = data.drop(columns=['income']).median(numeric_only=True)
X_test_final_processed = X_test_final_processed.fillna(train_medians)

print("Final test set preprocessed!")

In [None]:
# Cell 8: Make and Save Predictions
predictions = dt_model.predict(X_test_final_processed)
pred_series = pd.Series(predictions)

# NOTE(review): if the target was label-encoded in Cell 3, these predictions
# are the encoded integers, not the original labels. Confirm which form the
# submission file is expected to contain.
# One prediction per line, no header and no index column.
pred_series.to_csv('y_predict_ID3.csv', index=False, header=False)

print("Predictions saved to y_predict_ID3.csv")
print(f"Number of predictions: {len(predictions)}")
print(f"Prediction distribution: {pred_series.value_counts()}")