In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

In [None]:
# ------------------------------
# 2. Load Data
# ------------------------------
# Dataset locations and schema names -- edit these four values to point the
# notebook at a different dataset.
train_path = "/kaggle/input/ai-201-b-mse-2-aiml-c/train.csv"
test_path = "/kaggle/input/ai-201-b-mse-2-aiml-c/test.csv"
target_col = "NObeyesdad"  # label column read from the training frame
id_col = "id"              # column name used for the id field of the submission

# Read both CSVs (train first, then test) into DataFrames.
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Quick sanity check of the loaded sizes.
for split_name, frame in (("Train", train), ("Test", test)):
    print(f"{split_name} Shape:", frame.shape)

In [None]:
# ------------------------------
# 3. Separate Input & Target
# ------------------------------
# Labels the model will learn to predict.
y = train[target_col]

# Feature matrix: everything except the label and the row identifier.
# The identifier is excluded so it is not imputed/scaled and handed to the
# classifier as if it were a real predictor. `errors="ignore"` keeps this cell
# working for datasets without an id column; a missing target column has
# already raised a KeyError on the line above.
X = train.drop(columns=[target_col, id_col], errors="ignore")

In [None]:
# ------------------------------
# 4. Identify Column Types
# ------------------------------
# Numeric (and boolean) columns go to the numeric branch of the preprocessor;
# every other dtype is treated as categorical. Selecting by dtype family,
# rather than comparing against "object", keeps `category` and dedicated
# string-dtype columns out of the numeric branch, where median imputation
# would fail on them.
num_cols = X.select_dtypes(include=["number", "bool"]).columns.tolist()
_numeric_names = set(num_cols)
cat_cols = [col for col in X.columns if col not in _numeric_names]

print("\nCategorical Columns:", cat_cols)
print("Numerical Columns:", num_cols)

In [None]:
# ------------------------------
# 5. Preprocessing Pipeline
# ------------------------------

# Numeric branch: replace missing values with the column median, then standardize.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical branch: replace missing values with the most frequent category,
# then one-hot encode. Categories not seen during fit are ignored at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(categorical_steps)

# Route each group of columns to its branch.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
])

In [None]:
# Exploratory Data Analysis (Optional)

# Count of training rows per target class.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=train, x=target_col, ax=ax)
plt.xticks(rotation=45)
plt.show()

# One histogram with a KDE overlay for every numeric column.
for feature in num_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(train[feature], kde=True, ax=ax)
    ax.set_title(f"Distribution of {feature}")
    plt.show()

In [None]:
# Outlier Analysis (Optional)
# One horizontal boxplot per numeric column to eyeball extreme values.
for feature in num_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(x=train[feature], ax=ax)
    ax.set_title(f"Boxplot of {feature}")
    plt.show()

In [None]:
 #7. Correlation Heatmap (Optional)
# Median-fill missing values in the numeric columns before computing
# correlations. The fill is done with a single DataFrame-level `fillna` that
# returns a new frame: calling `fillna(..., inplace=True)` on `corr_df[col]`
# is a chained assignment that may operate on a temporary copy and leave
# `corr_df` unchanged.
numeric_frame = train[num_cols]
corr_df = numeric_frame.fillna(numeric_frame.median())

# Pairwise correlation of the numeric columns, drawn as a heatmap.
plt.figure(figsize=(12,8))
sns.heatmap(corr_df.corr(), cmap='coolwarm')
plt.show()

In [None]:
# ------------------------------
# 6. Create Final Model Pipeline
# ------------------------------
# Chain the preprocessor and the classifier into one estimator so the same
# transformations are applied at fit and predict time. The step name "clf" is
# the prefix used by the hyperparameter grid keys ("clf__...").
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("clf", RandomForestClassifier(random_state=42))
])

In [None]:
# ------------------------------
# 7. Train/Validation Split
# ------------------------------
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
holdout_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = holdout_split

In [None]:
# ------------------------------
# 8. Train the Model
# ------------------------------
# Fit the full pipeline (preprocessing + classifier) on the training split.
model.fit(X_train, y_train)

# Score the baseline model on the held-out validation split.
preds = model.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, preds)
valid_report = classification_report(y_valid, preds)

print("\nAccuracy:", valid_accuracy)
print("\nClassification Report:\n", valid_report)

In [None]:
# ------------------------------
# 9. Hyperparameter Tuning
# ------------------------------
# Search space for the random forest; the "clf__" prefix routes each setting
# to the pipeline step named "clf".
param_grid = dict(
    clf__n_estimators=[200, 300, 500],
    clf__max_depth=[10, 20, None],
    clf__min_samples_split=[2, 5],
    clf__min_samples_leaf=[1, 2],
)

# Exhaustive 3-fold cross-validated search over the grid, using all CPU cores.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid.fit(X_train, y_train)

print("\nBest Params:", grid.best_params_)

# Pipeline refit with the best parameter combination found by the search.
best_model = grid.best_estimator_

In [None]:
# ------------------------------
# 10. Predict on Test Data (NO NANS!)
# ------------------------------
# The raw test frame is passed in as-is: the tuned pipeline applies its own
# imputation and encoding steps before the classifier sees the data.
test_preds = best_model.predict(X=test)

# Only needed if the target had been label-encoded earlier: map the
# predictions back to the original labels.
# test_preds = le.inverse_transform(test_preds)

In [None]:
# ------------------------------
# 11. Submission File
# ------------------------------
# Pair each test-row identifier with its predicted label. The ids are read
# directly from the test frame (no separate `test_ids` variable is defined in
# this notebook) and converted to a plain array so the pairing is positional.
submission = pd.DataFrame({
    id_col: test[id_col].to_numpy(),
    target_col: test_preds
})

# Write the predictions without the DataFrame index column.
submission.to_csv("submission.csv", index=False)
print("\nsubmission.csv file created successfully!")