In [None]:
#ONLY NEED TO CHANGE
# 1️⃣ Model import
# 2️⃣ Model definition + training
# 3️⃣ Evaluation metric


# Predict number/value	---- Regression
# Predict Yes / No	    ---- Logistic
# Predict 3+ classes	  ---- RF / DT
# “Leaderboard score matters”	--- XGBoost
# Dataset is very small	----  KNN
# Too many features	    ----  PCA + Logistic
# Classes imbalanced	  ----  RF + F1

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder

#Random Forest(NEED CHANGE)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Alternative model imports -- uncomment the one that matches the task.
#
# Regression:
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error, r2_score
#
# Binary classification:
# from sklearn.linear_model import LogisticRegression
#
# Multiclass:
# from sklearn.tree import DecisionTreeClassifier
# # OR keep RandomForestClassifier
#
# Best Kaggle score:
# from xgboost import XGBClassifier
#
# Small dataset:
# from sklearn.neighbors import KNeighborsClassifier
#
# High dimension:
# from sklearn.decomposition import PCA
# from sklearn.linear_model import LogisticRegression

In [None]:
# -------- CONFIG (CHANGE THESE) --------
# Input file locations plus the names of the label column and the
# row-identifier column; every later cell reads these constants.
TRAIN_PATH = "/kaggle/input/mse-2-ai-201-b-aiml-c/train.csv"
TEST_PATH  = "/kaggle/input/mse-2-ai-201-b-aiml-c/test.csv"
TARGET_COL = "Class"
ID_COL     = "id"

# Load the training and test splits (in that order) into DataFrames.
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

In [None]:
# Report (rows, columns) for both splits.
print("\n===== SHAPE =====")
for caption, frame in (("Train shape:", train), ("Test shape:", test)):
    print(caption, frame.shape)

In [None]:
# Text-based overview of the training frame. Each entry pairs a banner with
# a callable so that every summary is computed only when its turn comes,
# keeping the output order (and failure point, if any) the same as a
# straight sequence of print calls.
eda_sections = (
    ("\n===== COLUMNS =====",           lambda: train.columns),
    ("\n===== DATA TYPES =====",        lambda: train.dtypes),
    ("\n===== FIRST 10 ROWS =====",     lambda: train.head(10)),
    ("\n===== DESCRIPTIVE STATS =====", lambda: train.describe(include="all")),
    ("\n===== MISSING VALUES =====",    lambda: train.isnull().sum()),
    ("\n===== TARGET VALUE COUNTS =====", lambda: train[TARGET_COL].value_counts()),
)

for banner, build_summary in eda_sections:
    print(banner)
    print(build_summary())

In [None]:

#                 VISUALIZATION


# Missing value heatmap: one cell per (row, column), highlighted where null.
missing_fig, missing_ax = plt.subplots(figsize=(10, 5))
sns.heatmap(train.isnull(), cbar=False, ax=missing_ax)
missing_ax.set_title("Missing Values Heatmap")
plt.show()

# Target distribution: number of training rows per label value.
target_fig, target_ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=train[TARGET_COL], ax=target_ax)
target_ax.set_title("Target Distribution")
plt.show()


In [None]:
# Categorical countplots: one bar chart per non-numeric column, leaving out
# the target and the identifier column.
categorical_cols_full = train.select_dtypes(exclude=[np.number]).columns
for col in categorical_cols_full:
    if col in [TARGET_COL, ID_COL]:
        continue
    count_fig, count_ax = plt.subplots(figsize=(6, 4))
    sns.countplot(x=train[col], ax=count_ax)
    count_ax.set_title(f"Countplot of {col}")
    plt.show()

In [None]:
# Boxplot for numeric columns: horizontal boxes, one per numeric feature,
# all drawn on a shared axis.
numeric_cols_full = train.select_dtypes(include=[np.number]).columns
box_fig, box_ax = plt.subplots(figsize=(14, 6))
sns.boxplot(data=train[numeric_cols_full], orient="h", ax=box_ax)
box_ax.set_title("Numeric Feature Boxplots")
plt.show()


In [None]:
# Correlation heatmap: pairwise correlation between the numeric columns,
# shown without per-cell annotations.
corr_matrix = train[numeric_cols_full].corr()
corr_fig, corr_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, cmap="coolwarm", annot=False, ax=corr_ax)
corr_ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
#                 DATA CLEANING


# Exact duplicate training rows carry no extra information, so drop them.
train = train.drop_duplicates()

# The test frame is deliberately NOT de-duplicated: every test row needs a
# prediction, and dropping rows here would make the submission shorter than
# the test set.

# Rows without a label cannot be used for supervised training.
train = train.dropna(subset=[TARGET_COL])

# Feature matrix (label and identifier removed) and label vector.
X = train.drop([TARGET_COL, ID_COL], axis=1)
y = train[TARGET_COL]

# When the target column holds strings ('A', 'B', maybe 'C'):
# XGBoost expects numbers (0, 1, 2, ...)
# from sklearn.preprocessing import LabelEncoder

# target_encoder = LabelEncoder()
# y = target_encoder.fit_transform(y)

# # number of classes -- only needed for the multiclass XGBoost setup
# num_classes = len(np.unique(y))

#     ---------------------------------------------------------------

# Keep the identifiers for the submission file; the model only sees the
# remaining test columns.
test_ids = test[ID_COL]
X_test   = test.drop(ID_COL, axis=1)

In [None]:
# Outlier removal (IQR)
# A row is dropped when any numeric feature lies outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for that feature.
numeric_cols = X.select_dtypes(include=[np.number]).columns
Q1 = X[numeric_cols].quantile(0.25)
Q3 = X[numeric_cols].quantile(0.75)
IQR = Q3 - Q1

# Skip columns whose IQR is zero (e.g. binary flags or mostly-constant
# counts): their fences collapse to a single value, so every row that
# differs from that value would be treated as an outlier and removed.
iqr_cols = IQR[IQR > 0].index

mask = ~(
    (X[iqr_cols] < (Q1[iqr_cols] - 1.5 * IQR[iqr_cols])) |
    (X[iqr_cols] > (Q3[iqr_cols] + 1.5 * IQR[iqr_cols]))
).any(axis=1)

# .copy() gives X its own data so that later column assignments do not act
# on a filtered slice of the original frame.
X = X[mask].copy()
y = y[mask]


In [None]:
# Safe label encoding
# X may be a boolean-filtered slice of an earlier frame; take an explicit
# copy so the column assignments below write to X itself and do not raise
# chained-assignment warnings.
X = X.copy()

# Fit each encoder on the union of train and test values so that a category
# appearing only in the test set can still be transformed.
categorical_cols = X.select_dtypes(exclude=[np.number]).columns
for col in categorical_cols:
    le = LabelEncoder()
    combined = pd.concat([X[col], X_test[col]], axis=0).astype(str)
    le.fit(combined)
    X[col]      = le.transform(X[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))

# Impute remaining gaps with the training medians, computed once and applied
# to both frames so train and test are filled with the same values.
train_medians = X.median()
X      = X.fillna(train_medians)
X_test = X_test.fillna(train_medians)

In [None]:
#                 TRAIN-TEST SPLIT


# Hold out 20% of the rows for validation; the split is seeded and
# stratified on the label.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

In [None]:
#                 HYPERPARAMETER TUNING


# Candidate values for the grid search: 2 * 3 * 2 * 2 = 24 combinations.
params = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)





In [None]:

# CHANGE ACCORDINGLY: the grid-search section below, or one of the
# commented model.fit alternatives further down.

# Active model: random forest tuned by grid search over `params`, scored by
# accuracy with 3-fold cross-validation on all available cores.
forest = RandomForestClassifier(random_state=42)
grid = GridSearchCV(
    estimator=forest,
    param_grid=params,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Best parameter combination found by the search.
best_model = grid.best_estimator_

# REGRESSION

# model = LinearRegression()
# model.fit(X_train, y_train)
# best_model = model


# Binary classification

# model = LogisticRegression(max_iter=1000)
# model.fit(X_train, y_train)
# best_model = model


# Decision tree

# model = DecisionTreeClassifier(max_depth=10, random_state=42)
# model.fit(X_train, y_train)
# best_model = model



# XGBOOST
# model = XGBClassifier(
#     n_estimators=300,
#     learning_rate=0.05,
#     max_depth=6,
#     objective="multi:softprob",
#     num_class=num_classes,
#     eval_metric="mlogloss",
#     random_state=42
# )

# model.fit(X_train, y_train)
# best_model = model


# KNN

# model = KNeighborsClassifier(n_neighbors=5)
# model.fit(X_train, y_train)
# best_model = model




# PCA + logistic regression
# NOTE(review): this variant overwrites X_train / X_val / X_test with their
# PCA projections while X stays unprojected -- confirm any later fit on X
# is adjusted before using it.
# pca = PCA(n_components=0.95)
# X_train = pca.fit_transform(X_train)
# X_val   = pca.transform(X_val)
# X_test  = pca.transform(X_test)

# model = LogisticRegression(max_iter=1000)
# model.fit(X_train, y_train)
# best_model = model



# Predictions on the held-out validation rows.
y_pred = best_model.predict(X_val)

# For XGBoost
# NOTE(review): XGBClassifier.predict appears to return class labels already;
# confirm the argmax below is needed before enabling it.
#y_pred = np.argmax(best_model.predict(X_val), axis=1)
#-----------------
#-----------------




In [None]:
# Score the validation predictions with three complementary views.
print("\n===== VALIDATION RESULTS =====")

val_accuracy = accuracy_score(y_val, y_pred)
print("\nAccuracy:", val_accuracy)

val_confusion = confusion_matrix(y_val, y_pred)
print("\nConfusion Matrix:\n", val_confusion)

val_report = classification_report(y_val, y_pred)
print("\nClassification Report:\n", val_report)


# Refit the selected model on every cleaned training row (training and
# validation parts together) before predicting on the test set.
best_model.fit(X, y)


In [None]:
# ============================================================
#     1) SUBMISSION WITH LABEL PREDICTIONS (C, CL, D)
# ============================================================

# Predicted class label for every test row.
label_preds = best_model.predict(X_test)

# For XGBoost
# label_preds = np.argmax(best_model.predict(X_test), axis=1)
# label_preds = target_encoder.inverse_transform(label_preds)


# Two-column frame: identifier first, predicted label second.
submission_columns = {
    ID_COL: test_ids,
    TARGET_COL: label_preds,
}
submission_labels = pd.DataFrame(submission_columns)

submission_labels.to_csv("submission_labels.csv", index=False)
print("\nSaved: submission_labels.csv")

# # ============================================================
# #     2) SUBMISSION WITH PROBABILITIES (Status_C, Status_CL, Status_D)
# # ============================================================

# probs = best_model.predict_proba(X_test)

# classes = list(best_model.classes_)  # e.g. ['C','CL','D']
# required = ["C", "CL", "D"]          # match sample submission

# final_probs = []
# for cls in required:
#     idx = classes.index(cls)
#     final_probs.append(probs[:, idx])

# submission_probs = pd.DataFrame({
#     ID_COL: test_ids,
#     "Status_C":  final_probs[0],
#     "Status_CL": final_probs[1],
#     "Status_D":  final_probs[2]
# })

# submission_probs.to_csv("submission_probabilities.csv", index=False)
# print("\nSaved: submission_probabilities.csv")

