In [None]:
# Workflow for problems evaluated on accuracy

In [None]:
#Imports
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss
# Load the competition data from the Kaggle input directory.
train=pd.read_csv("/kaggle/input/mse2-makeup/train.csv")
test=pd.read_csv("/kaggle/input/mse2-makeup/test.csv")

# Name of the label column, declared once so the NaN filter and the X / y
# split below always refer to the same column. Change it here when reusing
# this cell on another dataset.
TARGET_COL="fruit_name"

# Quick sanity checks. A bare expression in the middle of a cell is evaluated
# and discarded, so every check is printed explicitly.
print(train.shape)              # rows, columns
print(train.columns.tolist())   # column names
print(train.head())             # first few rows
print(train.tail())             # last few rows
print(train.isnull().sum())     # missing values per column
train.info()

# Keep the test ids for the submission file, then drop the id column from
# both frames so it is not used as a feature.
test_id=test['id']
test=test.drop(columns=['id'])
train=train.drop(columns=['id'])

# Rows without a label cannot be used for training. They are removed BEFORE
# X and y are built so that both are derived from the filtered frame.
train=train.dropna(subset=[TARGET_COL])

# Features / target
X=train.drop(columns=[TARGET_COL])
y=train[TARGET_COL]

# Hold out 20% of the rows for validation (fixed seed for reproducibility)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
# Column groups, chosen by dtype: object columns are treated as categorical,
# 64-bit integer / float columns as numeric. Columns in neither group are not
# routed to either branch below.
categorical_features=X.select_dtypes(include=['object']).columns
numeric_features=X.select_dtypes(include=['int64','float64']).columns

# Numeric branch: fill gaps with the column mean, then standardise.
numerical_pipeline=Pipeline([
    ('impute',SimpleImputer(strategy='mean')),
    ('scaler',StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored instead of raising.
categorical_pipeline=Pipeline([
    ('impute',SimpleImputer(strategy='most_frequent')),
    ('encode',OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own column group.
preprocessing=ColumnTransformer([
    ('num',numerical_pipeline,numeric_features),
    ('cat',categorical_pipeline,categorical_features),
])
# Hand-tuned gradient boosting baseline: 920 depth-4 trees with a small
# learning rate, each fitted with subsample=0.7, seeded for reproducibility.
model = GradientBoostingClassifier(
    random_state=42,
    learning_rate=0.02,
    n_estimators=920,
    subsample=0.7,
    max_depth=4,
    min_samples_leaf=1,
    min_samples_split=2,
)
#-----------------------------------------------
# Exhaustive grid search over the gradient-boosting hyper-parameters.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Fresh estimator for the search; this rebinds `model`.
model = GradientBoostingClassifier(random_state=42)

pipe = Pipeline([('preprocessing', preprocessing), ('model', model)])

# 3 x 3 x 3 x 3 = 81 candidates. The "model__" prefix addresses each parameter
# to the 'model' step of the pipeline.
param_grid = {
    f"model__{name}": values
    for name, values in (
        ('n_estimators', [400, 600, 800]),
        ('learning_rate', [0.01, 0.03, 0.05]),
        ('max_depth', [3, 4, 5]),
        ('subsample', [0.6, 0.8, 1.0]),
    )
}

# 5-fold cross-validation scored with the negated log loss, on all cores.
grid = GridSearchCV(
    pipe,
    param_grid,
    scoring='neg_log_loss',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid.fit(X_train, y_train)

# best_score_ holds the negated log loss, so the sign is flipped for display.
print("Best Log Loss:", -grid.best_score_)
print("Best Params:", grid.best_params_)
#-----------------------------------------------
# Randomised search: samples 30 hyper-parameter settings instead of trying a full grid.
from scipy.stats import randint, uniform
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

# Fresh estimator for the search; this rebinds `model`.
model = GradientBoostingClassifier(random_state=42)

pipe = Pipeline([('preprocessing', preprocessing), ('model', model)])

# scipy conventions: randint(low, high) excludes `high`; uniform(loc, scale)
# covers [loc, loc + scale].
param_dist = dict(
    model__n_estimators=randint(300, 1200),    # 300 .. 1199
    model__learning_rate=uniform(0.01, 0.05),  # 0.01 .. 0.06
    model__max_depth=randint(3, 6),            # 3 .. 5
    model__subsample=uniform(0.6, 0.4),        # 0.6 .. 1.0
)

# 5-fold cross-validation scored with the negated log loss, on all cores.
random_search = RandomizedSearchCV(
    pipe,
    param_dist,
    n_iter=30,
    scoring='neg_log_loss',
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)
random_search.fit(X_train, y_train)

# best_score_ holds the negated log loss, so the sign is flipped for display.
print("Best Log Loss:", -random_search.best_score_)
print("Best Params:", random_search.best_params_)

#-----------------------------------------------
# Final pipeline: preprocessing followed by the estimator currently bound to `model`.
pipeline=Pipeline(steps=[
    ('preprocessor',preprocessing),
    ('model',model)
])
# Fit on the raw training labels, the same labels the searches above are
# fitted on; this workflow has no label encoder.
# If a grid search has already been run, use the fitted `grid` object rather
# than calling pipeline.fit again.
pipeline.fit(X_train,y_train)
# Hold-out predictions: class probabilities and hard labels.
y_proba = pipeline.predict_proba(X_test)  # shape: (n_samples, n_classes)
y_pred = pipeline.predict(X_test)
# Class probabilities for the competition test set.
y_final=pipeline.predict_proba(test)
# Submission: one probability column per class, named after the fitted
# estimator's classes_ (the column order of predict_proba), plus the test ids.
class_names=pipeline.named_steps['model'].classes_
submission = pd.DataFrame(
    y_final,
    columns=class_names
)
submission.insert(0,'id',test_id)
submission.to_csv('submission.csv',index=False)

In [None]:
#accuracy
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
    log_loss,
    roc_auc_score,
    top_k_accuracy_score,
    matthews_corrcoef
)
# Hard label predictions from the fitted pipeline, needed by the label-based
# metrics below (y_proba only holds probabilities).
y_pred = pipeline.predict(X_test)

# Only the last expression of a cell is displayed, so each metric is printed.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Macro F1:", f1_score(y_test, y_pred, average='macro'))
print("Log loss:", log_loss(y_test, y_proba))
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))

from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score
)
# Regression metrics are only meaningful for numeric targets; for class
# labels they are skipped and the three names are left as None.
rmse = mae = r2 = None
if pd.api.types.is_numeric_dtype(y_test) and pd.api.types.is_numeric_dtype(y_pred):
    # RMSE is computed as sqrt(MSE): the `squared=False` keyword is not
    # available in every scikit-learn release.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae  = mean_absolute_error(y_test, y_pred)
    r2   = r2_score(y_test, y_pred)
    print("RMSE:", rmse, "MAE:", mae, "R2:", r2)


In [None]:
#plotting and visualization
#A. DATA EXPLORATION (Before training)
import seaborn as sns
import matplotlib.pyplot as plt

# Missing values: one cell per (row, column), highlighted where the value is null.
plt.figure(figsize=(12,5))
sns.heatmap(train.isnull(), cbar=False)
plt.title("Missing Values Heatmap")
plt.show()
# ---------------------------------------------
# Number of rows per class of the target column.
train['fruit_name'].value_counts().plot(
    kind='bar', figsize=(8,4), title='Target Distribution'
)
plt.show()
# ---------------------------------------------
# One histogram per 64-bit numeric column.
train.select_dtypes(include=['int64','float64']).hist(
    figsize=(15,10), bins=30
)
plt.suptitle("Numeric Feature Distributions")
plt.show()
# ----------------------------------------------
# Box plot per 64-bit numeric column to spot outliers.
plt.figure(figsize=(15,6))
sns.boxplot(data=train.select_dtypes(include=['int64','float64']))
plt.xticks(rotation=90)
plt.title("Outlier Detection")
plt.show()
# ----------------------------------------------
# Pairwise correlation of the numeric columns (numeric_only=True leaves the
# other columns out), on a diverging colour map centred at 0.
plt.figure(figsize=(12,8))
sns.heatmap(
    train.corr(numeric_only=True),
    cmap='coolwarm',
    center=0
)
plt.title("Feature Correlation")
plt.show()
# ----------------------------------------------
# After fitting
from sklearn.metrics import confusion_matrix
classes = pipeline.named_steps['model'].classes_
# Hard label predictions on the hold-out split. The full pipeline is used so
# that X_test goes through the same preprocessing as the training data.
y_pred = pipeline.predict(X_test)
# labels=classes pins the row / column order to the tick labels used below.
cm = confusion_matrix(y_test, y_pred, labels=classes)

plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d',
            xticklabels=classes,
            yticklabels=classes)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()
# -----------------------------------------------
# Names this section needs: the 64-bit numeric columns of `train` and the
# target column used to colour the points.
num_cols_all = train.select_dtypes(include=['int64','float64']).columns.tolist()
TARGET_COL = 'fruit_name'

# Scatter plot of the first two numeric columns, coloured by target.
# Skipped when fewer than two numeric columns exist.
if len(num_cols_all) >= 2:
    plt.figure(figsize=(6,4))
    sns.scatterplot(
        x=train[num_cols_all[0]],
        y=train[num_cols_all[1]],
        hue=train[TARGET_COL],
        legend=False
    )
    plt.title("Scatter Plot of Two Numeric Features")
    plt.show()
# ------------------------------------------------
# Annotated correlation heatmap, limited to the first 10 numeric columns.
# Skipped when there are no numeric columns.
if num_cols_all:
    plt.figure(figsize=(10,6))

    corr = train[num_cols_all[:10]].corr()

    sns.heatmap(
        corr,
        cmap="coolwarm",
        annot=True,
        fmt=".2f"
    )

    plt.title("Correlation Heatmap (Numeric Features Only)")
    plt.show()
# -------------------------------------------------
# Confusion matrix drawn by scikit-learn, using the hold-out split and the
# fitted pipeline of this workflow.
from sklearn.metrics import ConfusionMatrixDisplay

fitted_model = pipeline.named_steps['model']
y_pred = pipeline.predict(X_test)
ConfusionMatrixDisplay.from_predictions(
    y_test, y_pred,
    labels=fitted_model.classes_,
    display_labels=fitted_model.classes_,
    cmap="Blues"
)
# The title names whichever estimator is actually in the pipeline.
plt.title(f"{type(fitted_model).__name__} - Confusion Matrix")
plt.show()
# --------------------------------------------------
# Distribution of the highest predicted class probability per hold-out row.
plt.hist(y_proba.max(axis=1), bins=20)
plt.xlabel("Max Predicted Probability")
plt.title("Prediction Confidence Distribution")
plt.show()
# --------------------------------------------------


# log loss


from sklearn.calibration import calibration_curve

# One-vs-rest calibration for the first class: the binary target is
# "row belongs to classes[0]", the score is the first predict_proba column.
prob_true, prob_pred = calibration_curve(
    (y_test == classes[0]).astype(int),
    y_proba[:, 0],
    n_bins=10
)

# Calibration curve against the diagonal (dashed), where predicted and
# observed frequencies would be equal.
plt.plot(prob_pred, prob_true, marker='o')
plt.plot([0,1], [0,1], linestyle='--')
plt.xlabel("Predicted Probability")
plt.ylabel("True Probability")
plt.title("Calibration Curve (Class 0)")
plt.show()
# ---------------------------------------------------
# Same curve on a square figure with a legend entry.
plt.figure(figsize=(6,6))
plt.plot(prob_pred, prob_true, label='Before')
plt.plot([0,1],[0,1],'--')
plt.legend()
plt.title("Calibration Comparison")
plt.show()
# ---------------------------------------------------
#submission quality

# Probability columns only. errors='ignore' lets this work whether or not the
# submission frame carries an 'id' column.
proba_cols = submission.drop(columns=['id'], errors='ignore')

# Row sums of the predicted probabilities.
plt.hist(proba_cols.sum(axis=1))
plt.title("Probability Sum per Row (Should be ~1)")
plt.show()


# Average predicted probability per class.
proba_cols.mean().plot(
    kind='bar', title="Mean Predicted Probabilities"
)
plt.show()


In [None]:
# Workflow for problems evaluated on log loss (scikit-learn scoring name: neg_log_loss)

In [None]:
#Imports
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss
# Load the competition data from the Kaggle input directory.
train=pd.read_csv("/kaggle/input/mock-test-2-mse-2/train.csv")
test=pd.read_csv("/kaggle/input/mock-test-2-mse-2/test.csv")

# Name of the label column, declared once so the NaN filter and the X / y
# split below always refer to the same column.
TARGET_COL="Status"

# Quick sanity checks. A bare expression in the middle of a cell is evaluated
# and discarded, so every check is printed explicitly.
print(train.shape)              # rows, columns
print(train.columns.tolist())   # column names
print(train.head())             # first few rows
print(train.tail())             # last few rows
print(train.isnull().sum())     # missing values per column
train.info()

# Keep the test ids for the submission file, then drop the id column from
# both frames so it is not used as a feature.
test_id=test['id']
test=test.drop(columns=['id'])
train=train.drop(columns=['id'])

# Rows without a label cannot be used for training. They are removed BEFORE
# X and y are built so that both are derived from the filtered frame.
train=train.dropna(subset=[TARGET_COL])

# Features / target
X=train.drop(columns=[TARGET_COL])
y=train[TARGET_COL]

# Hold out 20% of the rows for validation (fixed seed for reproducibility)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
# Feature groups by dtype: 64-bit integer / float columns are numeric,
# object columns are categorical. Other dtypes end up in neither group.
numeric_features=X.select_dtypes(include=['int64','float64']).columns
categorical_features=X.select_dtypes(include=['object']).columns

# Steps for numeric columns: mean imputation, then standard scaling.
numeric_steps=[
    ('impute',SimpleImputer(strategy='mean')),
    ('scaler',StandardScaler()),
]
numerical_pipeline=Pipeline(steps=numeric_steps)

# Steps for categorical columns: most-frequent imputation, then one-hot
# encoding that ignores categories unseen during fit.
categorical_steps=[
    ('impute',SimpleImputer(strategy='most_frequent')),
    ('encode',OneHotEncoder(handle_unknown='ignore')),
]
categorical_pipeline=Pipeline(steps=categorical_steps)

# Combine both branches, each applied to its own column group.
preprocessing=ColumnTransformer(transformers=[
    ('num',numerical_pipeline,numeric_features),
    ('cat',categorical_pipeline,categorical_features),
])
#model selection
# Key of the estimator to train; see build_model for the available keys.
# Defaults to the k-nearest-neighbours classifier.
MODEL_NAME = "knn"


def build_model(name):
    """Return a new, unfitted classifier for the given key.

    Only the requested estimator is imported and constructed, so optional
    packages (xgboost, lightgbm, catboost) are needed only when their key is
    selected.

    Parameters
    ----------
    name : str
        One of "gradient_boosting", "hist_gradient_boosting", "extra_trees",
        "random_forest", "adaboost", "xgboost", "lightgbm", "catboost",
        "logistic_regression", "knn".

    Raises
    ------
    ValueError
        If `name` is not one of the keys above.
    """
    if name == "gradient_boosting":
        from sklearn.ensemble import GradientBoostingClassifier
        return GradientBoostingClassifier(
            n_estimators=920,
            learning_rate=0.02,
            max_depth=4,
            min_samples_split=2,
            min_samples_leaf=1,
            subsample=0.7,
            random_state=42
        )
    if name == "hist_gradient_boosting":
        from sklearn.ensemble import HistGradientBoostingClassifier
        # NOTE(review): this estimator may reject sparse input; if the one-hot
        # step makes the transformed matrix sparse, a dense output setting
        # would be needed - confirm before selecting this key.
        return HistGradientBoostingClassifier(
            max_iter=600,
            learning_rate=0.03,
            max_depth=6,
            min_samples_leaf=20,
            l2_regularization=0.1,
            max_bins=255,
            random_state=42
        )
    if name == "extra_trees":
        from sklearn.ensemble import ExtraTreesClassifier
        return ExtraTreesClassifier(
            n_estimators=900,
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            max_features='sqrt',
            bootstrap=False,
            random_state=42,
            n_jobs=-1
        )
    if name == "random_forest":
        from sklearn.ensemble import RandomForestClassifier
        return RandomForestClassifier(
            n_estimators=800,
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            max_features='sqrt',
            bootstrap=True,
            random_state=42,
            n_jobs=-1
        )
    if name == "adaboost":
        from sklearn.ensemble import AdaBoostClassifier
        # `algorithm` is left at the library default: the 'SAMME.R' value is
        # not accepted by recent scikit-learn releases.
        return AdaBoostClassifier(
            n_estimators=600,
            learning_rate=0.03,
            random_state=42
        )
    if name == "xgboost":
        from xgboost import XGBClassifier
        return XGBClassifier(
            n_estimators=800,
            learning_rate=0.03,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            objective='multi:softprob',
            eval_metric='mlogloss',
            random_state=42,
            n_jobs=-1
        )
    if name == "lightgbm":
        from lightgbm import LGBMClassifier
        return LGBMClassifier(
            n_estimators=900,
            learning_rate=0.03,
            max_depth=-1,
            num_leaves=64,
            subsample=0.8,
            colsample_bytree=0.8,
            objective='multiclass',
            random_state=42,
            n_jobs=-1
        )
    if name == "catboost":
        from catboost import CatBoostClassifier
        return CatBoostClassifier(
            iterations=900,
            learning_rate=0.03,
            depth=6,
            loss_function='MultiClass',
            eval_metric='MultiClass',
            random_seed=42,
            verbose=False
        )
    if name == "logistic_regression":
        from sklearn.linear_model import LogisticRegression
        # `multi_class` is deprecated in recent scikit-learn, so it is not
        # passed; the lbfgs solver fits a multinomial model for multiclass
        # targets by default.
        return LogisticRegression(
            solver='lbfgs',
            C=1.0,
            max_iter=2000,
            n_jobs=-1
        )
    if name == "knn":
        from sklearn.neighbors import KNeighborsClassifier
        return KNeighborsClassifier(
            n_neighbors=15,
            weights='distance',
            metric='minkowski'
        )
    raise ValueError(f"Unknown model name: {name!r}")


model = build_model(MODEL_NAME)


# Full pipeline: preprocessing followed by the selected estimator.
pipeline=Pipeline(steps=[
    ('preprocessor',preprocessing),
    ('model',model)
])
# Encode the class labels as integers 0..k-1. The encoder is fitted on the
# training labels only and then applied to the hold-out labels.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)  # fit on train
y_test_enc = le.transform(y_test)        # transform test
# Fit on the encoded training labels.
pipeline.fit(X_train,y_train_enc)
# Class probabilities on the hold-out split.
y_proba = pipeline.predict_proba(X_test)  # shape: (n_samples, n_classes)
# Report the hold-out log loss. `labels` lists every encoded class so the
# score is defined even if the hold-out split lacks one of them.
print("Validation log loss:",
      log_loss(y_test_enc, y_proba, labels=np.arange(len(le.classes_))))
# Class probabilities for the competition test set.
y_final=pipeline.predict_proba(test)
# Map the encoded classes back to their original names for the column
# headers, add the test ids as the first column and write the file.
class_names = le.classes_  # use label encoder mapping
submission = pd.DataFrame(y_final, columns=[f"Status_{cls}" for cls in class_names])
submission.insert(0, 'id', test_id)
submission.to_csv("submission4.csv", index=False)
print("\n✅ Submission file created successfully!")
print(submission.head())