In [None]:
import pandas as pd

# Load the competition training set and preview a random handful of rows.
train_csv_path = "/kaggle/input/mle-ese-mock/train (5).csv"
df = pd.read_csv(train_csv_path)
df.sample(n = 10)

In [None]:
# Name of the label column; later cells use it to split the features from the target.
target = 'quality_grade'
# (rows, columns) of the raw training frame.
df.shape

In [None]:
# Summary statistics of the training frame (pandas summarises only the numeric
# columns by default when a frame mixes numeric and non-numeric dtypes).
df.describe()

In [None]:
# Number of rows that exactly repeat an earlier row across every column.
# NOTE(review): 'id' is still a column at this point, so rows that differ only
# in their id are not counted as duplicates — confirm that is intended.
df.duplicated().sum()

In [None]:
# Keep the first occurrence of each fully duplicated row, then re-count to
# confirm that no duplicates remain.
df = df.drop_duplicates()
df.duplicated().sum()

In [None]:
# Missing-value count per column, worst offenders first.
df.isna().sum().sort_values(ascending = False)

In [None]:
# Rows without a label cannot be used for supervised training, so discard them;
# missing feature values are left for the imputers in the pipeline.
df = df.dropna(subset = [target])
df.isna().sum().sort_values(ascending = False)

In [None]:
# Number of distinct (non-null) values in each column.
df.nunique()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Features: every column except the row identifier and the label.
X = df.drop(columns = ['id', target])

# Encode the grade labels as integers. The fitted encoder `le` is kept so that
# predictions can be mapped back to the original grade names later.
le = LabelEncoder().fit(df[target])
y = le.transform(df[target])

In [None]:
# Partition the feature names by dtype: object columns are treated as
# categorical, every other dtype as numerical.
cat_cols = list(X.select_dtypes(include = 'object').columns)
num_cols = list(X.select_dtypes(exclude = 'object').columns)

print("Numerical Variables are: ", num_cols)
print("Categorical Variables are: ", cat_cols)

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split, stratified on the label so both parts keep the same
# class proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    stratify = y,
    random_state = 42,
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# for col in num_cols:
#     plt.figure(figsize = (10, 6))
#     plt.subplot(1, 2, 1)
#     sns.histplot(x = col, data = df, kde = True, bins = 30)
#     plt.subplot(1, 2, 2)
#     sns.boxplot(x = col, data = df)
#     plt.tight_layout()
#     plt.show()

In [None]:
# for col in cat_cols:
#     plt.figure(figsize = (10, 6))
#     sns.countplot(data = df, x = col, hue = target)
#     plt.title(f'{col} by {target}')
#     plt.xticks(rotation = 30)
#     plt.tight_layout()
#     plt.show()

In [None]:
# sns.countplot(x = target, data = df)

In [None]:
# numeric_data = df.select_dtypes(include = ['number'])
# plt.figure(figsize = (10,6))
# sns.heatmap(numeric_data.corr(), annot = True, cmap = 'coolwarm')
# plt.title("Feature Correlation Heatmap")
# plt.show()

In [None]:
# sns.pairplot(df, hue = target)
# plt.show()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder

# Numeric features: fill gaps with the column median (scaling is currently disabled).
numeric_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    # ('scaler', RobustScaler())
])

# Categorical features: fill gaps with the most frequent level, then one-hot
# encode. Levels unseen during fit are encoded as all zeros instead of raising.
categorical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown = 'ignore')),
])

In [None]:
# Route each column group through its own transformer; any column not listed
# in either group is dropped.
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols),
    ],
    remainder = 'drop',
)

In [None]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

# Estimator under test. The alternatives that were tried are kept commented out;
# swap the assignment to switch models.
classifier = RandomForestClassifier(class_weight = 'balanced', random_state = 42, n_jobs = -1)
# classifier = XGBClassifier(objective = 'multi:softprob', num_class = 10, eval_metric = 'mlogloss', tree_method = 'hist', random_state = 42)
# classifier = KNeighborsClassifier(n_jobs = -1)

# Full pipeline: preprocessing followed by the classifier, so that imputation
# and encoding are re-fitted inside every cross-validation fold.
model = Pipeline(steps = [
    ('preprocessor', preprocessor),
    # ("pca", PCA()),
    ('classifier', classifier),
])

In [None]:
from sklearn.model_selection import GridSearchCV

# Random-forest search space. Keys use the `<step>__<param>` convention so the
# values reach the 'classifier' step of the pipeline.
param_grid = dict(
    classifier__n_estimators = [200, 300, 400],
    classifier__max_depth = [5, 7, 10],
    classifier__min_samples_split = [25, 50],
    classifier__min_samples_leaf = [20, 30, 40],
    # classifier__max_features = ['sqrt', 'log2'],
    classifier__bootstrap = [True],
)

# XGBoost search space (use together with the XGBClassifier step).
# param_grid = {
#     'classifier__n_estimators': [200, 300, 500],
#     'classifier__max_depth': [3, 4, 5],
#     'classifier__learning_rate': [0.03, 0.05, 0.1],
#     'classifier__subsample': [0.7, 0.8],
#     'classifier__colsample_bytree': [0.7, 0.8],
#     'classifier__min_child_weight': [1, 3, 5],
#     'classifier__gamma': [0, 0.1, 0.2]
# }

# PCA + k-NN search space (use together with the pca and KNeighborsClassifier steps).
# param_grid = {
#     'pca__n_components': [0.80, 0.85, 0.90, 0.95],
#     'classifier__n_neighbors': [15, 21, 31, 41],
#     'classifier__weights': ['distance'],
#     'classifier__metric': ['euclidean'],
#     'classifier__leaf_size': [30, 40, 50]
# }

# Exhaustive 5-fold search over the grid, ranked by (negated) log loss.
grid_search = GridSearchCV(
    estimator = model,
    param_grid = param_grid,
    scoring = 'neg_log_loss',   #accuracy, roc_auc_ovr, neg_log_loss
    cv = 5,
)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

In [None]:
# Copy the winning hyper-parameters onto the pipeline and refit it on the
# training split, so `model` itself (not only grid_search) carries them.
best_params = grid_search.best_params_
model.set_params(**best_params).fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score

# Scores on the data the model was fitted on; compare them with the hold-out
# scores to gauge overfitting.
print("Training Metrics:")
y_pred = model.predict(X_train)
y_pred_proba = model.predict_proba(X_train)

train_accuracy = accuracy_score(y_train, y_pred)
print("Accuracy:", train_accuracy)
train_log_loss = log_loss(y_train, y_pred_proba)
print("Log Loss:", train_log_loss)
train_auc = roc_auc_score(y_train, y_pred_proba, multi_class = 'ovr', average = 'macro')
print("ROC AUC:", train_auc)

In [None]:
# Scores on the 20% hold-out split, which the grid search never saw.
print("Testing Metrics:")
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
test_log_loss = log_loss(y_test, y_pred_proba)
print("Log Loss:", test_log_loss)
test_auc = roc_auc_score(y_test, y_pred_proba, multi_class = 'ovr', average = 'macro')
print("ROC AUC:", test_auc)

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate
import numpy as np

# Evaluate all three metrics in ONE 5-fold run. Three separate cross_val_score
# calls would refit the whole pipeline 15 times; cross_validate fits each fold
# once and scores it with every metric. The folds and fitted models are the
# same as before (integer cv, fixed random_state), so the numbers are unchanged.
cv_results = cross_validate(
    model,
    X_train,
    y_train,
    cv = 5,
    scoring = ['accuracy', 'neg_log_loss', 'roc_auc_ovr'],
)

print("CV Scores:")
print("Accuracy:", cv_results['test_accuracy'].mean())
# The scorer returns negated log loss (higher is better); flip the sign back.
print("Log Loss:", (-cv_results['test_neg_log_loss']).mean())
print("ROC:", cv_results['test_roc_auc_ovr'].mean())

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix on the hold-out split.
y_pred = model.predict(X_test)
# Pin the row/column order to the model's classes so the tick labels below
# always line up with the matrix, even if a class is absent from y_test.
cm = confusion_matrix(y_test, y_pred, labels = model.classes_)

# model.classes_ holds the LabelEncoder integers; map them back to the original
# grade names so the axes are readable.
class_names = le.inverse_transform(model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = class_names)
# Rotate the x tick labels so the long grade names do not overlap.
disp.plot(cmap = 'Blues', values_format = 'd', xticks_rotation = 45)
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Final refit on every labelled row (training + hold-out) with the tuned
# hyper-parameters. After this cell the hold-out split is no longer unseen data,
# so the evaluation cells above should not be re-run against this fitted model.
model.fit(X, y)

In [None]:
# test_df = pd.read_csv('')
# X_test = test_df.drop('id', axis = 1)
# y_pred = model.predict(X_test)

# submission_df = pd.DataFrame({'id': test_df['id'], target: y_pred})
# submission_df.to_csv('submission.csv', index = False)

In [None]:
# test_df = pd.read_csv('')
# X_test = test_df.drop('id', axis = 1)
# y_pred = model.predict(X_test)
# y_pred = le.inverse_transform(y_pred)

# submission_df = pd.DataFrame({'id': test_df['id'], target: y_pred})
# submission_df.to_csv('submission.csv', index = False)

In [None]:
# test_df = pd.read_csv('')
# X_test = test_df.drop('id', axis = 1)
# y_pred = model.predict_proba(X_test)

# submission_df = pd.DataFrame(y_pred, columns = model.classes_)
# submission_df.insert(0, 'id', test_df['id'])
# submission_df.to_csv('submission.csv', index = False)

In [197]:
# Predict class probabilities for the competition test set and write the submission file.
test_df = pd.read_csv('/kaggle/input/mle-ese-mock/test (4).csv')
# Use a dedicated name for the competition features: rebinding X_test here would
# clobber the hold-out split, and re-running any evaluation cell afterwards
# would pair these rows with y_test.
X_submit = test_df.drop(columns = ['id'])
y_pred = model.predict_proba(X_submit)
# predict_proba columns follow model.classes_ (encoded integers); translate them
# back to the original grade names for the CSV header.
original_classes = le.inverse_transform(model.classes_)

submission_df = pd.DataFrame(y_pred, columns = original_classes)
submission_df.insert(0, 'id', test_df['id'])
submission_df.to_csv('submission.csv', index = False)

In [194]:
# class_names = le.classes_
# submission_df = pd.DataFrame(y_pred, columns = [f"Status_{cls}" for cls in class_names])
# submission_df.insert(0, 'id', test_df['id'])
# submission_df.to_csv("submission.csv", index = False)
# submission_df.head()

Unnamed: 0,id,Status_Q10_waste,Status_Q1_premium_fresh,Status_Q2_fresh,Status_Q3_export_grade,Status_Q4_dessert,Status_Q5_juice_high,Status_Q6_juice_low,Status_Q7_processing,Status_Q8_local_sale,Status_Q9_feed
0,0,0.000357,0.09174,0.192825,0.240597,0.259833,0.137351,0.055094,0.015954,0.00481,0.00144
1,1,0.516227,0.0,0.000121,0.0002,0.002032,0.007494,0.021413,0.063426,0.122926,0.266161
2,2,0.000166,0.507677,0.273544,0.135087,0.048743,0.025482,0.00637,0.00226,0.000589,8.3e-05
3,3,0.001351,0.081527,0.15891,0.29723,0.204088,0.163478,0.060497,0.023032,0.008644,0.001243
4,4,0.095878,0.000237,0.000805,0.003636,0.011549,0.042565,0.139614,0.258972,0.274392,0.172354


In [195]:
# Peek at the provided sample submission to see the expected column layout.
# It gets its own name: binding it to `df` would overwrite the training frame,
# and re-running any earlier cell that reads `df` would then fail.
sample_submission = pd.read_csv('/kaggle/input/mle-ese-mock/submission (6).csv')
sample_submission.head()

Unnamed: 0,id,Q1_premium_fresh,Q2_fresh,Q3_export_grade,Q4_dessert,Q5_juice_high,Q6_juice_low,Q7_processing,Q8_local_sale,Q9_feed,Q10_waste
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [198]:
# LabelEncoder sorts class names alphabetically, which puts 'Q10_waste' before
# 'Q1_...'. Move it to the end so the columns run Q1..Q10 like the sample submission.
cols = list(submission_df.columns)
if 'Q10_waste' in cols:
    cols.remove('Q10_waste')
    # Append only when the column exists; appending unconditionally would make
    # the selection below raise KeyError for a frame without it.
    cols.append('Q10_waste')
submission_df = submission_df[cols]
# Persist the reordered frame — otherwise submission.csv keeps the alphabetical order.
submission_df.to_csv('submission.csv', index = False)
submission_df.head()

Unnamed: 0,id,Q1_premium_fresh,Q2_fresh,Q3_export_grade,Q4_dessert,Q5_juice_high,Q6_juice_low,Q7_processing,Q8_local_sale,Q9_feed,Q10_waste
0,0,0.09174,0.192825,0.240597,0.259833,0.137351,0.055094,0.015954,0.00481,0.00144,0.000357
1,1,0.0,0.000121,0.0002,0.002032,0.007494,0.021413,0.063426,0.122926,0.266161,0.516227
2,2,0.507677,0.273544,0.135087,0.048743,0.025482,0.00637,0.00226,0.000589,8.3e-05,0.000166
3,3,0.081527,0.15891,0.29723,0.204088,0.163478,0.060497,0.023032,0.008644,0.001243,0.001351
4,4,0.000237,0.000805,0.003636,0.011549,0.042565,0.139614,0.258972,0.274392,0.172354,0.095878
