In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from scipy import sparse

# Load the dataset
data = pd.read_csv('updated_food_inspection_cleaned.csv')

# Convert categorical columns to category codes to reduce memory usage
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype('category').cat.codes

# Define the target column
target_column = 'Risk'  # Adjust if needed based on your encoded column names

# Ensure the target column exists
if target_column not in data.columns:
    raise ValueError(f"Target column '{target_column}' not found. Please check the column names.")

# Define features (X) and target (y)
X = data.drop([target_column], axis=1)
y = data[target_column]

# Optionally downsample the dataset (use a fraction of the data)
sample_fraction = 0.1  # Adjust the fraction as necessary to reduce the dataset size
data_sampled = data.sample(frac=sample_fraction, random_state=42)

X_sampled = data_sampled.drop([target_column], axis=1)
y_sampled = data_sampled[target_column]

# Convert the features to a sparse matrix for memory efficiency
X_sparse = sparse.csr_matrix(X_sampled)

# Train-test split
X_train_sparse, X_test_sparse, y_train, y_test = train_test_split(X_sparse, y_sampled, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler(with_mean=False)  # with_mean=False for sparse matrices
X_train_scaled = scaler.fit_transform(X_train_sparse)
X_test_scaled = scaler.transform(X_test_sparse)

# List of models to evaluate
models = [
    
    ("Decision Tree", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier())
]

# Function to evaluate models and print performance metrics



from sklearn.metrics import classification_report

# Modified function to evaluate models and print performance metrics for each class
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Multi-class roc_auc_score
    if y_proba is not None and len(set(y_test)) > 2:
        try:
            auc_score = roc_auc_score(y_test, model.predict_proba(X_test), multi_class='ovr')
        except ValueError as e:
            auc_score = None
            print(f"Warning: {e}")
    elif y_proba is not None:
        auc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    else:
        auc_score = None

    conf_matrix = confusion_matrix(y_test, y_pred)

    # Classification report for per-class metrics
    class_report = classification_report(y_test, y_pred)

    # Print out the metrics
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Weighted Precision: {precision:.4f}")
    print(f"Weighted Recall: {recall:.4f}")
    print(f"Weighted F1 Score: {f1:.4f}")
    print(f"ROC AUC: {auc_score:.4f}" if auc_score else "ROC AUC: Not applicable")
    print(f"Confusion Matrix:\n {conf_matrix}")
    print("Classification Report (per class metrics):\n", class_report)
    print("-" * 50)

# Loop through models and evaluate each
for name, model in models:
    evaluate_model(name, model, X_train_scaled, X_test_scaled, y_train, y_test)



Model: Decision Tree
Accuracy: 0.7414
Weighted Precision: 0.7480
Weighted Recall: 0.7414
Weighted F1 Score: 0.7443
ROC AUC: Not applicable
Confusion Matrix:
 [[1277  196   48]
 [ 175  175   24]
 [  38   43   50]]
Classification Report (per class metrics):
               precision    recall  f1-score   support

           1       0.86      0.84      0.85      1521
           2       0.42      0.47      0.44       374
           3       0.41      0.38      0.40       131

    accuracy                           0.74      2026
   macro avg       0.56      0.56      0.56      2026
weighted avg       0.75      0.74      0.74      2026

--------------------------------------------------
Model: Random Forest
Accuracy: 0.8159
Weighted Precision: 0.7993
Weighted Recall: 0.8159
Weighted F1 Score: 0.7861
ROC AUC: Not applicable
Confusion Matrix:
 [[1487   24   10]
 [ 237  117   20]
 [  62   20   49]]
Classification Report (per class metrics):
               precision    recall  f1-score   support


In [5]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix, classification_report)
from sklearn.ensemble import (GradientBoostingClassifier, AdaBoostClassifier,
                              BaggingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier  # Ensure DecisionTreeClassifier is imported
import xgboost as xgb
import lightgbm as lgb
from sklearn.linear_model import SGDClassifier
from sklearn.discriminant_analysis import (QuadraticDiscriminantAnalysis,
                                            LinearDiscriminantAnalysis)
from scipy import sparse

# Load the dataset
data = pd.read_csv('updated_food_inspection_cleaned.csv')

# Convert categorical columns to category codes to reduce memory usage
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype('category').cat.codes

# Define the target column
target_column = 'Risk'  # Adjust if needed based on your encoded column names

# Ensure the target column exists
if target_column not in data.columns:
    raise ValueError(f"Target column '{target_column}' not found. Please check the column names.")

# Define features (X) and target (y)
X = data.drop([target_column], axis=1)
y = data[target_column]

# Optionally downsample the dataset (use a fraction of the data)
sample_fraction = 0.1  # Adjust the fraction as necessary to reduce the dataset size
data_sampled = data.sample(frac=sample_fraction, random_state=42)

X_sampled = data_sampled.drop([target_column], axis=1)
y_sampled = data_sampled[target_column]

# Convert the features to a sparse matrix for memory efficiency
X_sparse = sparse.csr_matrix(X_sampled)

# Train-test split
X_train_sparse, X_test_sparse, y_train, y_test = train_test_split(X_sparse, y_sampled, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler(with_mean=False)  # with_mean=False for sparse matrices
X_train_scaled = scaler.fit_transform(X_train_sparse)
X_test_scaled = scaler.transform(X_test_sparse)

# List of models to evaluate
models = [
    ("Bagging Classifier", BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50)),  # Updated Bagging Classifier
    ("Extra Trees Classifier", ExtraTreesClassifier()),  # Extra Trees Classifier
    ("XGBoost", xgb.XGBClassifier()),  # XGBoost
    ("LightGBM", lgb.LGBMClassifier()),  # LightGBM
]

# Function to evaluate models and print performance metrics for each class
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Multi-class roc_auc_score
    if y_proba is not None and len(set(y_test)) > 2:
        try:
            auc_score = roc_auc_score(y_test, model.predict_proba(X_test), multi_class='ovr')
        except ValueError as e:
            auc_score = None
            print(f"Warning: {e}")
    elif y_proba is not None:
        auc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    else:
        auc_score = None

    conf_matrix = confusion_matrix(y_test, y_pred)

    # Classification report for per-class metrics
    class_report = classification_report(y_test, y_pred)

    # Print out the metrics
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Weighted Precision: {precision:.4f}")
    print(f"Weighted Recall: {recall:.4f}")
    print(f"Weighted F1 Score: {f1:.4f}")
    print(f"ROC AUC: {auc_score:.4f}" if auc_score else "ROC AUC: Not applicable")
    print(f"Confusion Matrix:\n {conf_matrix}")
    print("Classification Report (per class metrics):\n", class_report)
    print("-" * 50)

# Convert sparse matrices to dense arrays
X_train_scaled_dense = X_train_scaled.toarray() if hasattr(X_train_scaled, "toarray") else X_train_scaled
X_test_scaled_dense = X_test_scaled.toarray() if hasattr(X_test_scaled, "toarray") else X_test_scaled

# Loop through models and evaluate each
for name, model in models:
    evaluate_model(name, model, X_train_scaled_dense, X_test_scaled_dense, y_train, y_test)


Model: Bagging Classifier
Accuracy: 0.8228
Weighted Precision: 0.8047
Weighted Recall: 0.8228
Weighted F1 Score: 0.8029
ROC AUC: Not applicable
Confusion Matrix:
 [[1464   47   10]
 [ 208  148   18]
 [  50   26   55]]
Classification Report (per class metrics):
               precision    recall  f1-score   support

           1       0.85      0.96      0.90      1521
           2       0.67      0.40      0.50       374
           3       0.66      0.42      0.51       131

    accuracy                           0.82      2026
   macro avg       0.73      0.59      0.64      2026
weighted avg       0.80      0.82      0.80      2026

--------------------------------------------------
Model: Extra Trees Classifier
Accuracy: 0.7922
Weighted Precision: 0.7676
Weighted Recall: 0.7922
Weighted F1 Score: 0.7461
ROC AUC: Not applicable
Confusion Matrix:
 [[1494   22    5]
 [ 287   79    8]
 [  76   23   32]]
Classification Report (per class metrics):
               precision    recall  f1-sc

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import StandardScaler

# Define the models you want to evaluate
models = [
    ("CatBoost Classifier", CatBoostClassifier(verbose=0)),  # CatBoost
    ("Nearest Centroid Classifier", NearestCentroid()),  # Nearest Centroid
    ("Histogram-Based Gradient Boosting", HistGradientBoostingClassifier())  # Histogram-based GB
    
]
# Function to evaluate the model
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # For multi-class, predict_proba will return probabilities for all classes
    y_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None
    
    print(f"Model: {name}")
    
    # Print classification report (precision, recall, f1 for each class)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    
    # Calculate ROC AUC for multi-class
    if y_proba is not None:
        try:
            roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
            print("ROC AUC Score:", roc_auc)
        except ValueError as e:
            print("Error calculating ROC AUC:", e)
    else:
        print("ROC AUC Score: N/A")
    
    print("="*40)

# Assuming you have your data prepared and scaled
# Example of scaling (make sure you do this before fitting the models)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_scaled_dense)
X_test_scaled = scaler.transform(X_test_scaled_dense)

# Loop through models and evaluate each
for name, model in models:
    evaluate_model(name, model, X_train_scaled, X_test_scaled, y_train, y_test)


Model: CatBoost Classifier
Classification Report:
              precision    recall  f1-score   support

           1       0.85      0.97      0.91      1521
           2       0.71      0.40      0.51       374
           3       0.62      0.40      0.49       131

    accuracy                           0.83      2026
   macro avg       0.73      0.59      0.63      2026
weighted avg       0.81      0.83      0.81      2026

Error calculating ROC AUC: Number of classes in y_true not equal to the number of columns in 'y_score'
Model: Nearest Centroid Classifier
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.83      0.50      0.62      1521
           2       0.25      0.32      0.28       374
           3       0.18      0.57      0.28       131

    accuracy                           0.47      2026
   macro avg       0.32      0.35      0.29      2026
weighted avg       0.68     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: Histogram-Based Gradient Boosting
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.86      0.96      0.90      1521
           2       0.70      0.41      0.52       374
           3       0.54      0.44      0.48       131

    accuracy                           0.82      2026
   macro avg       0.52      0.45      0.48      2026
weighted avg       0.81      0.82      0.81      2026

Error calculating ROC AUC: Number of classes in y_true not equal to the number of columns in 'y_score'


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.linear_model import LogisticRegression  # Logistic Regression for classification
from scipy import sparse

# Load the dataset
data = pd.read_csv('updated_food_inspection_cleaned.csv')

# Convert categorical columns to category codes to reduce memory usage
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype('category').cat.codes

# Define the target column
target_column = 'Risk'  # Adjust if needed based on your encoded column names

# Ensure the target column exists
if target_column not in data.columns:
    raise ValueError(f"Target column '{target_column}' not found. Please check the column names.")

# Define features (X) and target (y)
X = data.drop([target_column], axis=1)
y = data[target_column]

# Optionally downsample the dataset (use a fraction of the data)
sample_fraction = 0.1  # Adjust the fraction as necessary to reduce the dataset size
data_sampled = data.sample(frac=sample_fraction, random_state=42)

X_sampled = data_sampled.drop([target_column], axis=1)
y_sampled = data_sampled[target_column]

# Convert the features to a sparse matrix for memory efficiency
X_sparse = sparse.csr_matrix(X_sampled)

# Train-test split
X_train_sparse, X_test_sparse, y_train, y_test = train_test_split(X_sparse, y_sampled, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler(with_mean=False)  # with_mean=False for sparse matrices
X_train_scaled = scaler.fit_transform(X_train_sparse)
X_test_scaled = scaler.transform(X_test_sparse)

# List of models to evaluate
models = [
    ("Logistic Regression", LogisticRegression(max_iter=500))  # Logistic Regression for classification
]

# Function to evaluate models and print performance metrics
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Print the classification report
    print(f"Model: {name}")
    print(classification_report(y_test, y_pred))

    # Compute ROC AUC for multi-class
    y_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None
    if y_proba is not None and len(set(y_test)) > 2:
        try:
            auc_score = roc_auc_score(y_test, y_proba, multi_class='ovr')
            print("ROC AUC Score:", auc_score)
        except ValueError as e:
            print("Error calculating ROC AUC:", e)
    else:
        print("ROC AUC Score: Not applicable")
    
    print("=" * 40)

# Loop through models and evaluate each
for name, model in models:
    evaluate_model(name, model, X_train_scaled, X_test_scaled, y_train, y_test)


Model: Logistic Regression
              precision    recall  f1-score   support

           1       0.75      1.00      0.86      1521
           2       0.40      0.01      0.02       374
           3       0.25      0.02      0.03       131

    accuracy                           0.75      2026
   macro avg       0.47      0.34      0.30      2026
weighted avg       0.66      0.75      0.65      2026

Error calculating ROC AUC: Number of classes in y_true not equal to the number of columns in 'y_score'


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.linear_model import Lasso
from scipy import sparse

# Load the dataset
data = pd.read_csv('updated_food_inspection_cleaned.csv')

# Convert categorical columns to category codes to reduce memory usage
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype('category').cat.codes

# Define the target column
target_column = 'Risk'  # Adjust if needed based on your encoded column names

# Ensure the target column exists
if target_column not in data.columns:
    raise ValueError(f"Target column '{target_column}' not found. Please check the column names.")

# Define features (X) and target (y)
X = data.drop([target_column], axis=1)
y = data[target_column]

# Optionally downsample the dataset (use a fraction of the data)
sample_fraction = 0.1  # Adjust the fraction as necessary to reduce the dataset size
data_sampled = data.sample(frac=sample_fraction, random_state=42)

X_sampled = data_sampled.drop([target_column], axis=1)
y_sampled = data_sampled[target_column]

# Convert the features to a sparse matrix for memory efficiency
X_sparse = sparse.csr_matrix(X_sampled)

# Train-test split
X_train_sparse, X_test_sparse, y_train, y_test = train_test_split(X_sparse, y_sampled, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler(with_mean=False)  # with_mean=False for sparse matrices
X_train_scaled = scaler.fit_transform(X_train_sparse)
X_test_scaled = scaler.transform(X_test_sparse)

# List of models to evaluate (Lasso Regression in this case)
models = [
    ("Lasso Regression", Lasso(alpha=0.1, max_iter=500))  # Lasso for classification (use thresholding)
]

# Function to evaluate models and print performance metrics
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Threshold the continuous predictions for binary classification
    y_pred_binary = (y_pred > 0.5).astype(int)  # Convert probabilities to binary predictions

    # Print the classification report
    print(f"Model: {name}")
    print(classification_report(y_test, y_pred_binary))

    # Compute ROC AUC for binary classification
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]  # For binary classification
        roc_auc = roc_auc_score(y_test, y_proba)
        print(f"ROC AUC Score: {roc_auc:.4f}")
    else:
        print("ROC AUC Score: Not applicable")
    
    print("=" * 40)

# Loop through models and evaluate each
for name, model in models:
    evaluate_model(name, model, X_train_scaled, X_test_scaled, y_train, y_test)


Model: Lasso Regression
              precision    recall  f1-score   support

           1       0.75      1.00      0.86      1521
           2       0.00      0.00      0.00       374
           3       0.00      0.00      0.00       131

    accuracy                           0.75      2026
   macro avg       0.25      0.33      0.29      2026
weighted avg       0.56      0.75      0.64      2026

ROC AUC Score: Not applicable


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
SMOTE

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
import xgboost as xgb

# Load the dataset
data = pd.read_csv('updated_food_inspection_cleaned.csv')

# Convert categorical columns to category codes
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype('category').cat.codes

# Define the target column
target_column = 'Risk'

# Define features (X) and target (y)
X = data.drop([target_column], axis=1)
y = data[target_column]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Feature scaling
scaler = StandardScaler()

# Function to evaluate models
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Predict probabilities for models that support it
    y_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None
    
    print(f"Model: {name}")
    print(classification_report(y_test, y_pred))
    
    if y_proba is not None:
        try:
            roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
            print("ROC AUC Score:", roc_auc)
        except ValueError as e:
            print("Error calculating ROC AUC:", e)
    else:
        print("ROC AUC Score: N/A")
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix:\n {conf_matrix}")
    print("="*40)

# First set of 5 models
models_1 = [
   
    ("Decision Tree", 
     Pipeline([('scaler', scaler), 
               ('classifier', DecisionTreeClassifier(class_weight='balanced'))])),
    
    ("Random Forest", 
     Pipeline([('scaler', scaler), 
               ('classifier', RandomForestClassifier(random_state=42))])),
    
    
    
]

# Loop through models and evaluate
for name, model in models_1:
    evaluate_model(name, model, X_train_resampled, X_test, y_train_resampled, y_test)


Model: Decision Tree
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.91      0.86      0.88     15189
           2       0.54      0.62      0.58      3751
           3       0.53      0.62      0.57      1314

    accuracy                           0.80     20257
   macro avg       0.49      0.52      0.51     20257
weighted avg       0.81      0.80      0.81     20257

ROC AUC Score: 0.7086803483257109
Confusion Matrix:
 [[    0     2     1     0]
 [    6 13101  1676   406]
 [    0  1131  2307   313]
 [    1   218   286   809]]
Model: Random Forest
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.91      0.95      0.93     15189
           2       0.76      0.59      0.66      3751
           3       0.67      0.75      0.71      1314

    accuracy                           0.87     20257
   macro avg       0.58      0.57  

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline

# Load the dataset
data = pd.read_csv('updated_food_inspection_cleaned.csv')

# Convert categorical columns to category codes
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype('category').cat.codes

# Define the target column
target_column = 'Risk'

# Define features (X) and target (y)
X = data.drop([target_column], axis=1)
y = data[target_column]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Feature scaling
scaler = StandardScaler()

# Function to evaluate models
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Predict probabilities for models that support it
    y_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None
    
    print(f"Model: {name}")
    print(classification_report(y_test, y_pred))
    
    if y_proba is not None:
        try:
            roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
            print("ROC AUC Score:", roc_auc)
        except ValueError as e:
            print("Error calculating ROC AUC:", e)
    else:
        print("ROC AUC Score: N/A")
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix:\n {conf_matrix}")
    print("="*40)

# Second set of 5 models
models_2 = [
   
    
    ("Bagging Classifier", 
     Pipeline([('scaler', scaler), 
               ('classifier', BaggingClassifier(n_estimators=50, random_state=42))])),
    
    ("Extra Trees Classifier", 
     Pipeline([('scaler', scaler), 
               ('classifier', ExtraTreesClassifier(random_state=42))]))
]

# Loop through models and evaluate
for name, model in models_2:
    evaluate_model(name, model, X_train_resampled, X_test, y_train_resampled, y_test)


Model: Bagging Classifier
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.91      0.93      0.92     15189
           2       0.72      0.61      0.66      3751
           3       0.61      0.71      0.66      1314

    accuracy                           0.86     20257
   macro avg       0.56      0.56      0.56     20257
weighted avg       0.86      0.86      0.86     20257

ROC AUC Score: 0.8659973669425541
Confusion Matrix:
 [[    0     3     0     0]
 [    5 14189   691   304]
 [    1  1182  2271   297]
 [    1   184   191   938]]
Model: Extra Trees Classifier
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.89      0.95      0.92     15189
           2       0.72      0.51      0.59      3751
           3       0.68      0.72      0.70      1314

    accuracy                           0.85     20257
   macro avg       0.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline

# Load the dataset
data = pd.read_csv('updated_food_inspection_cleaned.csv')

# Convert categorical columns to category codes
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype('category').cat.codes

# Define the target column
target_column = 'Risk'

# Define features (X) and target (y)
X = data.drop([target_column], axis=1)
y = data[target_column]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Feature scaling
scaler = StandardScaler()

# Function to evaluate models
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Predict probabilities for models that support it
    y_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None
    
    print(f"Model: {name}")
    print(classification_report(y_test, y_pred))
    
    if y_proba is not None:
        try:
            roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
            print("ROC AUC Score:", roc_auc)
        except ValueError as e:
            print("Error calculating ROC AUC:", e)
    else:
        print("ROC AUC Score: N/A")
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix:\n {conf_matrix}")
    print("="*40)

# Model list with only the specified classifiers
models = [
   
    ("Histogram-Based Gradient Boosting", 
     Pipeline([('scaler', scaler), 
               ('classifier', HistGradientBoostingClassifier(random_state=42))])),
    
    ("XGBoost", 
     Pipeline([('scaler', scaler), 
               ('classifier', xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'))])),
    
    ("LightGBM", 
     Pipeline([('scaler', scaler), 
               ('classifier', lgb.LGBMClassifier(random_state=42))])),
    
    ("CatBoost Classifier", 
     Pipeline([('scaler', scaler), 
               ('classifier', CatBoostClassifier(silent=True))]))
]

# Loop through models and evaluate
for name, model in models:
    evaluate_model(name, model, X_train_resampled, X_test, y_train_resampled, y_test)




Model: Histogram-Based Gradient Boosting
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.90      0.93      0.91     15189
           2       0.72      0.49      0.59      3751
           3       0.55      0.78      0.65      1314

    accuracy                           0.84     20257
   macro avg       0.54      0.55      0.54     20257
weighted avg       0.84      0.84      0.83     20257

ROC AUC Score: 0.8473610782478216
Confusion Matrix:
 [[    0     3     0     0]
 [   16 14133   637   403]
 [    6  1455  1851   439]
 [    1   195    87  1031]]


Parameters: { "use_label_encoder" } are not used.



Model: XGBoost
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.90      0.95      0.92     15189
           2       0.77      0.54      0.63      3751
           3       0.61      0.79      0.69      1314

    accuracy                           0.86     20257
   macro avg       0.57      0.57      0.56     20257
weighted avg       0.86      0.86      0.85     20257

ROC AUC Score: 0.9335244816578858
Confusion Matrix:
 [[    0     1     1     1]
 [   15 14358   520   296]
 [    4  1360  2026   361]
 [    1   191    87  1035]]
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009296 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3067
[LightGBM] [Info] Number of data points in the train set: 243764, number of used features: 15
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score 

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.neighbors import NearestCentroid
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline

# Load the dataset
data = pd.read_csv('updated_food_inspection_cleaned.csv')

# Convert categorical columns to category codes
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype('category').cat.codes

# Define the target column
target_column = 'Risk'

# Define features (X) and target (y)
X = data.drop([target_column], axis=1)
y = data[target_column]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Feature scaling
scaler = StandardScaler()

# Function to evaluate models
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Predict probabilities for models that support it
    y_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None
    
    print(f"Model: {name}")
    print(classification_report(y_test, y_pred))
    
    if y_proba is not None:
        try:
            roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
            print("ROC AUC Score:", roc_auc)
        except ValueError as e:
            print("Error calculating ROC AUC:", e)
    else:
        print("ROC AUC Score: N/A")
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix:\n {conf_matrix}")
    print("="*40)

# Nearest Centroid Classifier with scaling
nearest_centroid_model = Pipeline([
    ('scaler', scaler),
    ('classifier', NearestCentroid())
])

# Evaluate Nearest Centroid Classifier
evaluate_model("Nearest Centroid Classifier", nearest_centroid_model, X_train_resampled, X_test, y_train_resampled, y_test)


Model: Nearest Centroid Classifier
              precision    recall  f1-score   support

           0       0.00      1.00      0.00         3
           1       0.84      0.51      0.63     15189
           2       0.24      0.30      0.27      3751
           3       0.17      0.52      0.26      1314

    accuracy                           0.47     20257
   macro avg       0.31      0.58      0.29     20257
weighted avg       0.69      0.47      0.54     20257

ROC AUC Score: N/A
Confusion Matrix:
 [[   3    0    0    0]
 [1856 7710 3331 2292]
 [ 492 1200 1110  949]
 [ 209  243  183  679]]


In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.linear_model import RidgeClassifier, Lasso
from imblearn.over_sampling import SMOTE
from scipy import sparse
import numpy as np

# Load the dataset
data = pd.read_csv('updated_food_inspection_cleaned.csv')

# Convert categorical columns to category codes
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype('category').cat.codes

# Define the target column
target_column = 'Risk'

# Define features (X) and target (y)
X = data.drop([target_column], axis=1)
y = data[target_column]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Feature scaling
scaler = StandardScaler()

# Function to evaluate models
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # For Lasso, convert continuous predictions to binary predictions
    if isinstance(model, Lasso):
        y_pred = (y_pred > 0.5).astype(int)  # Convert continuous output to binary using threshold
    
    # Predict probabilities for models that support it
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
    
    print(f"Model: {name}")
    print(classification_report(y_test, y_pred))
    
    if y_proba is not None:
        try:
            roc_auc = roc_auc_score(y_test, y_proba)
            print("ROC AUC Score:", roc_auc)
        except ValueError as e:
            print("Error calculating ROC AUC:", e)
    else:
        print("ROC AUC Score: N/A")
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix:\n {conf_matrix}")
    print("="*40)

# List of models to evaluate
models = [
    
    ("Lasso Regression", Lasso(max_iter=500))  # Note: Lasso is typically not for classification
]

# Loop through models and evaluate each
for name, model in models:
    evaluate_model(name, model, X_train_resampled, X_test, y_train_resampled, y_test)


Model: Lasso Regression
              precision    recall  f1-score   support

           0       0.01      0.67      0.01         3
           1       0.75      0.99      0.85     15189
           2       0.00      0.00      0.00      3751
           3       0.00      0.00      0.00      1314

    accuracy                           0.74     20257
   macro avg       0.19      0.41      0.22     20257
weighted avg       0.56      0.74      0.64     20257

ROC AUC Score: N/A
Confusion Matrix:
 [[    2     1     0     0]
 [  208 14981     0     0]
 [   47  3704     0     0]
 [   12  1302     0     0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
