In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset (replace 'path_to_file.csv' with the actual file path)
file_path = 'updated_food_inspection_cleaned.csv'
food_inspection_data = pd.read_csv(file_path)

# Define the features (X) and target (y)
X = food_inspection_data.drop(columns=['Risk'])
y = food_inspection_data['Risk']

# Split the data into training and testing sets with an 80-20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Calculate the percentage of training and testing data
train_percent = (len(X_train) / len(food_inspection_data)) * 100
test_percent = (len(X_test) / len(food_inspection_data)) * 100

train_percent, test_percent


(79.99960506699051, 20.00039493300949)

In [5]:
# Redefine the target variable as 'Risk' and split the data accordingly

# Define the features (X) and target (y)
X = food_inspection_data.drop(columns=['Risk'])
y = food_inspection_data['Risk']

# Split the data into training and testing sets with an 80-20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the sizes of the split datasets
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((81026, 16), (20257, 16), (81026,), (20257,))

In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from scipy import sparse

# Load the dataset
data = pd.read_csv('updated_food_inspection_cleaned.csv')

# Convert categorical columns to category codes to reduce memory usage
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype('category').cat.codes

# Define the target column
target_column = 'Risk'  # Adjust if needed based on your encoded column names

# Ensure the target column exists
if target_column not in data.columns:
    raise ValueError(f"Target column '{target_column}' not found. Please check the column names.")

# Define features (X) and target (y)
X = data.drop([target_column], axis=1)
y = data[target_column]

# Optionally downsample the dataset (use a fraction of the data)
sample_fraction = 0.1  # Adjust the fraction as necessary to reduce the dataset size
data_sampled = data.sample(frac=sample_fraction, random_state=42)

X_sampled = data_sampled.drop([target_column], axis=1)
y_sampled = data_sampled[target_column]

# Convert the features to a sparse matrix for memory efficiency
X_sparse = sparse.csr_matrix(X_sampled)

# Train-test split
X_train_sparse, X_test_sparse, y_train, y_test = train_test_split(X_sparse, y_sampled, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler(with_mean=False)  # with_mean=False for sparse matrices
X_train_scaled = scaler.fit_transform(X_train_sparse)
X_test_scaled = scaler.transform(X_test_sparse)

# List of models to evaluate
models = [
    ("Logistic Regression", LogisticRegression(max_iter=500)),  # Increased max_iter
    ("Decision Tree", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier())
]

# Function to evaluate models and print performance metrics
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Multi-class roc_auc_score
    if y_proba is not None and len(set(y_test)) > 2:
        try:
            auc_score = roc_auc_score(y_test, model.predict_proba(X_test), multi_class='ovr')
        except ValueError as e:
            auc_score = None
            print(f"Warning: {e}")
    elif y_proba is not None:
        auc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    else:
        auc_score = None

    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print out the metrics
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {auc_score:.4f}" if auc_score else "ROC AUC: Not applicable")
    print(f"Confusion Matrix:\n {conf_matrix}")
    print("-" * 50)

# Loop through models and evaluate each
for name, model in models:
    evaluate_model(name, model, X_train_scaled, X_test_scaled, y_train, y_test)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: Logistic Regression
Accuracy: 0.7502
Precision: 0.6561
Recall: 0.7502
F1 Score: 0.6499
ROC AUC: Not applicable
Confusion Matrix:
 [[1514    4    3]
 [ 367    4    3]
 [ 127    2    2]]
--------------------------------------------------
Model: Decision Tree
Accuracy: 0.7394
Precision: 0.7458
Recall: 0.7394
F1 Score: 0.7422
ROC AUC: Not applicable
Confusion Matrix:
 [[1278  203   40]
 [ 178  169   27]
 [  37   43   51]]
--------------------------------------------------
Model: Random Forest
Accuracy: 0.8105
Precision: 0.7924
Recall: 0.8105
F1 Score: 0.7781
ROC AUC: Not applicable
Confusion Matrix:
 [[1487   22   12]
 [ 245  111   18]
 [  66   21   44]]
--------------------------------------------------


In [1]:
# Import necessary libraries
from sklearn.ensemble import (GradientBoostingClassifier, AdaBoostClassifier,
                              BaggingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier  # Ensure DecisionTreeClassifier is imported
import xgboost as xgb
import lightgbm as lgb
from sklearn.linear_model import SGDClassifier
from sklearn.discriminant_analysis import (QuadraticDiscriminantAnalysis,
                                            LinearDiscriminantAnalysis)
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd  # Ensure pandas is imported

# Load your dataset
# X, y = load_your_data()  # Replace this line with your dataset loading process

# Example for generating random data (for illustration)
# X = np.random.rand(100, 10)  # Example feature matrix with 100 samples and 10 features
# y = np.random.randint(0, 2, 100)  # Example target vector (binary classification)

# Split your data into training and testing sets
data = pd.read_csv('updated_food_inspection_cleaned.csv')

# Convert categorical columns to category codes to reduce memory usage
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype('category').cat.codes

# Define the target column
target_column = 'Risk'  # Adjust if needed based on your encoded column names

# Ensure the target column exists
if target_column not in data.columns:
    raise ValueError(f"Target column '{target_column}' not found. Please check the column names.")

# Define features (X) and target (y)
X = data.drop([target_column], axis=1)
y = data[target_column]

# Optionally downsample the dataset (use a fraction of the data)
sample_fraction = 0.1  # Adjust the fraction as necessary to reduce the dataset size
data_sampled = data.sample(frac=sample_fraction, random_state=42)

X_sampled = data_sampled.drop([target_column], axis=1)
y_sampled = data_sampled[target_column]

# Convert the features to a sparse matrix for memory efficiency
# X_sparse = scipy.sparse.csr_matrix(X_sampled)  # If you're working with sparse matrices

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply scaling if needed (StandardScaler is commonly used)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# List of models to evaluate
models = [
    ("Gradient Boosting", GradientBoostingClassifier()),  # Gradient Boosting
    ("Support Vector Machine", SVC(probability=True)),  # SVM with probability
    ("K-Nearest Neighbors", KNeighborsClassifier()),  # K-Nearest Neighbors
    ("Naive Bayes", GaussianNB()),  # Naive Bayes
    ("AdaBoost", AdaBoostClassifier()),  # AdaBoost
    ("Bagging Classifier", BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50)),  # Updated Bagging Classifier
    ("Extra Trees Classifier", ExtraTreesClassifier()),  # Extra Trees Classifier
    ("XGBoost", xgb.XGBClassifier()),  # XGBoost
    ("LightGBM", lgb.LGBMClassifier()),  # LightGBM
]

# Function to evaluate models and print performance metrics
from sklearn.metrics import classification_report

# Function to evaluate models and print performance metrics
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Multi-class roc_auc_score
    if y_proba is not None and len(set(y_test)) > 2:
        try:
            auc_score = roc_auc_score(y_test, model.predict_proba(X_test), multi_class='ovr')
        except ValueError as e:
            auc_score = None
            print(f"Warning: {e}")
    elif y_proba is not None:
        auc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    else:
        auc_score = None

    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print out the target classes
    print(f"Target Classes: {sorted(set(y_train))}")  # or use set(y_test) if you prefer
    # Print out the metrics
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {auc_score:.4f}" if auc_score else "ROC AUC: Not applicable")
    print(f"Confusion Matrix:\n {conf_matrix}")
    
    # Classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("-" * 50)

# Loop through models and evaluate each
for name, model in models:
    evaluate_model(name, model, X_train_scaled, X_test_scaled, y_train, y_test)



Target Classes: [0, 1, 2, 3]
Model: Gradient Boosting
Accuracy: 0.8301
Precision: 0.8256
Recall: 0.8301
F1 Score: 0.8026
ROC AUC: 0.9045
Confusion Matrix:
 [[    0     3     0     0]
 [   13 14923   152   101]
 [    5  2371  1174   201]
 [    3   470   122   719]]
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.84      0.98      0.91     15189
           2       0.81      0.31      0.45      3751
           3       0.70      0.55      0.62      1314

    accuracy                           0.83     20257
   macro avg       0.59      0.46      0.49     20257
weighted avg       0.83      0.83      0.80     20257

--------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Target Classes: [0, 1, 2, 3]
Model: Support Vector Machine
Accuracy: 0.7720
Precision: 0.7400
Recall: 0.7720
F1 Score: 0.6976
ROC AUC: 0.8029
Confusion Matrix:
 [[    0     3     0     0]
 [    0 15052    46    91]
 [    0  3396   179   176]
 [    0   845    61   408]]
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.78      0.99      0.87     15189
           2       0.63      0.05      0.09      3751
           3       0.60      0.31      0.41      1314

    accuracy                           0.77     20257
   macro avg       0.50      0.34      0.34     20257
weighted avg       0.74      0.77      0.70     20257

--------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Target Classes: [0, 1, 2, 3]
Model: K-Nearest Neighbors
Accuracy: 0.7746
Precision: 0.7380
Recall: 0.7746
F1 Score: 0.7432
ROC AUC: 0.7433
Confusion Matrix:
 [[    0     3     0     0]
 [    0 14327   729   133]
 [    1  2675   901   174]
 [    0   604   246   464]]
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.81      0.94      0.87     15189
           2       0.48      0.24      0.32      3751
           3       0.60      0.35      0.45      1314

    accuracy                           0.77     20257
   macro avg       0.47      0.38      0.41     20257
weighted avg       0.74      0.77      0.74     20257

--------------------------------------------------
Target Classes: [0, 1, 2, 3]
Model: Naive Bayes
Accuracy: 0.6402
Precision: 0.6520
Recall: 0.6402
F1 Score: 0.6267
ROC AUC: 0.6978
Confusion Matrix:
 [[    3     0     0     0]
 [ 1885 12666   612    26]
 [  638  2808   285 



Target Classes: [0, 1, 2, 3]
Model: AdaBoost
Accuracy: 0.7399
Precision: 0.6780
Recall: 0.7399
F1 Score: 0.6983
ROC AUC: 0.6608
Confusion Matrix:
 [[    0     3     0     0]
 [    3 14214   695   277]
 [    0  2947   570   234]
 [    0   584   526   204]]
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.80      0.94      0.86     15189
           2       0.32      0.15      0.21      3751
           3       0.29      0.16      0.20      1314

    accuracy                           0.74     20257
   macro avg       0.35      0.31      0.32     20257
weighted avg       0.68      0.74      0.70     20257

--------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Target Classes: [0, 1, 2, 3]
Model: Bagging Classifier
Accuracy: 0.9098
Precision: 0.9081
Recall: 0.9098
F1 Score: 0.9050
ROC AUC: 0.8935
Confusion Matrix:
 [[    0     3     0     0]
 [    0 14942   161    86]
 [    0  1085  2544   122]
 [    0   249   121   944]]
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.92      0.98      0.95     15189
           2       0.90      0.68      0.77      3751
           3       0.82      0.72      0.77      1314

    accuracy                           0.91     20257
   macro avg       0.66      0.60      0.62     20257
weighted avg       0.91      0.91      0.90     20257

--------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Target Classes: [0, 1, 2, 3]
Model: Extra Trees Classifier
Accuracy: 0.8833
Precision: 0.8854
Recall: 0.8833
F1 Score: 0.8715
ROC AUC: 0.9745
Confusion Matrix:
 [[    0     3     0     0]
 [    0 15046    73    70]
 [    0  1686  1969    96]
 [    0   327   108   879]]
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.88      0.99      0.93     15189
           2       0.92      0.52      0.67      3751
           3       0.84      0.67      0.75      1314

    accuracy                           0.88     20257
   macro avg       0.66      0.55      0.59     20257
weighted avg       0.89      0.88      0.87     20257

--------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Target Classes: [0, 1, 2, 3]
Model: XGBoost
Accuracy: 0.8939
Precision: 0.8924
Recall: 0.8939
F1 Score: 0.8861
ROC AUC: 0.9611
Confusion Matrix:
 [[    0     3     0     0]
 [    0 14947   161    81]
 [    0  1391  2219   141]
 [    0   256   116   942]]
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.90      0.98      0.94     15189
           2       0.89      0.59      0.71      3751
           3       0.81      0.72      0.76      1314

    accuracy                           0.89     20257
   macro avg       0.65      0.57      0.60     20257
weighted avg       0.89      0.89      0.89     20257

--------------------------------------------------
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004448 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM]

In [57]:
pip install catboost


Note: you may need to restart the kernel to use updated packages.


In [59]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import StandardScaler

# Define the models you want to evaluate
models = [
    ("CatBoost Classifier", CatBoostClassifier(verbose=0)),  # CatBoost
    ("Nearest Centroid Classifier", NearestCentroid()),  # Nearest Centroid
    ("Histogram-Based Gradient Boosting", HistGradientBoostingClassifier())  # Histogram-based GB
]

# Function to evaluate the model
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # For multi-class, predict_proba will return probabilities for all classes
    y_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None
    
    print(f"Model: {name}")
    print(classification_report(y_test, y_pred))
    
    # Calculate ROC AUC for multi-class
    if y_proba is not None:
        try:
            roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
            print("ROC AUC Score:", roc_auc)
        except ValueError as e:
            print("Error calculating ROC AUC:", e)
    else:
        print("ROC AUC Score: N/A")
    
    print("="*40)

# Assuming you have your data prepared and scaled
# Example of scaling (make sure you do this before fitting the models)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_scaled_dense)
X_test_scaled = scaler.transform(X_test_scaled_dense)

# Loop through models and evaluate each
for name, model in models:
    evaluate_model(name, model, X_train_scaled, X_test_scaled, y_train, y_test)


Model: CatBoost Classifier
              precision    recall  f1-score   support

           1       0.85      0.97      0.91      1521
           2       0.71      0.40      0.51       374
           3       0.62      0.40      0.49       131

    accuracy                           0.83      2026
   macro avg       0.73      0.59      0.63      2026
weighted avg       0.81      0.83      0.81      2026

Error calculating ROC AUC: Number of classes in y_true not equal to the number of columns in 'y_score'
Model: Nearest Centroid Classifier
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.83      0.50      0.62      1521
           2       0.25      0.32      0.28       374
           3       0.18      0.57      0.28       131

    accuracy                           0.47      2026
   macro avg       0.32      0.35      0.29      2026
weighted avg       0.68      0.47      0.54      2026

ROC AUC Score: N/A




Model: Histogram-Based Gradient Boosting
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.86      0.96      0.90      1521
           2       0.70      0.41      0.52       374
           3       0.54      0.44      0.48       131

    accuracy                           0.82      2026
   macro avg       0.52      0.45      0.48      2026
weighted avg       0.81      0.82      0.81      2026

Error calculating ROC AUC: Number of classes in y_true not equal to the number of columns in 'y_score'




In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from scipy import sparse

# Load the dataset
data = pd.read_csv('updated_food_inspection_cleaned.csv')

# Convert categorical columns to category codes to reduce memory usage
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype('category').cat.codes

# Define the target column
target_column = 'Risk'  # Adjust if needed based on your encoded column names

# Ensure the target column exists
if target_column not in data.columns:
    raise ValueError(f"Target column '{target_column}' not found. Please check the column names.")

# Define features (X) and target (y)
X = data.drop([target_column], axis=1)
y = data[target_column]

# Optionally downsample the dataset (use a fraction of the data)
sample_fraction = 0.1  # Adjust the fraction as necessary to reduce the dataset size
data_sampled = data.sample(frac=sample_fraction, random_state=42)

X_sampled = data_sampled.drop([target_column], axis=1)
y_sampled = data_sampled[target_column]

# Convert the features to a sparse matrix for memory efficiency
X_sparse = sparse.csr_matrix(X_sampled)

# Train-test split
X_train_sparse, X_test_sparse, y_train, y_test = train_test_split(X_sparse, y_sampled, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler(with_mean=False)  # with_mean=False for sparse matrices
X_train_scaled = scaler.fit_transform(X_train_sparse)
X_test_scaled = scaler.transform(X_test_sparse)

# Convert scaled sparse matrices to dense format for LDA
X_train_dense = X_train_scaled.toarray()
X_test_dense = X_test_scaled.toarray()

# Linear Discriminant Analysis Model
lda_model = LinearDiscriminantAnalysis()

# Fit the model
lda_model.fit(X_train_dense, y_train)

# Make predictions
y_pred = lda_model.predict(X_test_dense)
y_proba = lda_model.predict_proba(X_test_dense)[:, 1] if hasattr(lda_model, "predict_proba") else None

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Multi-class ROC AUC score
if y_proba is not None and len(set(y_test)) > 2:
    try:
        auc_score = roc_auc_score(y_test, lda_model.predict_proba(X_test_dense), multi_class='ovr')
    except ValueError as e:
        auc_score = None
        print(f"Warning: {e}")
elif y_proba is not None:
    auc_score = roc_auc_score(y_test, lda_model.predict_proba(X_test_dense)[:, 1])
else:
    auc_score = None

conf_matrix = confusion_matrix(y_test, y_pred)

# Print out the metrics
print("Model: Linear Discriminant Analysis")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {auc_score:.4f}" if auc_score else "ROC AUC: Not applicable")
print(f"Confusion Matrix:\n{conf_matrix}")


Model: Linear Discriminant Analysis
Accuracy: 0.7384
Precision: 0.6958
Recall: 0.7384
F1 Score: 0.6465
ROC AUC: Not applicable
Confusion Matrix:
[[1487    0   34]
 [ 362    4    8]
 [ 124    2    5]]


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import Perceptron
from scipy import sparse

# Load the dataset
data = pd.read_csv('updated_food_inspection_cleaned.csv')

# Convert categorical columns to category codes to reduce memory usage
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype('category').cat.codes

# Define the target column
target_column = 'Risk'  # Adjust if needed based on your encoded column names

# Ensure the target column exists
if target_column not in data.columns:
    raise ValueError(f"Target column '{target_column}' not found. Please check the column names.")

# Define features (X) and target (y)
X = data.drop([target_column], axis=1)
y = data[target_column]

# Optionally downsample the dataset (use a fraction of the data)
sample_fraction = 0.1  # Adjust the fraction as necessary to reduce the dataset size
data_sampled = data.sample(frac=sample_fraction, random_state=42)

X_sampled = data_sampled.drop([target_column], axis=1)
y_sampled = data_sampled[target_column]

# Convert the features to a sparse matrix for memory efficiency
X_sparse = sparse.csr_matrix(X_sampled)

# Train-test split
X_train_sparse, X_test_sparse, y_train, y_test = train_test_split(X_sparse, y_sampled, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler(with_mean=False)  # with_mean=False for sparse matrices
X_train_scaled = scaler.fit_transform(X_train_sparse)
X_test_scaled = scaler.transform(X_test_sparse)

# Convert the sparse matrices back to DataFrames for evaluation
X_test_df = pd.DataFrame.sparse.from_spmatrix(X_test_scaled)

# Function to evaluate models and print performance metrics
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Multi-class roc_auc_score
    if y_proba is not None and len(set(y_test)) > 2:
        try:
            auc_score = roc_auc_score(y_test, model.predict_proba(X_test), multi_class='ovr')
        except ValueError as e:
            auc_score = None
            print(f"Warning: {e}")
    elif y_proba is not None:
        auc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    else:
        auc_score = None

    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print out the metrics
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {auc_score:.4f}" if auc_score else "ROC AUC: Not applicable")
    print(f"Confusion Matrix:\n {conf_matrix}")
    print("-" * 50)

# List of models to evaluate
models = [
    ("Neural Network", MLPClassifier(hidden_layer_sizes=(10, 5), max_iter=500)),  # Neural Network
    ("Perceptron", Perceptron(max_iter=500))  # Perceptron
]

# Evaluate models
for name, model in models:
    evaluate_model(name, model, X_train_scaled, X_test_scaled, y_train, y_test)


Model: Neural Network
Accuracy: 0.7507
Precision: 0.5636
Recall: 0.7507
F1 Score: 0.6439
ROC AUC: Not applicable
Confusion Matrix:
 [[1521    0    0]
 [ 374    0    0]
 [ 131    0    0]]
--------------------------------------------------
Model: Perceptron
Accuracy: 0.7507
Precision: 0.5636
Recall: 0.7507
F1 Score: 0.6439
ROC AUC: Not applicable
Confusion Matrix:
 [[1521    0    0]
 [ 374    0    0]
 [ 131    0    0]]
--------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.linear_model import RidgeClassifier, Lasso
from scipy import sparse

# Load the dataset
data = pd.read_csv('updated_food_inspection_cleaned.csv')

# Convert categorical columns to category codes to reduce memory usage
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype('category').cat.codes

# Define the target column
target_column = 'Risk'  # Adjust if needed based on your encoded column names

# Ensure the target column exists
if target_column not in data.columns:
    raise ValueError(f"Target column '{target_column}' not found. Please check the column names.")

# Define features (X) and target (y)
X = data.drop([target_column], axis=1)
y = data[target_column]

# Optionally downsample the dataset (use a fraction of the data)
sample_fraction = 0.1  # Adjust the fraction as necessary to reduce the dataset size
data_sampled = data.sample(frac=sample_fraction, random_state=42)

X_sampled = data_sampled.drop([target_column], axis=1)
y_sampled = data_sampled[target_column]

# Convert the features to a sparse matrix for memory efficiency
X_sparse = sparse.csr_matrix(X_sampled)

# Train-test split
X_train_sparse, X_test_sparse, y_train, y_test = train_test_split(X_sparse, y_sampled, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler(with_mean=False)  # with_mean=False for sparse matrices
X_train_scaled = scaler.fit_transform(X_train_sparse)
X_test_scaled = scaler.transform(X_test_sparse)

# List of models to evaluate
models = [
    ("Ridge Regression", RidgeClassifier()),
    ("Lasso Regression", Lasso(max_iter=500))  # Note: Lasso may not be suitable for classification directly
]

# Function to evaluate models and print performance metrics
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # For Lasso, if it's used in classification, we can round predictions to get binary output
    if isinstance(model, Lasso):
        y_pred = (y_pred > 0.5).astype(int)  # Convert probabilities to binary predictions

    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Multi-class roc_auc_score
    if y_proba is not None and len(set(y_test)) > 2:
        try:
            auc_score = roc_auc_score(y_test, model.predict_proba(X_test), multi_class='ovr')
        except ValueError as e:
            auc_score = None
            print(f"Warning: {e}")
    elif y_proba is not None:
        auc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    else:
        auc_score = None

    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print out the metrics
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {auc_score:.4f}" if auc_score else "ROC AUC: Not applicable")
    print(f"Confusion Matrix:\n {conf_matrix}")
    print("-" * 50)

# Loop through models and evaluate each
for name, model in models:
    evaluate_model(name, model, X_train_scaled, X_test_scaled, y_train, y_test)


Model: Ridge Regression
Accuracy: 0.7498
Precision: 0.5908
Recall: 0.7498
F1 Score: 0.6448
ROC AUC: Not applicable
Confusion Matrix:
 [[1518    3    0]
 [ 373    1    0]
 [ 128    3    0]]
--------------------------------------------------
Model: Lasso Regression
Accuracy: 0.7507
Precision: 0.5636
Recall: 0.7507
F1 Score: 0.6439
ROC AUC: Not applicable
Confusion Matrix:
 [[1521    0    0]
 [ 374    0    0]
 [ 131    0    0]]
--------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
smote

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
import xgboost as xgb

# Load the dataset
data = pd.read_csv('updated_food_inspection_cleaned.csv')

# Convert categorical columns to category codes
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype('category').cat.codes

# Define the target column
target_column = 'Risk'

# Define features (X) and target (y)
X = data.drop([target_column], axis=1)
y = data[target_column]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Feature scaling
scaler = StandardScaler()

# Function to evaluate models
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Predict probabilities for models that support it
    y_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None
    
    print(f"Model: {name}")
    print(classification_report(y_test, y_pred))
    
    if y_proba is not None:
        try:
            roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
            print("ROC AUC Score:", roc_auc)
        except ValueError as e:
            print("Error calculating ROC AUC:", e)
    else:
        print("ROC AUC Score: N/A")
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix:\n {conf_matrix}")
    print("="*40)

# First set of 5 models
models_1 = [
    ("Logistic Regression", 
     Pipeline([('scaler', scaler), 
               ('classifier', LogisticRegression(max_iter=3000, class_weight='balanced'))])),
    
    ("Decision Tree", 
     Pipeline([('scaler', scaler), 
               ('classifier', DecisionTreeClassifier(class_weight='balanced'))])),
    
    ("Random Forest", 
     Pipeline([('scaler', scaler), 
               ('classifier', RandomForestClassifier(random_state=42))])),
    
    ("Gradient Boosting", 
     Pipeline([('scaler', scaler), 
               ('classifier', GradientBoostingClassifier(random_state=42))])),
    
    
]

# Loop through models and evaluate
for name, model in models_1:
    evaluate_model(name, model, X_train_resampled, X_test, y_train_resampled, y_test)


Model: Logistic Regression
              precision    recall  f1-score   support

           0       0.00      0.33      0.00         3
           1       0.84      0.59      0.69     15189
           2       0.26      0.28      0.27      3751
           3       0.17      0.57      0.26      1314

    accuracy                           0.53     20257
   macro avg       0.32      0.44      0.31     20257
weighted avg       0.69      0.53      0.59     20257

ROC AUC Score: 0.7251219333132617
Confusion Matrix:
 [[   1    1    1    0]
 [ 777 8982 2799 2631]
 [ 221 1439 1058 1033]
 [ 102  273  191  748]]
Model: Decision Tree
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.91      0.86      0.88     15189
           2       0.54      0.61      0.57      3751
           3       0.52      0.62      0.57      1314

    accuracy                           0.80     20257
   macro avg       0.49      0.52      0.51  

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from imblearn.over_sampling import SMOTE
from scipy import sparse

# Load the dataset
data = pd.read_csv('updated_food_inspection_cleaned.csv')

# Convert categorical columns to category codes to reduce memory usage
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype('category').cat.codes

# Define the target column
target_column = 'Risk'  # Adjust if needed based on your encoded column names

# Ensure the target column exists
if target_column not in data.columns:
    raise ValueError(f"Target column '{target_column}' not found. Please check the column names.")

# Define features (X) and target (y)
X = data.drop([target_column], axis=1)
y = data[target_column]

# Optionally downsample the dataset (use a fraction of the data)
sample_fraction = 0.1  # Adjust the fraction as necessary to reduce the dataset size
data_sampled = data.sample(frac=sample_fraction, random_state=42)

X_sampled = data_sampled.drop([target_column], axis=1)
y_sampled = data_sampled[target_column]

# Convert the features to a sparse matrix for memory efficiency
X_sparse = sparse.csr_matrix(X_sampled)

# Train-test split
X_train_sparse, X_test_sparse, y_train, y_test = train_test_split(X_sparse, y_sampled, test_size=0.2, random_state=42)

# Check the class distribution in the training set
print("Class distribution before SMOTE:")
print(y_train.value_counts())

# Apply SMOTE to balance the classes
# Set k_neighbors=1 to avoid issues with few minority class samples
smote = SMOTE(random_state=42, k_neighbors=1)  
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_sparse.toarray(), y_train)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test_sparse.toarray())

# Linear Discriminant Analysis Model
lda_model = LinearDiscriminantAnalysis()

# Fit the model
lda_model.fit(X_train_scaled, y_train_resampled)

# Make predictions
y_pred = lda_model.predict(X_test_scaled)
y_proba = lda_model.predict_proba(X_test_scaled)[:, 1] if hasattr(lda_model, "predict_proba") else None

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Multi-class ROC AUC score
if y_proba is not None and len(set(y_test)) > 2:
    try:
        auc_score = roc_auc_score(y_test, lda_model.predict_proba(X_test_scaled), multi_class='ovr')
    except ValueError as e:
        auc_score = None
        print(f"Warning: {e}")
elif y_proba is not None:
    auc_score = roc_auc_score(y_test, lda_model.predict_proba(X_test_scaled)[:, 1])
else:
    auc_score = None

conf_matrix = confusion_matrix(y_test, y_pred)

# Print out the metrics
print("Model: Linear Discriminant Analysis with SMOTE")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {auc_score:.4f}" if auc_score else "ROC AUC: Not applicable")
print(f"Confusion Matrix:\n{conf_matrix}")


Class distribution before SMOTE:
Risk
1    6102
2    1476
3     522
0       2
Name: count, dtype: int64
Model: Linear Discriminant Analysis with SMOTE
Accuracy: 0.5123
Precision: 0.6827
Recall: 0.5123
F1 Score: 0.5700
ROC AUC: Not applicable
Confusion Matrix:
[[  0   0   0   0]
 [121 843 308 249]
 [ 20 143 119  92]
 [  4  32  19  76]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline

# Load the dataset
data = pd.read_csv('updated_food_inspection_cleaned.csv')

# Convert categorical columns to category codes
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype('category').cat.codes

# Define the target column
target_column = 'Risk'

# Define features (X) and target (y)
X = data.drop([target_column], axis=1)
y = data[target_column]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Feature scaling
scaler = StandardScaler()

# Function to evaluate models
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Predict probabilities for models that support it
    y_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None
    
    print(f"Model: {name}")
    print(classification_report(y_test, y_pred))
    
    if y_proba is not None:
        try:
            roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
            print("ROC AUC Score:", roc_auc)
        except ValueError as e:
            print("Error calculating ROC AUC:", e)
    else:
        print("ROC AUC Score: N/A")
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix:\n {conf_matrix}")
    print("="*40)

# Second set of 5 models
models_2 = [
    ("K-Nearest Neighbors", 
     Pipeline([('scaler', scaler), 
               ('classifier', KNeighborsClassifier())])),
    
    ("Naive Bayes", 
     Pipeline([('scaler', scaler), 
               ('classifier', GaussianNB())])),
    
    ("AdaBoost", 
     Pipeline([('scaler', scaler), 
               ('classifier', AdaBoostClassifier())])),
    
    ("Bagging Classifier", 
     Pipeline([('scaler', scaler), 
               ('classifier', BaggingClassifier(n_estimators=50, random_state=42))])),
    
    ("Extra Trees Classifier", 
     Pipeline([('scaler', scaler), 
               ('classifier', ExtraTreesClassifier(random_state=42))]))
]

# Loop through models and evaluate
for name, model in models_2:
    evaluate_model(name, model, X_train_resampled, X_test, y_train_resampled, y_test)


Model: K-Nearest Neighbors
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.87      0.74      0.80     15189
           2       0.34      0.46      0.39      3751
           3       0.39      0.65      0.49      1314

    accuracy                           0.68     20257
   macro avg       0.40      0.46      0.42     20257
weighted avg       0.74      0.68      0.70     20257

ROC AUC Score: 0.7014533119797475
Confusion Matrix:
 [[    0     1     2     0]
 [   18 11216  3092   863]
 [    5  1536  1732   478]
 [    3   205   255   851]]
Model: Naive Bayes
              precision    recall  f1-score   support

           0       0.00      1.00      0.00         3
           1       0.87      0.28      0.42     15189
           2       0.19      0.44      0.27      3751
           3       0.15      0.53      0.23      1314

    accuracy                           0.32     20257
   macro avg       0.30      0.



Model: AdaBoost
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.82      0.71      0.76     15189
           2       0.21      0.21      0.21      3751
           3       0.26      0.64      0.37      1314

    accuracy                           0.61     20257
   macro avg       0.32      0.39      0.34     20257
weighted avg       0.67      0.61      0.64     20257

ROC AUC Score: 0.6606811080378101
Confusion Matrix:
 [[    0     1     0     2]
 [   54 10812  2790  1533]
 [   14  2048   788   901]
 [   11   273   185   845]]
Model: Bagging Classifier
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.91      0.93      0.92     15189
           2       0.72      0.61      0.66      3751
           3       0.61      0.71      0.66      1314

    accuracy                           0.86     20257
   macro avg       0.56      0.56  

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline

# Load the dataset
data = pd.read_csv('updated_food_inspection_cleaned.csv')

# Convert categorical columns to category codes
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype('category').cat.codes

# Define the target column
target_column = 'Risk'

# Define features (X) and target (y)
X = data.drop([target_column], axis=1)
y = data[target_column]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Feature scaling
scaler = StandardScaler()

# Function to evaluate models
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Predict probabilities for models that support it
    y_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None
    
    print(f"Model: {name}")
    print(classification_report(y_test, y_pred))
    
    if y_proba is not None:
        try:
            roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
            print("ROC AUC Score:", roc_auc)
        except ValueError as e:
            print("Error calculating ROC AUC:", e)
    else:
        print("ROC AUC Score: N/A")
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix:\n {conf_matrix}")
    print("="*40)

# Model list with only the specified classifiers
models = [
    ("Quadratic Discriminant Analysis", 
     Pipeline([('scaler', scaler), 
               ('classifier', QuadraticDiscriminantAnalysis())])),
    
    ("Histogram-Based Gradient Boosting", 
     Pipeline([('scaler', scaler), 
               ('classifier', HistGradientBoostingClassifier(random_state=42))])),
    
    ("XGBoost", 
     Pipeline([('scaler', scaler), 
               ('classifier', xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'))])),
    
    ("LightGBM", 
     Pipeline([('scaler', scaler), 
               ('classifier', lgb.LGBMClassifier(random_state=42))])),
    
    ("CatBoost Classifier", 
     Pipeline([('scaler', scaler), 
               ('classifier', CatBoostClassifier(silent=True))]))
]

# Loop through models and evaluate
for name, model in models:
    evaluate_model(name, model, X_train_resampled, X_test, y_train_resampled, y_test)




Model: Quadratic Discriminant Analysis
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.80      0.79      0.80     15189
           2       0.31      0.22      0.26      3751
           3       0.24      0.47      0.32      1314

    accuracy                           0.67     20257
   macro avg       0.34      0.37      0.34     20257
weighted avg       0.68      0.67      0.67     20257

ROC AUC Score: 0.629111773826184
Confusion Matrix:
 [[    0     2     0     1]
 [    1 12054  1719  1415]
 [    0  2375   831   545]
 [    1   571   129   613]]
Model: Histogram-Based Gradient Boosting
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.90      0.93      0.91     15189
           2       0.72      0.49      0.59      3751
           3       0.55      0.78      0.65      1314

    accuracy                           0.84     2025

Parameters: { "use_label_encoder" } are not used.



Model: XGBoost
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.90      0.95      0.92     15189
           2       0.77      0.54      0.63      3751
           3       0.61      0.79      0.69      1314

    accuracy                           0.86     20257
   macro avg       0.57      0.57      0.56     20257
weighted avg       0.86      0.86      0.85     20257

ROC AUC Score: 0.9335244816578858
Confusion Matrix:
 [[    0     1     1     1]
 [   15 14358   520   296]
 [    4  1360  2026   361]
 [    1   191    87  1035]]
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004728 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3067
[LightGBM] [Info] Number of data points in the train set: 243764, number of used features: 15
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score 

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import Perceptron
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline

# Load the dataset
data = pd.read_csv('updated_food_inspection_cleaned.csv')

# Convert categorical columns to category codes
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype('category').cat.codes

# Define the target column
target_column = 'Risk'

# Define features (X) and target (y)
X = data.drop([target_column], axis=1)
y = data[target_column]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Feature scaling
scaler = StandardScaler()

# Function to evaluate models
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Predict probabilities for models that support it
    y_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None
    
    print(f"Model: {name}")
    print(classification_report(y_test, y_pred))
    
    if y_proba is not None:
        try:
            roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
            print("ROC AUC Score:", roc_auc)
        except ValueError as e:
            print("Error calculating ROC AUC:", e)
    else:
        print("ROC AUC Score: N/A")
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix:\n {conf_matrix}")
    print("="*40)

# Model list including Neural Network and Perceptron
models = [
    ("Neural Network", 
     Pipeline([('scaler', scaler), 
               ('classifier', MLPClassifier(hidden_layer_sizes=(10, 5), max_iter=500))])),
    
    ("Perceptron", 
     Pipeline([('scaler', scaler), 
               ('classifier', Perceptron(max_iter=500))]))
]

# Loop through models and evaluate
for name, model in models:
    evaluate_model(name, model, X_train_resampled, X_test, y_train_resampled, y_test)


Model: Neural Network
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.85      0.78      0.81     15189
           2       0.29      0.29      0.29      3751
           3       0.37      0.74      0.49      1314

    accuracy                           0.68     20257
   macro avg       0.38      0.45      0.40     20257
weighted avg       0.72      0.68      0.69     20257

ROC AUC Score: 0.8094946690821875
Confusion Matrix:
 [[    0     2     1     0]
 [   27 11797  2481   884]
 [    3  1906  1081   761]
 [    4   156   187   967]]
Model: Perceptron
              precision    recall  f1-score   support

           0       0.00      0.33      0.00         3
           1       0.80      0.46      0.58     15189
           2       0.19      0.28      0.22      3751
           3       0.13      0.52      0.21      1314

    accuracy                           0.43     20257
   macro avg       0.28      0.40    

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.neighbors import NearestCentroid
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline

# Load the dataset
data = pd.read_csv('updated_food_inspection_cleaned.csv')

# Convert categorical columns to category codes
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype('category').cat.codes

# Define the target column
target_column = 'Risk'

# Define features (X) and target (y)
X = data.drop([target_column], axis=1)
y = data[target_column]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Feature scaling
scaler = StandardScaler()

# Function to evaluate models
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Predict probabilities for models that support it
    y_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None
    
    print(f"Model: {name}")
    print(classification_report(y_test, y_pred))
    
    if y_proba is not None:
        try:
            roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
            print("ROC AUC Score:", roc_auc)
        except ValueError as e:
            print("Error calculating ROC AUC:", e)
    else:
        print("ROC AUC Score: N/A")
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix:\n {conf_matrix}")
    print("="*40)

# Nearest Centroid Classifier with scaling
nearest_centroid_model = Pipeline([
    ('scaler', scaler),
    ('classifier', NearestCentroid())
])

# Evaluate Nearest Centroid Classifier
evaluate_model("Nearest Centroid Classifier", nearest_centroid_model, X_train_resampled, X_test, y_train_resampled, y_test)


Model: Nearest Centroid Classifier
              precision    recall  f1-score   support

           0       0.00      1.00      0.00         3
           1       0.84      0.51      0.63     15189
           2       0.24      0.30      0.27      3751
           3       0.17      0.52      0.26      1314

    accuracy                           0.47     20257
   macro avg       0.31      0.58      0.29     20257
weighted avg       0.69      0.47      0.54     20257

ROC AUC Score: N/A
Confusion Matrix:
 [[   3    0    0    0]
 [1856 7710 3331 2292]
 [ 492 1200 1110  949]
 [ 209  243  183  679]]


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.linear_model import RidgeClassifier, Lasso
from imblearn.over_sampling import SMOTE
from scipy import sparse
import numpy as np

# Load the dataset
data = pd.read_csv('updated_food_inspection_cleaned.csv')

# Convert categorical columns to category codes
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype('category').cat.codes

# Define the target column
target_column = 'Risk'

# Define features (X) and target (y)
X = data.drop([target_column], axis=1)
y = data[target_column]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Feature scaling
scaler = StandardScaler()

# Function to evaluate models
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # For Lasso, convert continuous predictions to binary predictions
    if isinstance(model, Lasso):
        y_pred = (y_pred > 0.5).astype(int)  # Convert continuous output to binary using threshold
    
    # Predict probabilities for models that support it
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
    
    print(f"Model: {name}")
    print(classification_report(y_test, y_pred))
    
    if y_proba is not None:
        try:
            roc_auc = roc_auc_score(y_test, y_proba)
            print("ROC AUC Score:", roc_auc)
        except ValueError as e:
            print("Error calculating ROC AUC:", e)
    else:
        print("ROC AUC Score: N/A")
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix:\n {conf_matrix}")
    print("="*40)

# List of models to evaluate
models = [
    ("Ridge Regression", RidgeClassifier()),
    ("Lasso Regression", Lasso(max_iter=500))  # Note: Lasso is typically not for classification
]

# Loop through models and evaluate each
for name, model in models:
    evaluate_model(name, model, X_train_resampled, X_test, y_train_resampled, y_test)


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


Model: Ridge Regression
              precision    recall  f1-score   support

           0       0.00      1.00      0.00         3
           1       0.84      0.55      0.66     15189
           2       0.26      0.19      0.22      3751
           3       0.18      0.58      0.28      1314

    accuracy                           0.49     20257
   macro avg       0.32      0.58      0.29     20257
weighted avg       0.69      0.49      0.56     20257

ROC AUC Score: N/A
Confusion Matrix:
 [[   3    0    0    0]
 [2489 8358 1955 2387]
 [ 628 1385  725 1013]
 [ 220  241   96  757]]
Model: Lasso Regression
              precision    recall  f1-score   support

           0       0.01      0.67      0.01         3
           1       0.75      0.99      0.85     15189
           2       0.00      0.00      0.00      3751
           3       0.00      0.00      0.00      1314

    accuracy                           0.74     20257
   macro avg       0.19      0.41      0.22     20257
weight

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline

# Load the dataset
data = pd.read_csv('updated_food_inspection_cleaned.csv')

# Convert categorical columns to category codes
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype('category').cat.codes

# Define the target column
target_column = 'Risk'

# Define features (X) and target (y)
X = data.drop([target_column], axis=1)
y = data[target_column]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Feature scaling
scaler = StandardScaler()

# Function to evaluate models
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Predict probabilities for models that support it
    y_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None
    
    print(f"Model: {name}")
    print(classification_report(y_test, y_pred))
    
    if y_proba is not None:
        try:
            roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
            print("ROC AUC Score:", roc_auc)
        except ValueError as e:
            print("Error calculating ROC AUC:", e)
    else:
        print("ROC AUC Score: N/A")
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix:\n {conf_matrix}")
    print("="*40)

# SVM Classifier with scaling
svm_model = Pipeline([
    ('scaler', scaler),
    ('classifier', SVC(probability=True, random_state=42))  # Set probability=True for ROC AUC score
])

# Evaluate SVM Classifier
evaluate_model("Support Vector Machine", svm_model, X_train_resampled, X_test, y_train_resampled, y_test)
