In [75]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from matplotlib.cm import get_cmap
from matplotlib.colors import Normalize

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder,StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, f1_score, classification_report

from sklearn.linear_model import LogisticRegression, Ridge, Lasso, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [76]:
# Reimport the data
df_cleaned = pd.read_csv('train.csv')

num_features = df_cleaned.select_dtypes(include='number').drop('Scoville Heat Units (SHU)', axis = 1).columns

for col in num_features:
    # Impute missing values with median
    df_cleaned[col] = df_cleaned[col].fillna(df_cleaned[col].median())
    
    # Remove outliers
    q99 = df_cleaned[col].quantile(0.995)
    q01 = df_cleaned[col].quantile(0.001)

    # Filter rows — keep only values within the quantile range
    df_cleaned = df_cleaned[(df_cleaned[col] <= q99) & (df_cleaned[col] >= q01)]
    
DROP_COLS = ['color', 'Harvest Time', 'Average Temperature During Storage (celcius)']
drop_columns = FunctionTransformer(lambda df: df.drop(columns=DROP_COLS, axis = 1))

# Preprocessing pipeline
preprocessor = Pipeline([
    ('drop_cols', drop_columns),
    ('scaler', StandardScaler())
])

In [77]:
# Splitting data
X = df_cleaned.drop(columns=['Scoville Heat Units (SHU)'])
y = df_cleaned['Scoville Heat Units (SHU)']


# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [78]:
# Evaluate function
def evaluate_model(name, model, param_grid):
    print(f"\n Grid Search for {name}")
    
    pipe = Pipeline ([
        ('preprocess', preprocessor),
        ('regressor', model)
    ])
    
    # Adjust param grid to reflect the pipeline's step name
    param_grid = {f'regressor__{key}': val for key, val in param_grid.items()}

    grid = GridSearchCV(pipe, param_grid=param_grid, cv=5,
                        scoring='neg_mean_squared_error', n_jobs=-1)
    grid.fit(X_train, y_train) 

    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Best Params: {grid.best_params_}")
    print(f"MSE: {mse:.2f}")
    print(f"R² Score: {r2:.4f}")
    return best_model

### (A) REGRESSION ANALYSIS

In [79]:
# 1. Linear Regression
lr_model = evaluate_model("Linear Regression", LinearRegression(), {})

# 2. Ridge Regression
ridge_model = evaluate_model("Ridge Regression", Ridge(), {'alpha': [0.01, 0.1, 1, 10]})

# 3. Lasso Regression
lasso_model = evaluate_model("Lasso Regression", Lasso(), {'alpha': [0.01, 0.1, 1, 10]})

# 4. Decision Tree
dt_model = evaluate_model("Decision Tree", DecisionTreeRegressor(random_state=42), {
    'max_depth': [3, 5, 10, None]
})

# 5. Random Forest
rf_model = evaluate_model("Random Forest", RandomForestRegressor(random_state=42), {
    'n_estimators': [50, 100],
    'max_depth': [None, 10]
})

# 6. Gradient Boosting
gb_model = evaluate_model("Gradient Boosting", GradientBoostingRegressor(random_state=42), {
    'n_estimators': [50, 100],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5]
})

# 7. Support Vector Regressor
svr_model = evaluate_model("Support Vector Regressor", SVR(), {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf']
})


 Grid Search for Linear Regression
Best Params: {}
MSE: 9506577332.59
R² Score: 0.2187

 Grid Search for Ridge Regression
Best Params: {'regressor__alpha': 10}
MSE: 9504270625.54
R² Score: 0.2189

 Grid Search for Lasso Regression
Best Params: {'regressor__alpha': 10}
MSE: 9506554709.58
R² Score: 0.2187

 Grid Search for Decision Tree
Best Params: {'regressor__max_depth': 3}
MSE: 10352561694.54
R² Score: 0.1492

 Grid Search for Random Forest
Best Params: {'regressor__max_depth': 10, 'regressor__n_estimators': 100}
MSE: 9308571875.01
R² Score: 0.2350

 Grid Search for Gradient Boosting
Best Params: {'regressor__learning_rate': 0.05, 'regressor__max_depth': 3, 'regressor__n_estimators': 50}
MSE: 9497100049.68
R² Score: 0.2195

 Grid Search for Support Vector Regressor
Best Params: {'regressor__C': 10, 'regressor__kernel': 'linear'}
MSE: 16740657601.36
R² Score: -0.3758


### (B) Multi-class classification analysis with an ensemble classifier.

In [80]:
# Define bin count
num_bin = 50
X_binned = X.copy()
y_binned = pd.qcut(y, q=num_bin, labels=False, duplicates='drop')

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X_binned, y_binned, test_size=0.2, random_state=42)

# Build classification pipeline
clf_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Hyperparameter tuning
param_grid_rf = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [5, 10]
}

grid_clf = GridSearchCV(clf_pipeline, param_grid=param_grid_rf, 
                        cv=5, scoring='f1_macro')

grid_clf.fit(X_train, y_train)
print('Best hyperparameters:', grid_clf.best_params_)

# Evaluate
y_pred = grid_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy score:', accuracy)
print(classification_report(y_test, y_pred))

Best hyperparameters: {'classifier__max_depth': 10, 'classifier__n_estimators': 100}
Accuracy score: 0.49732620320855614
              precision    recall  f1-score   support

           0       0.59      1.00      0.74        92
           1       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         4
           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00         4
           5       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         4
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         5
           9       0.00      0.00      0.00         4
          10       0.00      0.00      0.00         5
          11       0.00      0.00      0.00         3
          12       0.00      0.00      0.00         4
          13       1.00      0.25      0.40         4
          14       0.00      0.00      0.00         8
          15  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [81]:
for bins in [3, 5, 7, 10]:
    y_binned = pd.qcut(y, q=bins, labels=False, duplicates= 'drop')
    scores = cross_val_score(clf_pipeline, X, y_binned, cv=5, scoring='accuracy')
    print(f"{bins} bins → Accuracy: {np.mean(scores):.4f}")

3 bins → Accuracy: 0.7390
5 bins → Accuracy: 0.6545
7 bins → Accuracy: 0.6385
10 bins → Accuracy: 0.5947


In [82]:
df_test = pd.read_csv('test.csv')
# Bin original y (for training and mapping reference)
y_binned = pd.qcut(y, q=num_bin, labels=False, duplicates='drop')

# Now, get the bin intervals from the dtype of the binned series
bin_intervals = pd.qcut(y, q=num_bin, duplicates='drop').dtype.categories

bin_midpoints = [interval.mid for interval in bin_intervals]

# Predict bin labels on unseen/test data
y_pred_bins = grid_clf.predict(df_test) 

# Convert bin label to SHU approximation using bin midpoints
y_pred_shu = [bin_midpoints[int(bin_label)] for bin_label in y_pred_bins]

y_test_kaggle = pd.DataFrame(y_pred_shu, columns=["Scoville Heat Units (SHU)"])
y_test_kaggle.index.name = "index"
y_test_kaggle[['Scoville Heat Units (SHU)']].to_csv("kaggle.csv")

In [83]:
bin_intervals

IntervalIndex([      (-0.001, 1687.746],    (1687.746, 11696.862],
                 (11696.862, 21014.122],    (21014.122, 33524.02],
                  (33524.02, 44015.419],   (44015.419, 55603.204],
                 (55603.204, 69801.462],   (69801.462, 81131.709],
                 (81131.709, 92165.908],  (92165.908, 104195.538],
               (104195.538, 115786.399],  (115786.399, 130505.42],
                (130505.42, 144631.227], (144631.227, 156024.772],
               (156024.772, 172074.327], (172074.327, 181797.978],
               (181797.978, 195615.438],  (195615.438, 223328.71],
                (223328.71, 242854.924], (242854.924, 264765.219],
               (264765.219, 290983.956], (290983.956, 325945.179],
               (325945.179, 377966.343],  (377966.343, 527639.86]],
              dtype='interval[float64, right]')

**Comment**: 

Accuracy vs. Number of Bins

As the number of bins decreases (e.g., from 10 to 3), classification accuracy tends to increase. This is because it's easier for the classifier to correctly predict broader categories. For example, since a large portion of the dataset consists of peppers with a SHU of 0, the classifier can perform well simply by assigning many samples to the first bin (which includes 0). This boosts accuracy but oversimplifies the predictions.


Impact on SHU Prediction Error (MAE or MSE)

However, when we convert bin predictions back into SHU values (e.g., using the midpoint of each interval), this coarse binning becomes problematic. If the first bin covers a wide range, say SHU 0 to 1600, and its midpoint is around 800, then all predictions in that bin are assigned ~800 — even though many actual values are 0. This creates large errors in SHU estimation (e.g., MAE or MSE).


Effect of Increasing Number of Bins

Using more bins creates finer granularity, allowing the model to make more precise distinctions between SHU ranges. This reduces the error when mapping bins back to SHU values. However, as the number of bins increases too much (e.g., beyond 100), bins may become too narrow, class imbalance increases, and the classifier may struggle. As a result, SHU prediction error (e.g., on Kaggle) stops improving significantly beyond a certain point.

### (C) A two-step analysis (two sequential pipelines)


In [84]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_train_mapped = np.where(y_train > 0, 1, 0)

In [85]:
class_pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])


param_grid1 = {
'classifier__n_estimators': [100, 200],
'classifier__max_depth': [10, 20],
'classifier__min_samples_split': [5, 10],
'classifier__min_samples_leaf': [2, 4],
}

gs_class = GridSearchCV(class_pipe, param_grid1, cv=5, scoring='accuracy')
gs_class.fit(X_train, y_train_mapped)


best_classifier = gs_class.best_estimator_
class_pred = best_classifier.predict(X)

In [86]:
spicy_peppers_indices = np.where(class_pred == 1)[0]

X_reg = X.iloc[spicy_peppers_indices]
y_reg = y.iloc[spicy_peppers_indices]

X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42, stratify=y)

ValueError: Found input variables with inconsistent numbers of samples: [438, 935]

In [None]:
reg_pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

param_grid2 = {
'regressor__n_estimators': [100, 200],
'regressor__max_depth': [10, 20],
'regressor__min_samples_split': [5, 10],
'regressor__min_samples_leaf': [2, 4],
}

gs_reg = GridSearchCV(reg_pipe, param_grid2, cv=5, scoring= 'neg_mean_squared_error')
gs_reg.fit(X_reg_train, y_reg_train)

best_regressor = gs_reg.best_estimator_
reg_pred = best_regressor.predict(X_reg)

NameError: name 'X_reg_train' is not defined