In [1]:
import pandas as pd
import numpy as np

In [13]:
# Load the survey data from the CSV file in the working directory.
df = pd.read_csv('cleaned.csv')

In [14]:
# Remove the raw `age` column and the score/index columns listed below before modelling.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps this cell
# idempotent: re-running it once the columns are already gone no longer raises KeyError.
df = df.drop(
    columns=[
        'age',
        'frequency_score',
        'awareness_score',
        'cf_ab_score',
        'zone_score',
        'income_score',
        'zas_score',
        'bsi',
    ],
    errors='ignore',
)

In [15]:
# Preview the first five rows of the frame after the column drop.
df.head(n=5)

Unnamed: 0,respondent_id,gender,zone,occupation,income_levels,consume_frequency(weekly),current_brand,preferable_consumption_size,awareness_of_other_brands,reasons_for_choosing_brands,flavor_preference,purchase_channel,packaging_preference,health_concerns,typical_consumption_situations,price_range,age_group
0,R00001,M,Urban,Working Professional,<10L,3-4 times,Newcomer,Medium (500 ml),0 to 1,Price,Traditional,Online,Simple,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",100-150,26-35
1,R00002,F,Metro,Working Professional,> 35L,5-7 times,Established,Medium (500 ml),2 to 4,Quality,Exotic,Retail Store,Premium,Medium (Moderately health-conscious),Social (eg. Parties),200-250,46-55
2,R00003,F,Rural,Working Professional,> 35L,3-4 times,Newcomer,Medium (500 ml),2 to 4,Availability,Traditional,Retail Store,Premium,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",200-250,36-45
3,R00004,F,Urban,Working Professional,16L - 25L,5-7 times,Newcomer,Medium (500 ml),0 to 1,Brand Reputation,Exotic,Online,Eco-Friendly,Low (Not very concerned),"Active (eg. Sports, gym)",150-200,26-35
4,R00005,M,Metro,Student,Not Reported,3-4 times,Established,Medium (500 ml),0 to 1,Availability,Traditional,Online,Premium,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",50-100,18-25


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report

In [16]:
# Feature matrix: every column except the respondent identifier and the target itself.
X = df.drop(['respondent_id', 'price_range'], axis=1)

# Target: the 'price_range' label of each respondent.
y = df['price_range']

In [17]:
# Sanity check: X and y must have the same number of rows.
print(
    f"Shape of X (features): {X.shape}",
    f"Shape of y (target): {y.shape}",
    sep="\n",
)

Shape of X (features): (29956, 15)
Shape of y (target): (29956,)


In [18]:
# Encode the string price ranges as integer class ids.
# LabelEncoder orders classes by sorted string value, so '50-100' comes after '200-250'
# (this is the order seen in the classification reports further down).
le_y = LabelEncoder().fit(y)
y_encoded = le_y.transform(y)


In [19]:
# Columns routed to the OrdinalEncoder (one integer code per column).
# The list order is preserved because it fixes the column order of the encoded matrix.
label_encode_cols = [
    'age_group', 'income_levels', 'health_concerns',
    'consume_frequency(weekly)', 'preferable_consumption_size',
]

In [20]:
# Every object/category-typed feature column, in the frame's column order.
categorical_cols = list(X.select_dtypes(include=['object', 'category']).columns)

# Whatever is not ordinally encoded gets one-hot encoded instead.
one_hot_encode_cols = list(
    filter(lambda col: col not in label_encode_cols, categorical_cols)
)

In [11]:
# Display the columns selected for one-hot encoding.
one_hot_encode_cols

['gender',
 'zone',
 'occupation',
 'current_brand',
 'awareness_of_other_brands',
 'reasons_for_choosing_brands',
 'flavor_preference',
 'purchase_channel',
 'packaging_preference',
 'typical_consumption_situations']

In [21]:
# Collect int64/float64 feature columns; these are passed through the preprocessor unchanged.
numerical_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)
numerical_cols

[]

In [30]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split reproducible.
# NOTE(review): the split is not stratified on the target. Consider `stratify=y_encoded`
# if per-class proportions must match between train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, random_state=42, test_size=0.25
)

In [31]:
# Column-wise encoding pipeline for the feature matrix.
# NOTE(review): OrdinalEncoder is given no explicit `categories`, so the integer codes follow
# the sorted order of the string labels rather than their real-world order. Confirm this is
# acceptable, or pass explicit category lists per column.
preprocessor = ColumnTransformer(
    remainder='drop',  # columns not named in any transformer below are discarded
    transformers=[
        # Integer codes for the label_encode_cols columns; categories unseen at fit time map to -1.
        (
            'ordinal',
            OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value'),
            label_encode_cols,
        ),
        # Indicator columns for the remaining categoricals; unseen categories encode as all zeros.
        (
            'onehot',
            OneHotEncoder(handle_unknown='ignore'),
            one_hot_encode_cols,
        ),
        # Numeric columns are forwarded untouched.
        (
            'passthrough',
            'passthrough',
            numerical_cols,
        ),
    ],
)

In [32]:
# Learn the category vocabularies from the training rows only, then encode them.
X_train_encoded = preprocessor.fit_transform(X_train)
# Reuse the already-fitted encoders on the test rows (no refit, so nothing is learned from test data).
X_test_encoded = preprocessor.transform(X_test)

In [33]:
# Both matrices must end up with the same number of encoded columns.
print(
    f"Shape of encoded X_train: {X_train_encoded.shape}",
    f"Shape of encoded X_test: {X_test_encoded.shape}",
    sep="\n",
)

Shape of encoded X_train: (22467, 34)
Shape of encoded X_test: (7489, 34)


In [34]:
# Candidate classifiers, keyed by the display name used in the evaluation output.
# Every estimator that accepts a seed is given random_state=42 so reruns are comparable.
models = {
    "Gaussian Naive Bayes": GaussianNB(),
    # max_iter is raised to 2000 to give the solver more iterations to converge.
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=2000),
    "Support Vector Machine (SVM)": SVC(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    # `use_label_encoder` is intentionally not passed: the installed XGBoost does not
    # recognise it and only emits 'Parameters: { "use_label_encoder" } are not used.'
    # The target is already integer-encoded, so no XGBoost-side label encoding is needed.
    "XGBoost": XGBClassifier(eval_metric='mlogloss', random_state=42),
    "Light Gradient Boosting Machine (Light GBM)": LGBMClassifier(random_state=42),
}

In [35]:
# Results registry: model display name -> {"Accuracy", "Classification Report"}.
model_performance = dict()

In [36]:
# Fit every candidate on the encoded training set, score it on the held-out set,
# and record the metrics under the model's display name.
for name, model in models.items():
    print(f"\n--- Training and Evaluating: {name} ---")
    try:
        # fit() hands back the estimator itself, so training and inference are chained.
        y_pred = model.fit(X_train_encoded, y_train).predict(X_test_encoded)

        # Overall accuracy plus a per-class report labelled with the original price ranges.
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, target_names=le_y.classes_)

        print(f"Accuracy for {name}: {accuracy:.4f}")
        print(f"Classification Report for {name}:\n{report}")

        model_performance[name] = {
            "Accuracy": accuracy,
            "Classification Report": report
        }
    except Exception as e:
        # Best effort: one failing model must not abort the comparison. The failure is
        # recorded with an accuracy of 0 and the error text in place of the report.
        print(f"An error occurred during training/evaluation of {name}: {e}")
        model_performance[name] = {"Accuracy": 0, "Classification Report": f"Error: {e}"}


--- Training and Evaluating: Gaussian Naive Bayes ---
Accuracy for Gaussian Naive Bayes: 0.5421
Classification Report for Gaussian Naive Bayes:
              precision    recall  f1-score   support

     100-150       0.46      0.24      0.31      1930
     150-200       0.57      0.25      0.35      2223
     200-250       0.64      0.90      0.75      2430
      50-100       0.41      0.95      0.57       906

    accuracy                           0.54      7489
   macro avg       0.52      0.58      0.49      7489
weighted avg       0.55      0.54      0.50      7489


--- Training and Evaluating: Logistic Regression ---
Accuracy for Logistic Regression: 0.7611
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

     100-150       0.72      0.73      0.72      1930
     150-200       0.69      0.71      0.70      2223
     200-250       0.86      0.85      0.85      2430
      50-100       0.78      0.74      0.76       906

    ac

Parameters: { "use_label_encoder" } are not used.



Accuracy for XGBoost: 0.9167
Classification Report for XGBoost:
              precision    recall  f1-score   support

     100-150       0.91      0.90      0.90      1930
     150-200       0.88      0.91      0.90      2223
     200-250       0.96      0.94      0.95      2430
      50-100       0.92      0.91      0.91       906

    accuracy                           0.92      7489
   macro avg       0.92      0.91      0.92      7489
weighted avg       0.92      0.92      0.92      7489


--- Training and Evaluating: Light Gradient Boosting Machine (Light GBM) ---
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000530 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 78
[LightGBM] [Info] Number of data points in the train set: 22467, number of used features: 34
[LightGBM] [Info] Start training from score -1.343386
[LightGBM] [Info] 



In [37]:
# Display the accuracy and classification report collected for each model.
model_performance

{'Gaussian Naive Bayes': {'Accuracy': 0.5421284550674322,
  'Classification Report': '              precision    recall  f1-score   support\n\n     100-150       0.46      0.24      0.31      1930\n     150-200       0.57      0.25      0.35      2223\n     200-250       0.64      0.90      0.75      2430\n      50-100       0.41      0.95      0.57       906\n\n    accuracy                           0.54      7489\n   macro avg       0.52      0.58      0.49      7489\nweighted avg       0.55      0.54      0.50      7489\n'},
 'Logistic Regression': {'Accuracy': 0.7611163039124048,
  'Classification Report': '              precision    recall  f1-score   support\n\n     100-150       0.72      0.73      0.72      1930\n     150-200       0.69      0.71      0.70      2223\n     200-250       0.86      0.85      0.85      2430\n      50-100       0.78      0.74      0.76       906\n\n    accuracy                           0.76      7489\n   macro avg       0.76      0.75      0.76    

In [39]:
# Starting point for the best-model search: no name yet, and an accuracy below any real score.
best_model_name, best_accuracy = "", -1.0

In [40]:
# First pass: list the accuracy of every evaluated model
# (the full classification reports were already printed during training).
for name, metrics in model_performance.items():
    print(f"\nModel: {name}")
    print(f"  Accuracy: {metrics['Accuracy']:.4f}")

# Second pass: keep the first model with the strictly highest accuracy seen so far.
for name, metrics in model_performance.items():
    if not metrics['Accuracy'] > best_accuracy:
        continue
    best_model_name, best_accuracy = name, metrics['Accuracy']

print(f"\nBest Performing Model: {best_model_name} with an Accuracy of {best_accuracy:.4f}")


Model: Gaussian Naive Bayes
  Accuracy: 0.5421

Model: Logistic Regression
  Accuracy: 0.7611

Model: Support Vector Machine (SVM)
  Accuracy: 0.8418

Model: Random Forest
  Accuracy: 0.8614

Model: XGBoost
  Accuracy: 0.9167

Model: Light Gradient Boosting Machine (Light GBM)
  Accuracy: 0.9164

Best Performing Model: XGBoost with an Accuracy of 0.9167


### DagsHub / MLflow setup


In [44]:
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import mlflow.lightgbm

In [None]:
# dagshub setup
# Initialise the DagsHub client for the repository identified by repo_owner / repo_name.
# NOTE(review): mlflow=True presumably points MLflow tracking at that DagsHub repo; confirm
# against the DagsHub client documentation.
# NOTE(review): this cell has no execution count, and the import sits mid-notebook rather than
# in the top import cell.

import dagshub
dagshub.init(repo_owner='learnpythonlanguage', repo_name='mlflow_dagshub_demo', mlflow=True)

In [47]:
import dagshub