In [63]:
import pandas as pd
import numpy as np

In [64]:
df=pd.read_csv('cleaned.csv')

In [65]:
df.drop(['age','frequency_score','awareness_score','cf_ab_score','zone_score','income_score','zas_score','bsi'],inplace=True,axis=1)

In [66]:
df.head()

Unnamed: 0,respondent_id,gender,zone,occupation,income_levels,consume_frequency(weekly),current_brand,preferable_consumption_size,awareness_of_other_brands,reasons_for_choosing_brands,flavor_preference,purchase_channel,packaging_preference,health_concerns,typical_consumption_situations,price_range,age_group
0,R00001,M,Urban,Working Professional,<10L,3-4 times,Newcomer,Medium (500 ml),0 to 1,Price,Traditional,Online,Simple,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",100-150,26-35
1,R00002,F,Metro,Working Professional,> 35L,5-7 times,Established,Medium (500 ml),2 to 4,Quality,Exotic,Retail Store,Premium,Medium (Moderately health-conscious),Social (eg. Parties),200-250,46-55
2,R00003,F,Rural,Working Professional,> 35L,3-4 times,Newcomer,Medium (500 ml),2 to 4,Availability,Traditional,Retail Store,Premium,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",200-250,36-45
3,R00004,F,Urban,Working Professional,16L - 25L,5-7 times,Newcomer,Medium (500 ml),0 to 1,Brand Reputation,Exotic,Online,Eco-Friendly,Low (Not very concerned),"Active (eg. Sports, gym)",150-200,26-35
4,R00005,M,Metro,Student,Not Reported,3-4 times,Established,Medium (500 ml),0 to 1,Availability,Traditional,Online,Premium,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",50-100,18-25


In [67]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report

In [68]:
# Define target variable
y = df['price_range']


# Define feature matrix X (excluding 'respondent_id' and the target 'price_range')
X = df.drop(columns=['respondent_id', 'price_range'])

In [69]:
print(f"Shape of X (features): {X.shape}")
print(f"Shape of y (target): {y.shape}")

Shape of X (features): (29956, 15)
Shape of y (target): (29956,)


In [70]:
le_y = LabelEncoder()
y_encoded = le_y.fit_transform(y)


In [71]:
import joblib
joblib.dump(le_y, 'label_encoder_y.joblib')

['label_encoder_y.joblib']

In [43]:
label_encode_cols = [
    'age_group',
    'income_levels',
    'health_concerns',
    'consume_frequency(weekly)',
    'preferable_consumption_size'
]

In [44]:
# Identify all categorical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Remaining categorical columns for One-Hot Encoding
one_hot_encode_cols = [col for col in categorical_cols if col not in label_encode_cols]

In [45]:
one_hot_encode_cols

['gender',
 'zone',
 'occupation',
 'current_brand',
 'awareness_of_other_brands',
 'reasons_for_choosing_brands',
 'flavor_preference',
 'purchase_channel',
 'packaging_preference',
 'typical_consumption_situations']

In [46]:
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_cols

[]

In [47]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.25, random_state=42
)

In [49]:
preprocessor = ColumnTransformer(
    transformers=[
        # OrdinalEncoder for columns specified for Label Encoding
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), label_encode_cols),
        # OneHotEncoder for all remaining categorical columns
        ('onehot', OneHotEncoder(handle_unknown='ignore'), one_hot_encode_cols),
        # Pass through numerical columns without transformation
        ('passthrough', 'passthrough', numerical_cols)
    ],
    remainder='drop' # Drop any columns not specified in transformers
)

In [50]:
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

In [16]:
print(f"Shape of encoded X_train: {X_train_encoded.shape}")
print(f"Shape of encoded X_test: {X_test_encoded.shape}")

Shape of encoded X_train: (22467, 34)
Shape of encoded X_test: (7489, 34)


In [32]:
import joblib
joblib.dump(preprocessor, 'preprocessor.pkl')

['preprocessor.pkl']

In [17]:
# models = {
#     "Gaussian Naive Bayes": GaussianNB(),
#     "Logistic Regression": LogisticRegression(), # Increased max_iter for robustness
#     "Support Vector Machine (SVM)": SVC(),
#     "Random Forest": RandomForestClassifier(),
#     "XGBoost": XGBClassifier(), # Suppress warnings
#     "Light Gradient Boosting Machine (Light GBM)": LGBMClassifier()
# }

In [18]:
# model_performance = {}

In [19]:
# for name, model in models.items():
#     print(f"\n--- Training and Evaluating: {name} ---")
#     try:
#         # Train the model
#         model.fit(X_train_encoded, y_train)

#         # Make predictions on the test set
#         y_pred = model.predict(X_test_encoded)

#         # Calculate and store performance metrics
#         accuracy = accuracy_score(y_test, y_pred)
#         report = classification_report(y_test, y_pred, target_names=le_y.classes_)

#         model_performance[name] = {
#             "Accuracy": accuracy,
#             "Classification Report": report
#         }

#         # Print performance metrics
#         print(f"Accuracy for {name}: {accuracy:.4f}")
#         print(f"Classification Report for {name}:\n{report}")
#     except Exception as e:
#         print(f"An error occurred during training/evaluation of {name}: {e}")
#         model_performance[name] = {"Accuracy": 0, "Classification Report": f"Error: {e}"}

In [20]:
# best_model_name = ""
# best_accuracy = -1.0

In [21]:
# # Iterate through the stored performance to find the best model
# for name, metrics in model_performance.items():
#     print(f"\nModel: {name}")
#     print(f"  Accuracy: {metrics['Accuracy']:.4f}")
#     # The full classification report is already printed above for each model.

#     if metrics['Accuracy'] > best_accuracy:
#         best_accuracy = metrics['Accuracy']
#         best_model_name = name

# print(f"\nBest Performing Model: {best_model_name} with an Accuracy of {best_accuracy:.4f}")

In [None]:
# models = {
#     "Gaussian Naive Bayes": GaussianNB(),
#     "Logistic Regression": LogisticRegression(random_state=42, max_iter=2000), # Increased max_iter for robustness
#     "Support Vector Machine (SVM)": SVC(random_state=42),
#     "Random Forest": RandomForestClassifier(random_state=42),
#     "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42), # Suppress warnings
#     "Light Gradient Boosting Machine (Light GBM)": LGBMClassifier(random_state=42)
# }

In [28]:
models = [
    (
        "Gaussian Naive Bayes", 
        GaussianNB(), 
        (X_train_encoded, y_train),
        (X_test_encoded, y_test)
    ),
    (
        "Logistic Regression", 
        LogisticRegression(random_state=42, max_iter=2000), 
        (X_train_encoded, y_train),
        (X_test_encoded, y_test)
    ),
    (
        "Support Vector Machine (SVM)", 
        SVC(), 
        (X_train_encoded, y_train),
        (X_test_encoded, y_test)
    ),
    (
        "Random Forest", 
        RandomForestClassifier(), 
        (X_train_encoded, y_train),
        (X_test_encoded, y_test)
    ),
    (
        "XGBoost", 
        XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42), 
        (X_train_encoded, y_train),
        (X_test_encoded, y_test)
    ),
    (
        "Light Gradient Boosting Machine (Light GBM)", 
        LGBMClassifier(random_state=42), 
        (X_train_encoded, y_train),
        (X_test_encoded, y_test)
    )
]

In [29]:
reports = []

In [30]:
import joblib

In [None]:


for model_name, model, train_set, test_set in models:
    X_train = train_set[0]
    y_train = train_set[1]
    X_test = test_set[0]
    y_test = test_set[1]
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    report1 = classification_report(y_test, y_pred)
    print(model_name)
    print(report1)
    print('\n')
    print("-"*20)
    print('\n')
    reports.append(report)
    if("XGB" in model_name):
        print('saving the model ',model_name)
        print(model)
        joblib.dump(model, 'xgb_model.pkl')

In [34]:
len(reports)


6

### testing

In [33]:
# Load the preprocessor
preprocessor_test = joblib.load('preprocessor.pkl')

# Load the model
model_test = joblib.load('xgb_model.pkl')


In [58]:
ip_data_text=preprocessor_test.transform(X_test)

In [59]:
predic_test=model_test.predict(ip_data_text)

In [61]:
accuracy_score(y_test,predic_test)

0.9166777940980104

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        # OrdinalEncoder for columns specified for Label Encoding
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), label_encode_cols),
        # OneHotEncoder for all remaining categorical columns
        ('onehot', OneHotEncoder(handle_unknown='ignore'), one_hot_encode_cols),
        # Pass through numerical columns without transformation
        ('passthrough', 'passthrough', numerical_cols)
    ],
    remainder='drop' # Drop any columns not specified in transformers
)

In [72]:
le_y.inverse_transform(predic_test)

array(['100-150', '150-200', '200-250', ..., '200-250', '150-200',
       '200-250'], dtype=object)

### mlflow

In [22]:
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import mlflow.lightgbm

In [28]:
mlflow.set_experiment("Beverage Price Prediction1")
mlflow.set_tracking_uri("http://localhost:5000") 

2025/06/18 19:33:00 INFO mlflow.tracking.fluent: Experiment with name 'Beverage Price Prediction1' does not exist. Creating a new experiment.


In [None]:

for i, element in enumerate(models):
    model_name = element[0]
    model = element[1]
    report = reports[i]
    
    with mlflow.start_run(run_name=model_name):        
        mlflow.log_param("model", model_name)
        mlflow.log_metric('accuracy', report['accuracy'])
        mlflow.log_metric('recall_class_0', report['0']['recall'])
        mlflow.log_metric('recall_class_2', report['1']['recall'])
        mlflow.log_metric('recall_class_3', report['1']['recall'])
        mlflow.log_metric('recall_class_4', report['1']['recall'])
        mlflow.log_metric('f1_score_macro', report['macro avg']['f1-score'])        
        
        if "XGB" in model_name:
            mlflow.xgboost.log_model(model, "model")
        elif "Light GBM" in model_name:
            mlflow.lightgbm.log_model(model, "model")
        else:
            mlflow.sklearn.log_model(model, "model") 


### dagshub


In [35]:
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import mlflow.lightgbm

In [36]:
import dagshub
dagshub.init(repo_owner='vinaysvinays456', repo_name='Beverage-Price-Prediction', mlflow=True)


In [37]:
import os

In [38]:
import joblib

In [40]:
#dagshub.init(repo_owner='vinaysvinays456', repo_name='Beverage-Price-Prediction', mlflow=True)

os.environ['MLFLOW_TRACKING_USERNAME'] = 'vinaysvinays456' 
os.environ['MLFLOW_TRACKING_PASSWORD'] = '9467c5eb54b1fe4fa1963d9994fa658202dff988' # 
os.environ['MLFLOW_TRACKING_URI'] = 'https://dagshub.com/vinaysvinays456/Beverage-Price-Prediction.mlflow' 


mlflow.set_experiment("Beverage Price Prediction")
#mlflow.set_tracking_uri("https://dagshub.com/vinaysvinays456/Beverage-Price-Prediction.mlflow") 

for i, element in enumerate(models):
    model_name = element[0]
    model = element[1]
    report = reports[i]
    print(model_name,model,0)
    with mlflow.start_run(run_name=model_name):        
        print(model_name,1)
        mlflow.log_metric('accuracy', report['accuracy'])
        mlflow.log_metric('recall_class_0', report['0']['recall'])
        mlflow.log_metric('recall_class_1', report['1']['recall'])
        mlflow.log_metric('recall_class_2', report['2']['recall'])
        mlflow.log_metric('recall_class_3', report['3']['recall'])
        mlflow.log_metric('f1_score_macro', report['macro avg']['f1-score'])        
        
        print(2)
        
        if "XGB" in model_name:
            print("xgb")
            mlflow.xgboost.log_model(model, "model")
            
        elif "Light GBM" in model_name:
            print("lightbg")
            mlflow.lightgbm.log_model(model, "model")
            
        else:
            print("else")
            mlflow.sklearn.log_model(model, "model") 
        print(3)


Gaussian Naive Bayes GaussianNB() 0
Gaussian Naive Bayes 1
2
else
3
Logistic Regression LogisticRegression(max_iter=2000, random_state=42) 0
Logistic Regression 1
2
else
3
Support Vector Machine (SVM) SVC() 0
Support Vector Machine (SVM) 1
2
else
3
Random Forest RandomForestClassifier() 0
Random Forest 1
2
else




3
XGBoost XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric='mlogloss', gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=6,
              max_leaves=0, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=0,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=42, reg_alpha=0, ...) 0
XGBoost 1
2
xgb
3
Light Gradient Boosting Machine (Light GBM) LGBMClassifier(random_state=42) 0
Light Gradient Boosting Machine (Light GBM) 1
2
lightbg
3
