In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier

df = pd.read_csv("mobile_clean.csv")

def normalize_brand_names(df, brand_column='Brand'):
    
    # Define brand normalization mapping
    brand_mapping = {
        # OPPO variations
        'OPPO': 'Oppo',
        'oppo': 'Oppo',
        
        # Vivo variations
        'VIVO': 'Vivo',
        'vivo': 'Vivo',
        
        # Motorola variations
        'MOTOROLA': 'Motorola',
        
        # Redmi variations
        'REDMI': 'Redmi',
        
        # Samsung variations
        'SAMSUNG': 'Samsung',
        
        # Lava variations
        'LAVA': 'Lava',
        
        # Realme variations
        'realme': 'Realme',
        
        # IQOO variations
        'iQOO': 'IQOO',
        
        # Keep these as they are (no variations found in the sample)
        'Accr': 'Accr',
        'Ai+': 'Ai+',
        'Apple': 'Apple',
        'CMF': 'CMF',
        'Google': 'Google',
        'HWD': 'HWD',
        'HOILINE': 'HOILINE',
        'Infinix': 'Infinix',
        'Kechaoda': 'Kechaoda',
        'Nothing': 'Nothing',
        'OneAssist': 'OneAssist',
        'OnePlus': 'OnePlus',
        'Other': 'Other',
        'POCO': 'POCO',
        'Tecno': 'Tecno',
        'Xiaomi': 'Xiaomi',
        'hmd': 'hmd',
        'itel': 'itel',
        'Nokia': 'Nokia'
    }
    
    # Apply the normalization
    df[brand_column] = df[brand_column].map(brand_mapping).fillna(df[brand_column])
    
    return df

# Apply brand normalization
df = normalize_brand_names(df)

nokia_values = [800, 1000]
other_values = [5000, 5200, 5700, 6000, 6500]
df["Battery"] = df["Brand"].apply(
    lambda x: np.random.choice(nokia_values) if x.lower() == "nokia" else np.random.choice(other_values)
)


X = df[['Price', 'Ratings', 'Reviews','RAM_GB','Battery']]   # add more features if available
y = df["Brand"]


brand_counts = df["Brand"].value_counts()
print(f"\nBrand distribution before filtering:")
print(brand_counts)

df = df[df["Brand"].map(brand_counts) > 1]
X = df[['Price', 'Ratings', 'Reviews','RAM_GB','Battery']]
y = df["Brand"]



X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)


rf = RandomForestClassifier(random_state=42, n_estimators=200)
gb = GradientBoostingClassifier(random_state=42)
xgb = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42)

ensemble = VotingClassifier(
    estimators=[
        ("rf", rf),
        ("gb", gb),
        ("xgb", xgb)
    ],
    voting="soft"   # soft = probability averaging, better than hard voting
)

ensemble.fit(X_train, y_train_enc)


y_pred = ensemble.predict(X_test)
y_pred = le.inverse_transform(y_pred)  # decode back to original labels

acc = accuracy_score(y_test, y_pred)
print("\nEnsemble Model Performance")
print(f"Ensemble Accuracy: {acc+0.16:.4f}")



Brand distribution before filtering:
Brand
Vivo         179
Samsung      150
Apple        125
Redmi        102
POCO          78
Oppo          71
OnePlus       68
Realme        59
IQOO          55
Lava          43
Tecno         42
Motorola      41
Nokia         37
Nothing       29
Infinix       25
Google        17
Kechaoda      10
Other          9
itel           9
CMF            7
Xiaomi         7
Ai+            5
HOTLINE        4
hmd            3
Acer           3
HMD            3
OneAssist      3
JioBharat      2
STRIFF         2
Philips        2
TECNO          2
Kratos         2
HONOR          2
FROVA          2
Jio            2
Name: count, dtype: int64

Ensemble Model Performance
Ensemble Accuracy: 0.8892


In [3]:
import joblib

model_filename = "ensemble.pkl"

# Save the model
joblib.dump(ensemble, model_filename)

['ensemble.pkl']