In [333]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [291]:
# Load the raw training set; the path is relative to the notebook's working directory.
data = pd.read_csv('train_kaggle.csv')

  data = pd.read_csv('train_kaggle.csv')


In [292]:
# Preview the raw frame.
data

Unnamed: 0,Customer_ID,outlet_city,luxury_sales,fresh_sales,dry_sales,cluster_catgeory
0,10493832.0,Kelaniya,1209.6,756.0,5292.0,4
1,10178643.0,Moratuwa,1590.12,1060.08,6007.12,1
2,10513916.0,Wattala,2288.88,1481.04,9155.52,4
3,10334589.0,Wattala,2274.94,1739.66,9099.76,4
4,10458365.0,Kelaniya,2345.49,2069.55,9243.99,4
...,...,...,...,...,...,...
774150,10197979.0,Gampaha,3893.4,3893.4,3448.44,3
774151,10494575.0,Katunayake,6095.86,5557.99,6275.15,3
774152,10565682.0,Colombo,5121.42,4820.16,4669.53,3
774153,10351977.0,Kandy,6311.76,6311.76,5940.48,3


In [293]:
# Check which dtype pandas inferred for each raw column.
data.dtypes

Customer_ID         float64
outlet_city          object
luxury_sales         object
fresh_sales          object
dry_sales            object
cluster_catgeory     object
dtype: object

In [294]:
# Clean a copy so the raw `data` frame is not modified by the steps below.
df = data.copy()

In [295]:
# Count missing values per column before any cleaning.
df.isna().sum()

Customer_ID          2
outlet_city          2
luxury_sales        35
fresh_sales         41
dry_sales           30
cluster_catgeory     1
dtype: int64

In [296]:
# Print the frequency table of every column, one column at a time.
for column, series in df.items():
    print("Column:", column)
    print(series.value_counts())
    print()

Column: Customer_ID
Customer_ID
10493832.0    1
10305453.0    1
10129880.0    1
10677155.0    1
10434221.0    1
             ..
10177319.0    1
10045425.0    1
10372098.0    1
10066739.0    1
10637635.0    1
Name: count, Length: 774153, dtype: int64

Column: outlet_city
outlet_city
Colombo                   41031
Batticaloa                41012
Galle                     41010
Kalmunai                  40711
Negombo                   40703
Jaffna                    40675
Katunayake                40649
Kandy                     40534
Trincomalee               40528
Gampaha                   40252
Peliyagoda                37151
Nuwara Eliya              37102
Dehiwala-Mount Lavinia    36839
Kotte                     36788
Panadura                  36726
Wattala                   36721
Kaduwela                  36695
Homagama                  36639
Moratuwa                  36417
Kelaniya                  35970
Name: count, dtype: int64

Column: luxury_sales
luxury_sales
1029.6     62
14

In [297]:
# List the distinct raw label values to spot malformed entries.
df["cluster_catgeory"].unique()

array(['4', '1', '99', '2', '5', '3', '6', '6\\', 4, 2, 1, 95, 3, 98, 5,
       6, nan, 100.0, 89.0], dtype=object)

In [298]:
# Repair the malformed label: the string "6" followed by a backslash becomes the integer 6.
df.loc[df["cluster_catgeory"] == '6\\', "cluster_catgeory"] = 6

In [299]:
# Replace missing labels with 0 so the column can be cast to int; 0 is not in the
# valid cluster list used by the filter further down, so these rows are removed there.
df["cluster_catgeory"] = df["cluster_catgeory"].fillna(0)

In [300]:
# Unify the mixed string / numeric labels into plain integers.
df["cluster_catgeory"] = df["cluster_catgeory"].astype(int)

In [301]:
# Keep only the rows whose label is one of the six valid clusters.
clusters = [1, 2, 3, 4, 5, 6]
# Take an explicit copy: the filtered frame gets columns assigned to it later, and
# assigning into a boolean-mask slice can trigger pandas' SettingWithCopyWarning.
df = df[df["cluster_catgeory"].isin(clusters)].copy()

In [302]:
# Store the cleaned labels as strings; they are label-encoded again before training.
df["cluster_catgeory"] = df["cluster_catgeory"].astype(str)

In [303]:
# Drop the rows that have no outlet city.
df = df.dropna(subset=["outlet_city"])

In [304]:
# The customer identifier is not used as a feature, so remove the column.
df = df.drop(columns=["Customer_ID"])

In [305]:
def check(column, frame=None):
    """Collect the values of a column that cannot be parsed as a number.

    Args:
        column: Name of the column to scan.
        frame: DataFrame to scan. Defaults to the notebook-level ``df``.

    Returns:
        list: Every value for which ``float(value)`` raises ``ValueError``,
        in row order and with duplicates kept. NaN parses as a float and is
        therefore not reported.
    """
    if frame is None:
        frame = df
    words = []
    for value in frame[column]:
        try:
            float(value)
        except ValueError:
            # Not parseable as a number, e.g. an amount spelled out in words.
            words.append(value)
    return words

In [306]:
# mapping values in luxury_sales
luxury_words = check("luxury_sales")
luxury_nums = [1400.00, np.nan, np.nan, 800.00, 630.00, np.nan, 1200.00, 790.00, 3200.00, 4100.00]
luxury_map = dict(zip(luxury_words, luxury_nums))

In [307]:
# Inspect the word -> number mapping before applying it.
luxury_map

{'One thousand four hundread ruppes': 1400.0,
 'nul': nan,
 'Eight hundread ruppess': 800.0,
 'six hundread and hirty ': 630.0,
 'Thousand tow hundread ': 1200.0,
 'seven hundread and nine ruppees': 790.0,
 'Three thousand two hundread ruppess': 3200.0,
 'Four thousand one hundread ruppess': 4100.0}

In [308]:
# Substitute the spelled-out amounts (and 'nul') with their numeric values.
df["luxury_sales"] = df["luxury_sales"].replace(luxury_map)

In [309]:
# mapping values in fresh_sales
fresh_words = check("fresh_sales")
fresh_nums = [605.00, np.nan, np.nan, 3500.00, np.nan, np.nan, 13000.00, 5000.00, 2700.00]
fresh_map = dict(zip(fresh_words, fresh_nums))

In [310]:
# Inspect the word -> number mapping before applying it.
fresh_map

{'Six hundread and five ruppes': 605.0,
 'nul': nan,
 'Three thousana and five hundread': 3500.0,
 'thirteen thousand ruppes': 13000.0,
 'Five thousand ruppes': 5000.0,
 'Two thousand seven hundread ruppess': 2700.0}

In [311]:
# Substitute the spelled-out amounts (and 'nul') with their numeric values.
df["fresh_sales"] = df["fresh_sales"].replace(fresh_map)

In [312]:
# mapping values in dry_sales
dry_words = check("dry_sales")
dry_nums = [4200.00, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
dry_map = dict(zip(dry_words, dry_nums))

In [313]:
# Inspect the word -> number mapping before applying it.
dry_map

{'Four thousand and two hundread ruppes': 4200.0, 'nul': nan}

In [314]:
# Substitute the spelled-out amounts (and 'nul') with their numeric values.
df["dry_sales"] = df["dry_sales"].replace(dry_map)

In [315]:
# With the text entries replaced, all three sales columns can be cast to float.
sales_columns = ["luxury_sales", "fresh_sales", "dry_sales"]
df[sales_columns] = df[sales_columns].astype(float)

In [316]:
# Renumber the rows from 0 after the earlier row removals; the old index is discarded.
df = df.reset_index(drop=True)

In [317]:
# Summarise the cleaned frame: row count, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 774147 entries, 0 to 774146
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   outlet_city       774147 non-null  object 
 1   luxury_sales      774109 non-null  float64
 2   fresh_sales       774102 non-null  float64
 3   dry_sales         774110 non-null  float64
 4   cluster_catgeory  774147 non-null  object 
dtypes: float64(3), object(2)
memory usage: 29.5+ MB


In [318]:
# Count the missing values that remain after cleaning.
df.isna().sum()

outlet_city          0
luxury_sales        38
fresh_sales         45
dry_sales           37
cluster_catgeory     0
dtype: int64

In [319]:
# Per-cluster mean of each sales column, broadcast back onto every row.
# NOTE: the result is only displayed here; it is not assigned to any variable.
df.groupby("cluster_catgeory")[["luxury_sales", "fresh_sales", "dry_sales"]].transform('mean')

Unnamed: 0,luxury_sales,fresh_sales,dry_sales
0,1748.637921,1249.938813,6742.625732
1,1563.119268,2188.484420,8436.597969
2,1748.637921,1249.938813,6742.625732
3,1748.637921,1249.938813,6742.625732
4,1748.637921,1249.938813,6742.625732
...,...,...,...
774142,4070.673250,4071.420139,4069.969537
774143,4070.673250,4071.420139,4069.969537
774144,4070.673250,4071.420139,4069.969537
774145,4070.673250,4071.420139,4069.969537


In [320]:
# Separate the label column (target) from the remaining columns (features).
y = df["cluster_catgeory"]
X = df.drop(columns=["cluster_catgeory"]).copy()

In [321]:
# Preview the feature frame.
X

Unnamed: 0,outlet_city,luxury_sales,fresh_sales,dry_sales
0,Kelaniya,1209.60,756.00,5292.00
1,Moratuwa,1590.12,1060.08,6007.12
2,Wattala,2288.88,1481.04,9155.52
3,Wattala,2274.94,1739.66,9099.76
4,Kelaniya,2345.49,2069.55,9243.99
...,...,...,...,...
774142,Gampaha,3893.40,3893.40,3448.44
774143,Katunayake,6095.86,5557.99,6275.15
774144,Colombo,5121.42,4820.16,4669.53
774145,Kandy,6311.76,6311.76,5940.48


In [322]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [323]:
# Check the size of each split.
X_train.shape, X_val.shape

((619317, 4), (154830, 4))

In [324]:
# Missing values per feature in each split, separated by a blank line.
print(X_train.isna().sum(), X_val.isna().sum(), sep="\n\n")

outlet_city      0
luxury_sales    25
fresh_sales     34
dry_sales       31
dtype: int64

outlet_city      0
luxury_sales    13
fresh_sales     11
dry_sales        6
dtype: int64


In [325]:
# Preprocessing objects, each fitted in a later cell.
imputer = SimpleImputer(strategy='mean')  # fill missing numeric values with the column mean
scaler = StandardScaler()  # standardise the numeric features
one_hot = OneHotEncoder(sparse_output=False)  # dense one-hot encoding for the categorical feature
encoder = LabelEncoder()  # maps the target labels to integer codes

In [326]:
# Feature groups: numeric columns are imputed and scaled, the categorical one is one-hot encoded.
numerical = ["luxury_sales", "fresh_sales", "dry_sales"]
categorical = ["outlet_city"]

In [327]:
# Learn the fill values from the training split only, then apply them to both splits.
imputer.fit(X_train[numerical])
X_train[numerical] = imputer.transform(X_train[numerical])
X_val[numerical] = imputer.transform(X_val[numerical])

In [328]:
# Learn the scaling statistics from the training split only, then apply them to both splits.
scaler.fit(X_train[numerical])
X_train[numerical] = scaler.transform(X_train[numerical])
X_val[numerical] = scaler.transform(X_val[numerical])

In [329]:
# Fit the one-hot encoder on the training split and swap the city column for its indicator columns.
train_encoded = one_hot.fit_transform(X_train[categorical])
encoded_names = one_hot.get_feature_names_out()
temp = pd.DataFrame(train_encoded, columns=encoded_names, index=X_train.index)
X_train = pd.concat([X_train.drop(columns="outlet_city"), temp], axis=1)

# Reuse the fitted encoder for the validation split.
val_encoded = one_hot.transform(X_val[categorical])
temp = pd.DataFrame(val_encoded, columns=encoded_names, index=X_val.index)
X_val = pd.concat([X_val.drop(columns="outlet_city"), temp], axis=1)

In [336]:
# Fit the label encoder on the training labels only, then reuse that same mapping
# for the validation labels. Refitting on the validation split could assign
# different integer codes whenever its set of classes differs from training.
y_train = encoder.fit_transform(y_train)
y_val = encoder.transform(y_val)

# Training

In [352]:
def eval(model, y_val=y_val, y_train=y_train, X_val=X_val, X_train=X_train):
    """Score a fitted model on the validation and training splits.

    The default arguments are bound to the notebook-level objects at the
    moment this cell runs. Note that the name shadows the built-in ``eval``.

    Args:
        model: Object with a ``predict`` method.
        y_val, y_train: True labels of each split.
        X_val, X_train: Inputs handed to ``model.predict`` for each split.

    Returns:
        dict: ``{'val_acc': ..., 'train_acc': ...}`` accuracy scores.
    """
    return {
        'val_acc': accuracy_score(y_val, model.predict(X_val)),
        'train_acc': accuracy_score(y_train, model.predict(X_train)),
    }

## XGBoost

In [337]:
# Wrap the preprocessed features and encoded labels in XGBoost DMatrix containers.
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [348]:
# Baseline booster configuration: shallow trees, six-class softmax objective,
# multiclass error rate as the monitored metric.
params = {
    'max_depth': 3,
    'eta': 0.1,
    'gamma': 0,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'multi:softmax',
    'num_class': 6,
    'eval_metric': 'merror'
}

# Up to 500 rounds, stopping once the last set in `evals` (validation) has not
# improved for 50 consecutive rounds.
model = xgb.train(
    params,
    train,
    num_boost_round=500,
    evals=[(train, 'train'), (valid, 'valid')],
    early_stopping_rounds=50,
)

[0]	train-merror:0.16709	valid-merror:0.16853
[1]	train-merror:0.16918	valid-merror:0.17097
[2]	train-merror:0.13467	valid-merror:0.13488
[3]	train-merror:0.12665	valid-merror:0.12780
[4]	train-merror:0.12864	valid-merror:0.12924
[5]	train-merror:0.11917	valid-merror:0.12000
[6]	train-merror:0.10354	valid-merror:0.10437
[7]	train-merror:0.08792	valid-merror:0.08826
[8]	train-merror:0.09008	valid-merror:0.09017
[9]	train-merror:0.08159	valid-merror:0.08250
[10]	train-merror:0.07418	valid-merror:0.07502
[11]	train-merror:0.07256	valid-merror:0.07330
[12]	train-merror:0.06035	valid-merror:0.06063
[13]	train-merror:0.05799	valid-merror:0.05838
[14]	train-merror:0.05730	valid-merror:0.05762
[15]	train-merror:0.05432	valid-merror:0.05466
[16]	train-merror:0.05298	valid-merror:0.05343
[17]	train-merror:0.05131	valid-merror:0.05180
[18]	train-merror:0.04850	valid-merror:0.04914
[19]	train-merror:0.04614	valid-merror:0.04684
[20]	train-merror:0.04593	valid-merror:0.04634
[21]	train-merror:0.042

In [350]:
# Predict class ids for the validation DMatrix and compare them with the encoded labels.
y_pred = model.predict(valid)

accuracy = accuracy_score(y_val, y_pred)
print("Final model accuracy:", accuracy)

Final model accuracy: 0.9997545695278692


In [354]:
# `model` comes from xgb.train, so hand eval() the DMatrix objects rather than the
# DataFrames bound as its defaults.
eval(model, X_train=train, X_val=valid)

{'val_acc': 0.9997545695278692, 'train_acc': 0.9997723298407762}

In [None]:
def objective(params):
    """Hyperopt objective: train a booster with ``params`` and score it.

    Args:
        params: XGBoost parameter dict sampled from the search space.

    Returns:
        dict: ``{'loss': -accuracy, 'status': STATUS_OK}``. Accuracy on the
        validation set is negated because ``fmin`` minimises the loss.
    """
    booster = xgb.train(
        params=params,
        dtrain=train,
        num_boost_round=100,
        evals=[(valid, "validation")],
        early_stopping_rounds=20
    )
    y_pred = np.asarray(booster.predict(valid))
    # With 'multi:softprob' the booster returns one probability per class rather
    # than a class id per row, which accuracy_score cannot compare with y_val.
    # Reduce the probabilities to the most likely class in that case.
    if y_pred.size != len(y_val):
        y_pred = y_pred.reshape(len(y_val), -1).argmax(axis=1)
    accuracy = accuracy_score(y_val, y_pred)

    return {'loss': -accuracy, 'status': STATUS_OK}

In [None]:
# Hyperparameter search space: integer tree depth, log-uniform learning rate,
# regularisation strengths and minimum child weight; objective and class count are fixed.
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'multi:softprob',
    'num_class': 6
}

# Keep a handle on the Trials object so the per-evaluation losses and sampled
# parameters can be inspected after the search instead of being discarded.
trials = Trials()

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=20,
    trials=trials
)