In [57]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import ppscore as pps
import statsmodels.formula.api as sm
import xgboost as xgb

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from scipy.stats import mode

from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer


np.set_printoptions(threshold=np.inf)

In [8]:
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')

df = pd.concat([train, test])
len(train)

8523

In [3]:
df = pd.read_csv('Train.csv')

In [4]:
piv = pd.pivot_table(df,  columns=['Outlet_Type'], values=['Outlet_Size'], aggfunc=(lambda x:mode(x)[0]) )
missing_rows = df['Outlet_Size'].isnull() 
df.loc[missing_rows,'Outlet_Size'] = df.loc[missing_rows,'Outlet_Type'].apply(lambda x: piv[x])

In [11]:
piv_weight = df.pivot_table(values = 'Item_Weight', columns='Item_Identifier')

In [None]:
total_weight_mean = df['Item_Weight'].dropna().values.mean()
total_weight_mean

In [12]:
for i in df['Item_Identifier'].values:
    if i not in piv_weight.columns:
        print(i)

FDN52
FDK57
FDE52
FDQ60


In [13]:
df.loc[(df['Item_Identifier'] == 'FDN52')|(df['Item_Identifier'] == 'FDE52')|(df['Item_Identifier'] == 'FDK57')|(df['Item_Identifier'] == 'FDQ60'), 'Item_Weight'] = total_weight_mean

In [14]:
missing_rows = df['Item_Weight'].isnull()

In [15]:
df.loc[missing_rows, 'Item_Weight'] = df.loc[missing_rows, 'Item_Identifier'].apply(lambda row: piv_weight[row])

In [16]:
fat_content_dict = {'LF': 'LF', 'low fat': 'LF', 'Low Fat': 'LF', 'reg': 'R', 'Regular': 'R'}

In [17]:
df.loc[:, 'Item_Fat_Content'] = df.loc[:, 'Item_Fat_Content'].apply(lambda row: fat_content_dict[row])

In [18]:
class_dict = {'FD': 'Food Items', 'DR': 'Drinks', 'NC': 'Non Consumables'}
df['Item_Class'] = df['Item_Identifier'].apply(lambda row: row[:2]).map(class_dict)

In [19]:
df.drop(columns=['Item_Identifier', 'Outlet_Identifier'], inplace = True)

In [21]:
categ_cols = ['Item_Fat_Content','Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'Item_Class']
num_col = ['Item_Weight', 'Item_Visibility', 'Item_MRP' ,'Outlet_Establishment_Year', 'Item_Outlet_Sales']

cat_df = df[categ_cols]
num_df = df[num_col]

transformer = ColumnTransformer(transformers=[("OneHot",OneHotEncoder(),categ_cols)],remainder='passthrough')
cat_df = transformer.fit_transform(cat_df).toarray()

mmscaler = MinMaxScaler()
num_df = mmscaler.fit_transform(num_df)

In [22]:
X = np.append(num_df, cat_df, axis = 1)

In [39]:
y = df.iloc[:, -2:-1].values

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 0 )

In [73]:
grid = {
    'eta': .1,
    'gamma': 0,
    'max_depth': 4,
    'subsample': 0.5,
    'reg_lambda': 0.1,
    'alpha': 0.5
}

In [59]:
xgbreg = XGBRegressor()

In [62]:
data_dmatrix = xgb.DMatrix(data=X,label=y)

In [78]:
cv_results = xgb.cv(dtrain=data_dmatrix, params=grid, nfold=3,
                    num_boost_round=50,early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=0)

In [80]:
cv_results.head()

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,2492.998291,12.799372,2492.755615,26.65862
1,2244.870768,11.230646,2244.802572,24.047221
2,2021.550578,10.052089,2021.458008,20.84837
3,1820.695557,8.98665,1820.31783,18.427176
4,1639.402303,8.060958,1639.024943,16.444024
