In [7]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import ppscore as pps
import statsmodels.formula.api as sm

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from scipy.stats import mode

from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer


np.set_printoptions(threshold=np.inf)

In [8]:
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')

df = pd.concat([train, test])
len(train)

8523

In [9]:
piv = pd.pivot_table(df,  columns=['Outlet_Type'], values=['Outlet_Size'], aggfunc=(lambda x:mode(x)[0]) )
missing_rows = df['Outlet_Size'].isnull() 
df.loc[missing_rows,'Outlet_Size'] = df.loc[missing_rows,'Outlet_Type'].apply(lambda x: piv[x])

In [10]:
piv_weight = df.pivot_table(values = 'Item_Weight', columns='Item_Identifier')

In [11]:
missing_rows = df['Item_Weight'].isnull()

In [12]:
df.loc[missing_rows, 'Item_Weight'] = df.loc[missing_rows, 'Item_Identifier'].apply(lambda row: piv_weight[row])

In [13]:
fat_content_dict = {'LF': 'LF', 'low fat': 'LF', 'Low Fat': 'LF', 'reg': 'R', 'Regular': 'R'}

In [14]:
df.loc[:, 'Item_Fat_Content'] = df.loc[:, 'Item_Fat_Content'].apply(lambda row: fat_content_dict[row])

In [15]:
class_dict = {'FD': 'Food Items', 'DR': 'Drinks', 'NC': 'Non Consumables'}
df['Item_Class'] = df['Item_Identifier'].apply(lambda row: row[:2]).map(class_dict)

In [16]:
df.drop(columns=['Item_Identifier', 'Outlet_Identifier'], inplace = True)

In [17]:
categ_cols = ['Item_Fat_Content','Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'Item_Class']
num_col = ['Item_Weight', 'Item_Visibility', 'Item_MRP' ,'Outlet_Establishment_Year', 'Item_Outlet_Sales']

cat_df = df[categ_cols]
num_df = df[num_col]

transformer = ColumnTransformer(transformers=[("OneHot",OneHotEncoder(),categ_cols)],remainder='passthrough')
cat_df = transformer.fit_transform(cat_df).toarray()

mmscaler = MinMaxScaler()
num_df = mmscaler.fit_transform(num_df)

In [19]:
X = np.append(num_df, cat_df, axis = 1)

In [22]:
y_train = df.iloc[:8523, -2:-1]
y_train

Unnamed: 0,Item_Outlet_Sales
0,3735.1380
1,443.4228
2,2097.2700
3,732.3800
4,994.7052
...,...
8518,2778.3834
8519,549.2850
8520,1193.1136
8521,1845.5976


In [23]:
X_train, X_test = X[:8523], X[8523:]

In [27]:
xgbreg = XGBRegressor()
xgbreg.fit(X_train, y_train)
y_pred = xgbreg.predict(X_test)

In [28]:
print(y_pred)

[38.404606 38.241196 41.795265 38.86504  40.568787 40.045097 37.900288
 38.068504 37.38921  40.445084 40.7146   38.152714 40.250088 38.557194
 38.713352 39.92133  39.069424 39.269016 39.86556  38.964695 39.578804
 40.81548  37.96795  38.73393  38.898384 39.442463 38.62454  41.8625
 41.047455 38.64966  35.182434 39.046932 37.49124  38.95393  39.707233
 42.71657  37.880157 39.526913 39.505745 37.305614 37.725765 40.61018
 39.87117  39.95728  39.545547 39.611206 37.872894 39.41186  40.802536
 39.03468  38.886993 38.29816  39.34981  40.0569   39.47691  39.312454
 39.234985 37.765255 39.719307 38.15439  39.116528 39.454372 39.30946
 36.743515 42.551422 37.822266 39.58343  42.108524 40.038395 40.884884
 39.581295 39.626827 41.37205  40.40144  40.94269  39.02923  38.680523
 39.008724 41.24053  39.0877   38.963875 37.06968  39.241627 38.86504
 39.821377 41.037796 39.97418  41.114468 37.71058  40.75758  40.111584
 39.78449  39.55405  37.551968 40.21309  39.134792 38.48925  38.81642
 40.03075  3