In [1]:
import pandas as pd 
import numpy as np
from matplotlib import pyplot as plt

import seaborn as sns 
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline 

In [2]:
# load our datasets 
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('Submission.csv')

In [3]:
train.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [4]:
train.describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867,2181.288914
std,4.643456,0.051598,62.275067,8.37176,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,8.77375,0.026989,93.8265,1987.0,834.2474
50%,12.6,0.053931,143.0128,1999.0,1794.331
75%,16.85,0.094585,185.6437,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


In [5]:
# from this we can see that item_weight has missing values 
# let us look at categorical data for our dataset 


In [6]:
train.describe(include=['O'])

Unnamed: 0,Item_Identifier,Item_Fat_Content,Item_Type,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type
count,8523,8523,8523,8523,6113,8523,8523
unique,1559,5,16,10,3,3,4
top,FDW13,Low Fat,Fruits and Vegetables,OUT027,Medium,Tier 3,Supermarket Type1
freq,10,5089,1232,935,2793,3350,5577


In [7]:
train.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [8]:
train['Item_Weight']=train['Item_Weight'].fillna(train['Item_Weight'].mean())
test['Item_Weight']=test['Item_Weight'].fillna(test['Item_Weight'].mean())

In [9]:
train['Outlet_Size']= train['Outlet_Size'].fillna(train['Outlet_Size'].mode()[0])
test['Outlet_Size']= test['Outlet_Size'].fillna(test['Outlet_Size'].mode()[0])

In [10]:
#checking again for nulll values in our dataset 
train.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

In [11]:
# clear to go 

In [12]:
#let us have a look at categorical values and their distro
train['Item_Fat_Content'].value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [13]:
train.Item_Fat_Content.replace(to_replace =['LF','low fat'], value='Low Fat',inplace=True)
train.Item_Fat_Content.replace(to_replace ='reg',value='Regular',inplace=True)

test.Item_Fat_Content.replace(to_replace =['LF','low fat'], value='Low Fat',inplace=True)
test.Item_Fat_Content.replace(to_replace ='reg',value='Regular',inplace=True)

In [14]:
train['Item_Fat_Content'].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [15]:
#let us prepare our model 
# since this dataset seems to have categorical data , I will use CatBoostRegressor for this problem
# let us begin by having our categorical columns in order 
train.dtypes

Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

In [16]:
# separating our datasets 
X= train.drop('Item_Outlet_Sales',axis=1)
y= train['Item_Outlet_Sales']

In [17]:
categorical_feature_indices = np.where(X.dtypes == 'object')[0]
print(categorical_feature_indices)

[ 0  2  4  6  8  9 10]


In [18]:
# split our data using train_test_split
from sklearn.model_selection import train_test_split

X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.75, random_state=42)

X_test = test

In [19]:
from catboost import CatBoostRegressor, Pool, metrics, cv

In [20]:
# Model training 
model = CatBoostRegressor(
    loss_function='RMSE',
    random_seed=42,
    logging_level='Silent'
)

In [21]:
model.fit(
    X_train, y_train,
    cat_features=categorical_feature_indices,
    eval_set=(X_validation, y_validation),
#     logging_level='Verbose',  # you can uncomment this for text output
    plot=True
);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [22]:
cv_params = model.get_params()
print(cv_params)

{'loss_function': 'RMSE', 'random_seed': 42, 'logging_level': 'Silent'}


In [24]:
## attempting cross-validation


cv_data = cv(
    Pool(X, y, cat_features=categorical_feature_indices),
    cv_params,
    nfold = 5,
    iterations= 500,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [25]:
cv_data.head()

Unnamed: 0,iterations,test-RMSE-mean,test-RMSE-std,train-RMSE-mean,train-RMSE-std
0,0,2702.348341,42.946472,2702.467196,10.955598
1,1,2638.192555,41.738802,2638.038734,11.885251
2,2,2575.793164,42.199695,2576.090713,11.432988
3,3,2515.937221,42.793314,2516.463783,11.426668
4,4,2457.728233,43.363341,2458.528591,10.829162


In [26]:
print('Best validation RMSE score is: {:.2f}±{:.2f} on step {}'.format(
    np.max(cv_data['test-RMSE-mean']),
    cv_data['test-RMSE-std'][np.argmax(cv_data['test-RMSE-mean'])],
    np.argmax(cv_data['test-RMSE-mean'])
))

Best validation RMSE score is: 2702.35±42.95 on step 0


In [27]:
print('Precise  RMSE score: {}'.format(np.max(cv_data['test-RMSE-mean'])))

Precise  RMSE score: 2702.348340644839


In [29]:
submission.head()

Unnamed: 0,Item_Identifier,Outlet_Identifier,Item_Outlet_Sales
0,FDW58,OUT049,1777.886973
1,FDW14,OUT017,1268.269047
2,NCN55,OUT010,665.141935
3,FDQ58,OUT017,2486.532016
4,FDY38,OUT027,1364.699084


In [30]:
# Model applying 
predictions = model.predict(X_test)

In [31]:
submission = pd.DataFrame(data={
                                'Item_Identifier':test['Item_Identifier'],
                                'Outlet_Identifier':test['Outlet_Identifier'],
                                'Item_Outlet_Sales': predictions})

submission.to_csv('Submi.csv',index=False)
