In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn.preprocessing

plt.style.use('Solarize_Light2')

In [2]:
train_df = pd.read_csv('train.csv')
train_df['source'] = 'train'
test_df = pd.read_csv('test.csv')
test_df['source'] = 'test'
submission = pd.read_csv('submission.csv')

In [3]:
data_df = pd.concat([train_df, test_df], ignore_index=True)

In [4]:
data_df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,source
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,train
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,train
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,train
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,train
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,train


In [5]:
data_df.apply(lambda x: sum(x.isnull()))

Item_Identifier                 0
Item_Weight                  2439
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  4016
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales            5681
source                          0
dtype: int64

In [6]:
data_df.apply(lambda x: len(x.unique()))

Item_Identifier               1559
Item_Weight                    416
Item_Fat_Content                 5
Item_Visibility              13006
Item_Type                       16
Item_MRP                      8052
Outlet_Identifier               10
Outlet_Establishment_Year        9
Outlet_Size                      4
Outlet_Location_Type             3
Outlet_Type                      4
Item_Outlet_Sales             3494
source                           2
dtype: int64

In [7]:
categorical_features = [x for x in data_df.dtypes.index if data_df.dtypes[x] == 'object']
categorical_features = [x for x in categorical_features if x not in ['source', 'Item_Identifier', 'Outlet_Identifier']]

for feature in categorical_features:
    print('Freq of categories for feature', feature)
    print(data_df[feature].value_counts(), end='\n\n')

Freq of categories for feature Item_Fat_Content
Low Fat    8485
Regular    4824
LF          522
reg         195
low fat     178
Name: Item_Fat_Content, dtype: int64

Freq of categories for feature Item_Type
Fruits and Vegetables    2013
Snack Foods              1989
Household                1548
Frozen Foods             1426
Dairy                    1136
Baking Goods             1086
Canned                   1084
Health and Hygiene        858
Meat                      736
Soft Drinks               726
Breads                    416
Hard Drinks               362
Others                    280
Starchy Foods             269
Breakfast                 186
Seafood                    89
Name: Item_Type, dtype: int64

Freq of categories for feature Outlet_Size
Medium    4655
Small     3980
High      1553
Name: Outlet_Size, dtype: int64

Freq of categories for feature Outlet_Location_Type
Tier 3    5583
Tier 2    4641
Tier 1    3980
Name: Outlet_Location_Type, dtype: int64

Freq of categories for

In [8]:
# Item weight missing values
item_weight_avg = data_df.groupby('Item_Identifier')[['Item_Weight']].mean()
miss_bool = data_df['Item_Weight'].isnull()
data_df.loc[miss_bool, 'Item_Weight'] = data_df.loc[miss_bool, 'Item_Identifier'].apply(lambda x: item_weight_avg.loc[x])

In [9]:
# Outlet size missing values
outlet_size_mode = data_df.groupby('Outlet_Type')[['Outlet_Size']].agg(lambda x: x.value_counts().index[0])
miss_bool = data_df['Outlet_Size'].isnull()
data_df.loc[miss_bool, 'Outlet_Size'] = data_df.loc[miss_bool, 'Outlet_Type'].apply(lambda x: outlet_size_mode.loc[x])

In [10]:
data_df.apply(lambda x: sum(x.isnull()))

Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                     0
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales            5681
source                          0
dtype: int64

In [11]:
# Change item_visibility from 0 to mean
item_visibility_avg = data_df.groupby('Item_Identifier')[['Item_Visibility']].mean()
miss_bool = (data_df['Item_Visibility'] == 0)
data_df.loc[miss_bool, 'Item_Visibility'] = data_df.loc[miss_bool, 'Item_Identifier'].apply(lambda x: item_visibility_avg.loc[x])

In [12]:
# New featrure - visibility mean ration
data_df['Item_Visibility_MeanRatio'] = data_df.apply(lambda x: x['Item_Visibility'] / item_visibility_avg.loc[x['Item_Identifier']], axis=1)

In [13]:
# New feature - combined item type
data_df['Item_Type_Combined'] = data_df['Item_Identifier'].apply(lambda x: x[0:2])
data_df['Item_Type_Combined'] = data_df['Item_Type_Combined'].map({'FD': 'Food',
                                                                   'NC': 'Non-Consumable',
                                                                   'DR': 'Drinks'})
data_df['Item_Type_Combined'].value_counts()

Food              10201
Non-Consumable     2686
Drinks             1317
Name: Item_Type_Combined, dtype: int64

In [14]:
# Change Item_Fat_Content feature
data_df['Item_Fat_Content'] = data_df['Item_Fat_Content'].replace({'low fat': 'Low Fat',
                                                               'LF': 'Low Fat',
                                                               'reg': 'Regular'})

In [15]:
data_df['Item_Fat_Content'].value_counts()

Low Fat    9185
Regular    5019
Name: Item_Fat_Content, dtype: int64

In [16]:
data_df.loc[data_df['Item_Type_Combined']=='Non-Consumable', 'Item_Fat_Content'] = 'Non-Edible'

In [17]:
data_df['Item_Fat_Content'].value_counts()

Low Fat       6499
Regular       5019
Non-Edible    2686
Name: Item_Fat_Content, dtype: int64

In [18]:
# New feature - Outlet years
data_df['Outlet_Years'] = 2013 - data_df['Outlet_Establishment_Year']

In [19]:
label_enc = sklearn.preprocessing.LabelEncoder()
data_df['Outlet'] = label_enc.fit_transform(data_df['Outlet_Identifier'])
categ_features = ['Outlet', 'Item_Fat_Content', 'Outlet_Size', 
                  'Outlet_Location_Type', 'Outlet_Type', 'Item_Type_Combined']
label_enc = sklearn.preprocessing.LabelEncoder()
for i in categ_features:
    data_df[i] = label_enc.fit_transform(data_df[i])

In [20]:
# One Hot
data_df = pd.get_dummies(data_df, columns=['Outlet', 'Item_Fat_Content', 'Outlet_Size', 
                                           'Outlet_Location_Type', 'Outlet_Type', 'Item_Type_Combined'])

In [21]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14204 entries, 0 to 14203
Data columns (total 37 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            14204 non-null  object 
 1   Item_Weight                14204 non-null  float64
 2   Item_Visibility            14204 non-null  float64
 3   Item_Type                  14204 non-null  object 
 4   Item_MRP                   14204 non-null  float64
 5   Outlet_Identifier          14204 non-null  object 
 6   Outlet_Establishment_Year  14204 non-null  int64  
 7   Item_Outlet_Sales          8523 non-null   float64
 8   source                     14204 non-null  object 
 9   Item_Visibility_MeanRatio  14204 non-null  float64
 10  Outlet_Years               14204 non-null  int64  
 11  Outlet_0                   14204 non-null  uint8  
 12  Outlet_1                   14204 non-null  uint8  
 13  Outlet_2                   14204 non-null  uin

In [22]:
data_df = data_df.drop(['Item_Type', 'Outlet_Establishment_Year'], axis=1)

train_df = data_df[data_df['source'] == 'train']
test_df = data_df[data_df['source'] == 'test']

train_df = train_df.drop(['source'], axis=1)
test_df = test_df.drop(['source', 'Item_Outlet_Sales'], axis=1)

train_df.to_csv('train_modified.csv', index=False)
test_df.to_csv('test_modified.csv', index=False)

In [23]:
from sklearn import metrics

def model_run(alg, train_df, test_df):
    X_train = train_df.drop(['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales'], axis=1)
    y_train = train_df['Item_Outlet_Sales']
    X_test = test_df.drop(['Item_Identifier', 'Outlet_Identifier'], axis=1)
    
    alg.fit(X_train, y_train)
    train_pred = alg.predict(X_train)
    
    return alg.predict(X_test), metrics.r2_score(y_train, train_pred)

In [24]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

rand_forest = RandomForestRegressor()
test_pred, r2_score = model_run(rand_forest, train_df, test_df)

In [25]:
submission = test_df[['Item_Identifier', 'Outlet_Identifier']]
submission['Item_Outlet_Sales'] = test_pred
submission.to_csv('my_submission.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [26]:
r2_score

0.9383114324121953