In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [19]:
train_set_file = 'Train.csv'
test_set_file = 'Test.csv'

data_train = pd.read_csv(train_set_file)
data_test = pd.read_csv(test_set_file)


In [20]:
print(data_train.shape)
print(data_train.info())
data_train.head()

(8523, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB
None


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [21]:
print(data_test.shape)
print(data_test.info())
data_test.head()

(5681, 11)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5681 entries, 0 to 5680
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            5681 non-null   object 
 1   Item_Weight                4705 non-null   float64
 2   Item_Fat_Content           5681 non-null   object 
 3   Item_Visibility            5681 non-null   float64
 4   Item_Type                  5681 non-null   object 
 5   Item_MRP                   5681 non-null   float64
 6   Outlet_Identifier          5681 non-null   object 
 7   Outlet_Establishment_Year  5681 non-null   int64  
 8   Outlet_Size                4075 non-null   object 
 9   Outlet_Location_Type       5681 non-null   object 
 10  Outlet_Type                5681 non-null   object 
dtypes: float64(3), int64(1), object(7)
memory usage: 488.3+ KB
None


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.3,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,,Tier 2,Supermarket Type1
4,FDY38,,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3


In [22]:
target = data_train['Item_Outlet_Sales']
data_train.drop('Item_Outlet_Sales', axis = 1, inplace = True)

data = pd.concat([data_train, data_test], ignore_index = True)

print(data.shape)

(14204, 11)


In [23]:
print(data_test.shape[0]/data.shape[0])


0.3999577583779217


In [24]:
data['Item_Fat_Content'].replace({'LF': 'low fat', 'Low Fat' : 'low fat', 'reg': 'regular', 'Regular': 'regular'}, inplace=True)
data['Item_Fat_Content'].unique()


array(['low fat', 'regular'], dtype=object)

In [25]:
num_features = ['Item_Weight', 'Item_Visibility', 'Item_MRP']
num_transformer = Pipeline(steps = [('imputer', SimpleImputer(strategy='mean')), ('scaler', StandardScaler())])

In [26]:
cat_features = ['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
cat_transformer = Pipeline(steps = [('imputer', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown = 'ignore'))])

In [27]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)
    ])

In [28]:
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
    'XGBoost': XGBRegressor()
}
X_train = data[:8523]
X_test = data[8523:14204]

In [33]:
# Applying Soft Voting to fill the target of the test set
from sklearn.metrics import r2_score

test_predict_df = pd.DataFrame(index = X_test.index)
train_pred_df = pd.DataFrame(index = X_train.index)

for name, model in models.items():
  clf = Pipeline(steps = [('preprocessing', preprocessor), ('classifier', model)])
  clf.fit(X_train, target)
  test_predict_df[name] = clf.predict(X_test)
  train_pred_df[name] = clf.predict(X_train)

soft_voting_predicts = test_predict_df.mean(axis = 1)
soft_voting_predict_train = train_pred_df.mean(axis = 1)

r2_train = r2_score(target, soft_voting_predict_train)
print("Training R^2:", r2_train)

X_test['Item_Outlet_Sales'] = soft_voting_predicts

X_test.head()


Training R^2: 0.8235365782208853


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['Item_Outlet_Sales'] = soft_voting_predicts


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
8523,FDW58,20.75,low fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1,1553.827808
8524,FDW14,8.3,regular,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1,1282.455001
8525,NCN55,14.6,low fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store,1130.730346
8526,FDQ58,7.315,low fat,0.015388,Snack Foods,155.034,OUT017,2007,,Tier 2,Supermarket Type1,2268.19754
8527,FDY38,,regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3,6005.073719
