In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor,plot_tree
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

In [2]:
# Load the training data from the CSV file in the working directory.
train = pd.read_csv('train_v9rqX0R.csv')

In [3]:
# Count missing values per column of the training frame.
train.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [4]:
# Features: everything except the target and the two identifier columns.
X_train = train.drop(
    columns=['Item_Outlet_Sales', 'Outlet_Identifier', 'Item_Identifier']
)
# Target: the sales column the model is trained to predict.
y_train = train['Item_Outlet_Sales']

In [5]:
# Imputer that fills missing entries with the most frequent value of the column.
imp_mode = SimpleImputer(strategy="most_frequent")
# Imputer that fills missing entries with the column median.
imp_med = SimpleImputer(strategy='median')

In [6]:
# One-hot encoder producing a dense array.
# handle_unknown='ignore' makes a category level that was not seen during fit
# encode as an all-zero row at predict time instead of raising ValueError
# (the default, handle_unknown='error', would abort the prediction).
ohc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

In [7]:
# Imputation stage: object-dtype columns go through the most-frequent imputer,
# int64/float64 columns go through the median imputer. Output is a pandas
# DataFrame whose column names are not prefixed with the transformer name.
ct_imp = make_column_transformer(
    (imp_mode, make_column_selector(dtype_include=object)),
    (imp_med, make_column_selector(dtype_include=['int64', 'float64'])),
    verbose_feature_names_out=False,
).set_output(transform="pandas")

In [8]:
# Encoding stage: object-dtype columns are one-hot encoded, int64/float64
# columns are passed through unchanged. Output is a pandas DataFrame whose
# column names are not prefixed with the transformer name.
ctc_enc = make_column_transformer(
    (ohc, make_column_selector(dtype_include=object)),
    ('passthrough', make_column_selector(dtype_include=['int64', 'float64'])),
    verbose_feature_names_out=False,
).set_output(transform="pandas")

In [9]:
# Decision-tree regressor with a fixed seed so repeated fits are reproducible.
dtr = DecisionTreeRegressor(random_state=23)

In [10]:
# Full model: impute -> encode -> decision tree, applied in that order.
pipe = Pipeline(
    steps=[
        ('IMPUTE', ct_imp),
        ('ENC', ctc_enc),
        ('TREE', dtr),
    ]
)

In [11]:
# Fit every pipeline stage on the training features and target.
pipe.fit(X_train, y_train)

In [12]:
# Load the test data and print its column dtypes and non-null counts.
test = pd.read_csv('test_AbJTz2l.csv')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5681 entries, 0 to 5680
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            5681 non-null   object 
 1   Item_Weight                4705 non-null   float64
 2   Item_Fat_Content           5681 non-null   object 
 3   Item_Visibility            5681 non-null   float64
 4   Item_Type                  5681 non-null   object 
 5   Item_MRP                   5681 non-null   float64
 6   Outlet_Identifier          5681 non-null   object 
 7   Outlet_Establishment_Year  5681 non-null   int64  
 8   Outlet_Size                4075 non-null   object 
 9   Outlet_Location_Type       5681 non-null   object 
 10  Outlet_Type                5681 non-null   object 
dtypes: float64(3), int64(1), object(7)
memory usage: 488.3+ KB


In [13]:
# Test features: drop the two identifier columns.
X_test = test.drop(columns=['Outlet_Identifier', 'Item_Identifier'])

In [14]:
# Run the fitted pipeline on the test features to get predicted values.
prediction = pipe.predict(X_test)

In [15]:
# Display the array of predicted values.
prediction

array([1856.2504, 1230.3984,  759.012 , ..., 1540.6612, 6471.576 ,
       2157.192 ])

In [16]:
# Display the test feature frame.
X_test

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,20.750,Low Fat,0.007565,Snack Foods,107.8622,1999,Medium,Tier 1,Supermarket Type1
1,8.300,reg,0.038428,Dairy,87.3198,2007,,Tier 2,Supermarket Type1
2,14.600,Low Fat,0.099575,Others,241.7538,1998,,Tier 3,Grocery Store
3,7.315,Low Fat,0.015388,Snack Foods,155.0340,2007,,Tier 2,Supermarket Type1
4,,Regular,0.118599,Dairy,234.2300,1985,Medium,Tier 3,Supermarket Type3
...,...,...,...,...,...,...,...,...,...
5676,10.500,Regular,0.013496,Snack Foods,141.3154,1997,Small,Tier 1,Supermarket Type1
5677,7.600,Regular,0.142991,Starchy Foods,169.1448,2009,Medium,Tier 3,Supermarket Type2
5678,10.000,Low Fat,0.073529,Health and Hygiene,118.7440,2002,,Tier 2,Supermarket Type1
5679,15.300,Regular,0.000000,Canned,214.6218,2007,,Tier 2,Supermarket Type1


In [17]:
# Load the submission file, using its first column as the row index.
sample_submission_df = pd.read_csv(
    'sample_submission_8RXa3c6.csv',
    index_col=0,
)

In [18]:
# Set the sales column to the model predictions; values are matched to rows
# by position, not by index label.
sample_submission_df = sample_submission_df.assign(Item_Outlet_Sales=prediction)

In [19]:
# Display the submission frame.
sample_submission_df

Unnamed: 0_level_0,Outlet_Identifier,Item_Outlet_Sales
Item_Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1
FDW58,OUT049,1856.2504
FDW14,OUT017,1230.3984
NCN55,OUT010,759.0120
FDQ58,OUT017,4287.7520
FDY38,OUT027,7170.6660
...,...,...
FDB58,OUT046,3829.0158
FDD47,OUT018,4512.1266
NCO17,OUT045,1540.6612
FDJ26,OUT017,6471.5760


In [20]:
# Write the submission frame (index included) to CSV.
# NOTE(review): this is the same path the submission frame was read from, so
# the original file contents are overwritten — confirm this is intended.
sample_submission_df.to_csv(path_or_buf='sample_submission_8RXa3c6.csv')