In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import os
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder , StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split
import pickle

In [2]:
# Make the relative data path below resolvable, then load the raw training data.
# The chdir is guarded so the cell is idempotent: an unconditional os.chdir("../")
# climbs one more directory level on every re-run and breaks the relative paths.
# NOTE(review): presumably the notebook is launched from a sub-folder of the
# project root (where BIG_MART_DATA/ lives) - confirm.
TRAIN_CSV = 'BIG_MART_DATA/train_v9rqX0R.csv'
if not os.path.exists(TRAIN_CSV):
    os.chdir("../")
# read the data file
data = pd.read_csv(TRAIN_CSV)

In [3]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [4]:
# Separate the target column from the predictors, then hold out 20% of the rows
# for testing; random_state is fixed so the split is the same on every run.
y = data['Item_Outlet_Sales']
x = data.drop(columns=['Item_Outlet_Sales'])
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.20, random_state=42
)



In [5]:

def item_Fat_content(data):
    """Return a copy of ``data`` with ``Item_Fat_Content`` labels normalised.

    The spelling variants 'low fat', 'LF' and 'Low fat' are mapped to
    'Low Fat', and 'reg' is mapped to 'Regular'. Every other value is left
    as it is.

    The input frame is not modified: the replacement is applied to a copy,
    so passing a slice of another frame (for example the output of
    ``train_test_split``) does not write back into the caller's data.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing an ``Item_Fat_Content`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the normalised ``Item_Fat_Content`` column.

    Raises
    ------
    KeyError
        If ``data`` has no ``Item_Fat_Content`` column.
    """
    canonical_labels = {
        'low fat': 'Low Fat',
        'LF': 'Low Fat',
        'Low fat': 'Low Fat',
        'reg': 'Regular',
    }
    cleaned = data.copy()
    cleaned['Item_Fat_Content'] = cleaned['Item_Fat_Content'].replace(canonical_labels)
    return cleaned
def preprocessing(
    DATA_TRANSFORMATION_IMPUTER_PARAMS,
    categorical_imputer,
    numerical_cols,
    one_hot_cols,
    ordinal_cols,
    category,
):
    """Build the (unfitted) ColumnTransformer used to preprocess the features.

    Three pipelines are assembled and routed to their column groups:

    * "numerical": KNNImputer followed by StandardScaler on ``numerical_cols``.
    * "one-hot":   ``categorical_imputer`` followed by OneHotEncoder on
      ``one_hot_cols``.
    * "ordinal":   ``categorical_imputer`` followed by OrdinalEncoder on
      ``ordinal_cols``.

    Parameters
    ----------
    DATA_TRANSFORMATION_IMPUTER_PARAMS : dict
        Keyword arguments forwarded to ``KNNImputer``.
    categorical_imputer : estimator
        Imputer placed as the first step of both categorical pipelines
        (the same instance is passed to each).
    numerical_cols, one_hot_cols, ordinal_cols : list of str
        Column names handled by the respective pipeline.
    category : list of list
        Passed to ``OrdinalEncoder(categories=...)``; one ordered list of
        labels per entry of ``ordinal_cols``.

    Returns
    -------
    ColumnTransformer
        Configured with ``set_output(transform='pandas')``.
    """
    # NOTE(review): imputation runs before scaling, so the KNN neighbour search
    # works on the raw, unscaled columns - confirm this ordering is intended.
    numeric_steps = [
        ("imputer", KNNImputer(**DATA_TRANSFORMATION_IMPUTER_PARAMS)),
        ("standscaler", StandardScaler()),
    ]

    # drop='first' removes one indicator column per encoded feature, and
    # handle_unknown="ignore" avoids raising on categories not seen during fit.
    nominal_steps = [
        ("imputer", categorical_imputer),
        ("one_hot", OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop='first')),
    ]

    # Ordered categories are mapped to integer codes following `category`.
    ranked_steps = [
        ("imputer", categorical_imputer),
        ("ordinal", OrdinalEncoder(categories=category)),
    ]

    column_routes = [
        ("numerical", Pipeline(steps=numeric_steps), numerical_cols),
        ("one-hot", Pipeline(steps=nominal_steps), one_hot_cols),
        ("ordinal", Pipeline(steps=ranked_steps), ordinal_cols),
    ]

    pre_processor = ColumnTransformer(
        transformers=column_routes,
        verbose_feature_names_out=True,
    )
    return pre_processor.set_output(transform='pandas')
    
             
        
        

In [6]:
# Keyword arguments for the KNN imputer used on the numerical columns.
DATA_TRANSFORMATION_IMPUTER_PARAMS = dict(
    missing_values=np.nan,
    n_neighbors=3,
    weights="uniform",
)

# Missing categorical values are filled with the most frequent label.
categorical_imputer = SimpleImputer(strategy="most_frequent")

# Column groups routed to the numerical / one-hot / ordinal pipelines.
numerical_cols = [
    'Item_Weight',
    'Item_Visibility',
    'Item_MRP',
    'Outlet_Establishment_Year',
]
one_hot_cols = [
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Type',
]
ordinal_cols = [
    'Outlet_Size',
    'Outlet_Location_Type',
]

# One ordered label list per entry of ordinal_cols (same position).
category = [
    ['Small', 'Medium', 'High'],
    ['Tier 3', 'Tier 2', 'Tier 1'],
]



In [7]:
# Normalise the fat-content labels in both splits, then build the (still
# unfitted) preprocessing transformer from the configuration above.
train_x = item_Fat_content(train_x)
test_x = item_Fat_content(test_x)
preprecossing = preprocessing(
    DATA_TRANSFORMATION_IMPUTER_PARAMS=DATA_TRANSFORMATION_IMPUTER_PARAMS,
    categorical_imputer=categorical_imputer,
    numerical_cols=numerical_cols,
    one_hot_cols=one_hot_cols,
    ordinal_cols=ordinal_cols,
    category=category,
)

In [8]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformer to the training and the test features in turn.
processing_obj = preprecossing.fit(train_x)
scaled_train_x, scaled_test_x = (
    processing_obj.transform(features) for features in (train_x, test_x)
)

In [9]:
# Put the target column back next to the transformed features of each split.
preprocessed_train_data = pd.concat([scaled_train_x, train_y], axis="columns")
preprocessed_test_data = pd.concat([scaled_test_x, test_y], axis="columns")

In [10]:
# Output folder (relative to the working directory) for the processed data
# and the pickled preprocessor; the trailing slash is relied on when the
# file names are appended to it.
directory = "notebooks/pickle and trained data/"

In [11]:
# Persist the processed splits and the fitted preprocessor.
# The output folder is created on demand so a fresh checkout does not fail
# with an OSError when the folder is missing; exist_ok makes re-runs a no-op.
os.makedirs(directory, exist_ok=True)

# The DataFrame index is written as the first (unnamed) CSV column.
preprocessed_train_data.to_csv(f'{directory}preprocessed_train_data.csv')
preprocessed_test_data.to_csv(f'{directory}preprocessed_test_data.csv')

# Pickle the fitted transformer so it can be reused later.
with open(f'{directory}processing_obj.pkl', 'wb') as file:
    pickle.dump(processing_obj, file)
