In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import fuzzywuzzy
from fuzzywuzzy import process
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
%matplotlib inline




In [2]:
train_data=pd.read_csv("Train.csv")
test_data=pd.read_csv("Test.csv")

In [3]:
X=train_data.drop("Item_Outlet_Sales",axis=1)
y=train_data["Item_Outlet_Sales"]

In [4]:
x_train_data, x_val_data, y_train_data, y_test_data=train_test_split(X,y, test_size=0.2, random_state=0)

In [5]:
x_train_data

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
6404,FDA56,,Low Fat,0.008722,Fruits and Vegetables,123.5414,OUT027,1985,Medium,Tier 3,Supermarket Type3
5820,DRH25,18.70,Low Fat,0.014623,Soft Drinks,52.0324,OUT045,2002,,Tier 2,Supermarket Type1
48,FDL12,15.85,Regular,0.121633,Baking Goods,60.6220,OUT046,1997,Small,Tier 1,Supermarket Type1
4583,FDB05,,Low Fat,0.145670,Frozen Foods,247.2776,OUT019,1985,Small,Tier 1,Grocery Store
3791,FDH10,21.00,Low Fat,0.049296,Snack Foods,194.4478,OUT035,2004,Small,Tier 2,Supermarket Type1
...,...,...,...,...,...,...,...,...,...,...,...
4373,NCN29,15.20,Low Fat,0.020280,Health and Hygiene,49.1034,OUT010,1998,,Tier 3,Grocery Store
7891,FDP21,,Regular,0.025616,Snack Foods,188.1872,OUT027,1985,Medium,Tier 3,Supermarket Type3
4859,FDO22,13.50,Regular,0.000000,Snack Foods,78.3960,OUT035,2004,Small,Tier 2,Supermarket Type1
3264,FDI52,18.70,Low Fat,0.104890,Frozen Foods,121.4072,OUT045,2002,,Tier 2,Supermarket Type1


In [6]:
x_train_data=x_train_data.drop("Outlet_Location_Type",axis=1)

In [7]:
x_train_data.sample(5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Type
2441,NCR41,17.85,Low Fat,0.018097,Health and Hygiene,96.9094,OUT018,2009,Medium,Supermarket Type2
4553,DRL47,19.7,LF,0.038815,Hard Drinks,124.2362,OUT045,2002,,Supermarket Type1
848,FDS49,9.0,Low Fat,0.079794,Canned,80.1644,OUT017,2007,,Supermarket Type1
7787,FDV51,16.35,Low Fat,0.032539,Meat,165.7842,OUT046,1997,Small,Supermarket Type1
5903,FDI52,18.7,Low Fat,0.104658,Frozen Foods,121.2072,OUT035,2004,Small,Supermarket Type1


# ITEM IDENTIFIER

In [8]:
#count_item=x_train_data["Item_Identifier"].value_counts()
#threshold=1

In [9]:
#repl=count_item[count_item<=1].index

In [10]:
#x_train_data["Item_Identifier"]=x_train_data["Item_Identifier"].replace(repl, "Other_Item_ID")

In [11]:
#x_val_data["Item_Identifier"]=x_val_data["Item_Identifier"].replace(repl, "Other_Item_ID")

In [12]:
x_train_data["Item_Identifier"]=x_train_data["Item_Identifier"].apply(lambda s:s[0:2])

In [13]:
x_val_data["Item_Identifier"]=x_val_data["Item_Identifier"].apply(lambda s:s[0:2])

In [14]:
#x_train_data.drop("Item_Identifier", axis=1, inplace=True)

In [15]:
#x_val_data.drop("Item_Identifier", axis=1, inplace=True)

# ITEM VISIBILITY

In [16]:
x_train_data["Item_Visibility"]=x_train_data["Item_Visibility"].replace(0,float("NaN"))

In [17]:
x_val_data["Item_Visibility"]=x_val_data["Item_Visibility"].replace(0,float("NaN"))

In [18]:
#item_weight_mean=x_train_data["Item_Weight"].mean()

In [19]:
#x_train_data["Item_Weight"]=x_train_data["Item_Weight"].fillna(item_weight_mean)

In [20]:

#x_val_data["Item_Weight"]=x_val_data["Item_Weight"].fillna(item_weight_mean)

# ITEM FAT CONTENT

In [21]:
x_train_data["Item_Fat_Content"]=x_train_data["Item_Fat_Content"].str.lower()
x_train_data["Item_Fat_Content"]=x_train_data["Item_Fat_Content"].str.strip()

In [22]:
x_val_data["Item_Fat_Content"]=x_val_data["Item_Fat_Content"].str.lower()
x_val_data["Item_Fat_Content"]=x_val_data["Item_Fat_Content"].str.strip()

In [23]:
def replace_inconsistent_data(df,column, string_to_match, min_ratio=22):
    unique_df=df[column].unique()
    matches=fuzzywuzzy.process.extract(string_to_match, unique_df,limit=10, scorer=fuzzywuzzy.fuzz.token_set_ratio)
    close_matches=[matches[0] for matches in matches if matches[1]>=min_ratio]
    row_matches=df[column].isin(close_matches)
    df.loc[row_matches,column]=string_to_match


In [24]:
replace_inconsistent_data(df=x_train_data, column="Item_Fat_Content", string_to_match="low fat")

In [25]:
replace_inconsistent_data(df=x_train_data, column="Item_Fat_Content", string_to_match="regular")

In [26]:
replace_inconsistent_data(df=x_val_data, column="Item_Fat_Content", string_to_match="low fat")

In [27]:
replace_inconsistent_data(df=x_val_data, column="Item_Fat_Content", string_to_match="regular")

# outilier remove

In [28]:
Q1=x_train_data["Item_Visibility"].quantile(.25)
Q3=x_train_data["Item_Visibility"].quantile(.75)

In [29]:
IQR=Q3-Q1
upper_lim=Q3+(1.5*IQR)
lower_lim =Q1-(1.5*IQR)

In [30]:
x_train_data["Item_Visibility"]=np.where(x_train_data["Item_Visibility"]>upper_lim,upper_lim, np.where(x_train_data["Item_Visibility"]<lower_lim, lower_lim,x_train_data["Item_Visibility"]))

In [31]:
x_val_data["Item_Visibility"]=np.where(x_val_data["Item_Visibility"]>upper_lim,upper_lim, np.where(x_val_data["Item_Visibility"]<lower_lim, lower_lim,x_val_data["Item_Visibility"]))

# item type

In [32]:
count=x_train_data["Item_Type"].value_counts()
threshold=200

In [33]:
repl1=count[count<=200].index

In [34]:
x_train_data["Item_Type"]=x_train_data["Item_Type"].replace(repl1,"Other_Item")

In [35]:
x_val_data["Item_Type"]=x_val_data["Item_Type"].replace(repl1,"Other_Item")

# OUTLET ESHTABLISHMENT YEAR

In [36]:
x_train_data["Outlet_Establishment_Year"]=(2023-x_train_data["Outlet_Establishment_Year"])

In [37]:
x_val_data["Outlet_Establishment_Year"]=(2023-x_val_data["Outlet_Establishment_Year"])

# OUTLET TYPE

In [38]:
#count_outlet_type=x_train_data["Outlet_Type"].value_counts()
#threshold1=1000

In [39]:
#repl1=count_outlet_type[count_outlet_type<=threshold1].index

In [40]:
#x_train_data["Outlet_Type"]=x_train_data["Outlet_Type"].replace(repl1, "Other Outlet Type")

In [41]:
#x_val_data["Outlet_Type"]=x_val_data["Outlet_Type"].replace(repl1, "Other Outlet Type")

In [42]:
x_train_data=x_train_data.drop("Outlet_Location_Type", axis=1)

KeyError: "['Outlet_Location_Type'] not found in axis"

In [43]:
x_val_data=x_val_data.drop("Outlet_Location_Type", axis=1)

# COLUMNN TRANSFORMER

# MISSING VALUE HANDLING

In [44]:
trans1=ColumnTransformer(transformers=[
    ("Impute_Item_weight", SimpleImputer(),[1]),
    ("Imputed_Item_Visibility", SimpleImputer(),[3]),
    ("Impute_Oulet_Type",SimpleImputer(strategy="most_frequent"),[8])
], remainder="passthrough")

# ORDINAL AND ONE HOT ENCODER

In [45]:
trans2=ColumnTransformer(transformers=[
    ("Ordinal_Outlet_Type",OrdinalEncoder(categories=[["Small","Medium","High"]]),[8]),
    ("Ohe_encoder",OneHotEncoder(sparse=False,drop="first",handle_unknown="ignore"),[0,2,4,6,9]),
],remainder="passthrough")

# SCALING FOR LINEAR REGRESSION

In [46]:
trans3=ColumnTransformer(transformers=[
    ("x_train_sclaed",StandardScaler(),slice(0,9))
])

# FEATURE SELECTION

In [47]:
trans4=SelectKBest(score_func=chi2, k=7)

# TRAIN LINEAR MODEL

In [48]:
trans5=LinearRegression()

# PIPE LINE

In [49]:
pipe=Pipeline([
    ("trans1",trans1),
    ("trans2",trans2),
    ("trans3",trans3),
    ("trans4",trans4),
    ("trans5",trans5)
])

In [50]:
pipe.fit(x_train_data,y_train_data)

ValueError: Found unknown categories [36, 38, 14, 16, 19, 21, 24, 25, 26] in column 0 during fit

In [None]:
y_pred_pipe=pipe.predict("x_val_data")

# r2 and Cross val

In [None]:
r2_score(y_test_data,y_pred_pipe)

In [None]:
from sklearn.model_selection import cross_val_score
scores=cross_val_score(trans5, x_train_data,y_train_data,scoring="r2",cv=10)

In [None]:
scores.mean()

# HYPER PERAMETER TUNING FOR RANDOM FORTEST

In [253]:
n_estimators=[int(i) for i in np.linspace(start=100, stop=500, num=5)]
max_features=["auto","sqrt","log2"]
max_depth=[int(i) for i in np.linspace(10,300,5)]
min_samples_split=[15,25,35]
min_samples_leaf=[3,5,7,9,15]
random_grid={"n_estimators":n_estimators,
"max_features":max_features,
"max_depth":max_depth,
"min_samples_split":min_samples_split,
"min_samples_leaf":min_samples_leaf,
"criterion":["squared_error","absolute_error","poisson"]
}

In [255]:
random_model=RandomForestRegressor()
random_randomcv=RandomizedSearchCV(estimator=random_model,param_distributions=random_grid, n_iter=20,cv=5, verbose=2,random_state=42)
random_randomcv.fit(x_train_random, y_train_data)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END criterion=poisson, max_depth=300, max_features=auto, min_samples_leaf=5, min_samples_split=25, n_estimators=500; total time=  42.6s
[CV] END criterion=poisson, max_depth=300, max_features=auto, min_samples_leaf=5, min_samples_split=25, n_estimators=500; total time=  57.9s
[CV] END criterion=poisson, max_depth=300, max_features=auto, min_samples_leaf=5, min_samples_split=25, n_estimators=500; total time=  37.5s
[CV] END criterion=poisson, max_depth=300, max_features=auto, min_samples_leaf=5, min_samples_split=25, n_estimators=500; total time=  27.0s
[CV] END criterion=poisson, max_depth=300, max_features=auto, min_samples_leaf=5, min_samples_split=25, n_estimators=500; total time=  26.9s
[CV] END criterion=squared_error, max_depth=227, max_features=log2, min_samples_leaf=7, min_samples_split=25, n_estimators=100; total time=   0.3s
[CV] END criterion=squared_error, max_depth=227, max_features=log2, min_samples_leaf=7

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=20,
                   param_distributions={'criterion': ['squared_error',
                                                      'absolute_error',
                                                      'poisson'],
                                        'max_depth': [10, 82, 155, 227, 300],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [3, 5, 7, 9, 15],
                                        'min_samples_split': [15, 25, 35],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500]},
                   random_state=42, verbose=2)

In [256]:
random_randomcv.best_params_

{'n_estimators': 400,
 'min_samples_split': 15,
 'min_samples_leaf': 15,
 'max_features': 'auto',
 'max_depth': 155,
 'criterion': 'absolute_error'}

In [None]:
transformer=ColumnTransformer(transformers=[
    ("x_train_ordinal", OrdinalEncoder(categories=[["Small","Medium","High"]]),["Outlet_Size"]),
    ("x_train_ohe",OneHotEncoder(sparse=False,drop="first"),["Item_Identifier","Item_Fat_Content","Item_Type","Outlet_Identifier","Outlet_Type"]),
    ("x_train_sclaed",StandardScaler(),["Item_Weight","Item_MRP","Outlet_Establishment_Year"]),
], remainder="passthrough")

In [None]:
x_train_processced=transformer.fit_transform(x_train_data)
x_val_processed=transformer.transform(x_val_data)
model=LinearRegression()
model.fit(x_train_processced,y_train_data)
y_pred=model.predict(x_val_processed)
r2_score(y_test_data, y_pred)

In [None]:
random_model=RandomForestRegressor(n_estimators=400,min_samples_split=15,min_samples_leaf=25,max_features="auto",max_depth=155,criterion="absolute_error" )
random_model.fit(x_train_random, y_train_data)
y_random_pred=random_model.predict(x_test_random)