In [642]:
import fuzzywuzzy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from fuzzywuzzy import process
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
%matplotlib inline


In [643]:
# Load the training and test CSV files (paths are relative to the working directory).
train_data=pd.read_csv("Train.csv")
test_data=pd.read_csv("Test.csv")

In [644]:
# Separate the target column from the feature columns of the training frame.
y_train_data = train_data["Item_Outlet_Sales"]
x_train_data = train_data.drop(columns=["Item_Outlet_Sales"])

In [645]:
# Occurrence cut-off; presumably meant for the rare-identifier filter that follows.
threshold2 = 1
# Number of training rows per Item_Identifier value.
count_item = x_train_data["Item_Identifier"].value_counts()

In [646]:
# Identifiers that occur at most `threshold2` times in the training data.
# Uses the named cut-off instead of repeating the literal, so changing
# `threshold2` actually changes the filter.
repl = count_item[count_item <= threshold2].index

In [647]:
# Collapse every identifier listed in `repl` into one placeholder label.
x_train_data["Item_Identifier"] = x_train_data["Item_Identifier"].replace(
    to_replace=repl, value="Other_Item_ID"
)

In [648]:
# Apply the same placeholder mapping to the test frame; `repl` comes from the
# training counts, so no test-set statistics are used here.
test_data["Item_Identifier"] = test_data["Item_Identifier"].replace(
    to_replace=repl, value="Other_Item_ID"
)

In [649]:
# Display the per-identifier row counts of the training features.
x_train_data["Item_Identifier"].value_counts()

FDW13    10
FDG33    10
FDF56     9
FDW26     9
FDP25     9
         ..
DRC24     2
NCG19     2
FDZ50     2
DRI59     2
NCW05     2
Name: Item_Identifier, Length: 1551, dtype: int64

# ITEM IDENTIFIER

In [650]:
# Keep only the first two characters of each identifier.
# The vectorised `.str` slice replaces the per-row lambda and leaves missing
# values as NaN instead of raising on them.
x_train_data["Item_Identifier"] = x_train_data["Item_Identifier"].str[:2]

In [651]:
# Keep only the first two characters of each identifier in the test frame.
# The vectorised `.str` slice replaces the per-row lambda and leaves missing
# values as NaN instead of raising on them.
test_data["Item_Identifier"] = test_data["Item_Identifier"].str[:2]

In [652]:
# Display how many training rows fall under each Item_Identifier value.
x_train_data["Item_Identifier"].value_counts()

FD    6117
NC    1599
DR     798
Ot       9
Name: Item_Identifier, dtype: int64

# ITEM WEIGHT

In [653]:
# Mean Item_Weight of the training rows (pandas skips missing values by default).
item_weight_mean = x_train_data["Item_Weight"].mean()

In [654]:
# Fill missing training weights with the training mean.
x_train_data["Item_Weight"] = x_train_data["Item_Weight"].fillna(value=item_weight_mean)

In [655]:
# Fill missing test weights with the mean computed on the training data.
test_data["Item_Weight"] = test_data["Item_Weight"].fillna(value=item_weight_mean)

# ITEM FAT CONTENT

In [656]:
# Normalise the labels: lower-case them, then trim surrounding whitespace.
x_train_data["Item_Fat_Content"] = (
    x_train_data["Item_Fat_Content"].str.lower().str.strip()
)

In [657]:
# Normalise the labels: lower-case them, then trim surrounding whitespace.
test_data["Item_Fat_Content"] = (
    test_data["Item_Fat_Content"].str.lower().str.strip()
)

In [658]:
def replace_inconsistent_data(df, column, string_to_match, min_ratio=22):
    """Overwrite fuzzy variants of ``string_to_match`` in ``df[column]``, in place.

    The unique values of the column are scored against ``string_to_match``
    with fuzzywuzzy's ``token_set_ratio``; at most the ten best-scoring values
    are considered. Every row whose value is among those scoring at least
    ``min_ratio`` is set to ``string_to_match``.

    Parameters
    ----------
    df : DataFrame that is modified in place.
    column : name of the column to clean.
    string_to_match : canonical value written into the matching rows.
    min_ratio : minimum score a value needs in order to be replaced.
        NOTE(review): the default of 22 is a very permissive threshold and is
        presumably tuned to this dataset's labels; confirm before reusing.

    Returns
    -------
    None
    """
    candidates = df[column].unique()
    scored = fuzzywuzzy.process.extract(
        string_to_match,
        candidates,
        limit=10,
        scorer=fuzzywuzzy.fuzz.token_set_ratio,
    )

    # Each entry is indexed as (value, score, ...); keep the values that clear the bar.
    accepted = []
    for entry in scored:
        if entry[1] >= min_ratio:
            accepted.append(entry[0])

    mask = df[column].isin(accepted)
    df.loc[mask, column] = string_to_match


In [659]:
# Map the training labels that fuzzy-match "low fat" onto that canonical spelling.
replace_inconsistent_data(x_train_data, "Item_Fat_Content", "low fat")

In [660]:
# Map the training labels that fuzzy-match "regular" onto that canonical spelling.
replace_inconsistent_data(x_train_data, "Item_Fat_Content", "regular")

In [661]:
# Map the test labels that fuzzy-match "regular" onto that canonical spelling.
replace_inconsistent_data(test_data, "Item_Fat_Content", "regular")

In [662]:
# Map the test labels that fuzzy-match "low fat" onto that canonical spelling.
replace_inconsistent_data(test_data, "Item_Fat_Content", "low fat")

# ITEM VISIBILITY

In [663]:
# Treat a visibility of exactly 0 as missing so it can be imputed later.
x_train_data["Item_Visibility"] = x_train_data["Item_Visibility"].replace(0, np.nan)

In [664]:
# Treat a visibility of exactly 0 as missing so it can be imputed later.
test_data["Item_Visibility"] = test_data["Item_Visibility"].replace(0, np.nan)

In [665]:
# Mean Item_Visibility of the training rows (pandas skips missing values by default).
item_visibility_mean = x_train_data["Item_Visibility"].mean()

In [666]:
# Fill missing training visibilities with the training mean.
x_train_data["Item_Visibility"] = x_train_data["Item_Visibility"].fillna(
    value=item_visibility_mean
)

In [667]:
# Fill missing test visibilities with the mean computed on the training data.
test_data["Item_Visibility"] = test_data["Item_Visibility"].fillna(
    value=item_visibility_mean
)


# OUTLIER CAPPING (ITEM VISIBILITY)

In [668]:
# First and third quartiles of the training Item_Visibility values.
Q1, Q3 = x_train_data["Item_Visibility"].quantile([0.25, 0.75])

In [669]:
# Interquartile range: the spread between the third and first quartiles.
IQR = Q3 - Q1

In [670]:
# Fences placed 1.5 * IQR outside the quartiles.
lower_lim = Q1 - 1.5 * IQR
upper_lim = Q3 + 1.5 * IQR

In [671]:
# Cap training visibilities at the IQR fences: values above `upper_lim` become
# `upper_lim`, values below `lower_lim` become `lower_lim`, the rest are kept.
# `Series.clip` expresses this directly instead of nesting two `np.where` calls.
x_train_data["Item_Visibility"] = x_train_data["Item_Visibility"].clip(
    lower=lower_lim, upper=upper_lim
)

In [672]:
# Cap test visibilities at the same fences that were derived from the training
# data: values above `upper_lim` become `upper_lim`, values below `lower_lim`
# become `lower_lim`. `Series.clip` replaces the nested `np.where` calls.
test_data["Item_Visibility"] = test_data["Item_Visibility"].clip(
    lower=lower_lim, upper=upper_lim
)

# ITEM TYPE

In [673]:
# Occurrence cut-off; presumably meant for the rare Item_Type filter that follows.
threshold = 200
# Number of training rows per Item_Type value.
count = x_train_data["Item_Type"].value_counts()

In [674]:
# Item types that occur at most `threshold` times in the training data.
# Uses the named cut-off instead of repeating the literal, so changing
# `threshold` actually changes the filter.
repl1 = count[count <= threshold].index

In [675]:
# Collapse every item type listed in `repl1` into one placeholder label.
x_train_data["Item_Type"] = x_train_data["Item_Type"].replace(
    to_replace=repl1, value="Other_Item"
)

In [676]:
# Apply the same placeholder mapping to the test frame; `repl1` comes from the
# training counts.
test_data["Item_Type"] = test_data["Item_Type"].replace(
    to_replace=repl1, value="Other_Item"
)

# OUTLET ESTABLISHMENT YEAR

In [677]:
# Replace the establishment year with the number of years elapsed as of 2023.
# The reference year is fixed, so re-running the notebook later gives the same values.
x_train_data["Outlet_Establishment_Year"] = (
    2023 - x_train_data["Outlet_Establishment_Year"]
)

In [678]:
# Replace the establishment year with the number of years elapsed as of 2023.
# The reference year is fixed, so re-running the notebook later gives the same values.
test_data["Outlet_Establishment_Year"] = (
    2023 - test_data["Outlet_Establishment_Year"]
)

In [679]:
# Store the column as floating point.
x_train_data["Outlet_Establishment_Year"] = x_train_data[
    "Outlet_Establishment_Year"
].astype("float64")

In [680]:
# Store the column as floating point.
test_data["Outlet_Establishment_Year"] = test_data[
    "Outlet_Establishment_Year"
].astype("float64")

In [700]:
# Display the transformed Outlet_Establishment_Year column of the training features.
x_train_data["Outlet_Establishment_Year"]

0       24.0
1       14.0
2       24.0
3       25.0
4       36.0
        ... 
8518    36.0
8519    21.0
8520    19.0
8521    14.0
8522    26.0
Name: Outlet_Establishment_Year, Length: 8523, dtype: float64

# OUTLET SIZE

In [681]:
# Fill missing outlet sizes with "Medium".
# NOTE(review): presumably the most frequent category; confirm against the data.
x_train_data["Outlet_Size"] = x_train_data["Outlet_Size"].fillna(value="Medium")

In [682]:
# Fill missing outlet sizes with "Medium".
# NOTE(review): presumably the most frequent category; confirm against the data.
test_data["Outlet_Size"] = test_data["Outlet_Size"].fillna(value="Medium")

# OUTLET TYPE

In [683]:
#x_train_d=x_train_data.copy()

In [684]:
# Display how many training rows belong to each Outlet_Type.
x_train_data["Outlet_Type"].value_counts()

Supermarket Type1    5577
Grocery Store        1083
Supermarket Type3     935
Supermarket Type2     928
Name: Outlet_Type, dtype: int64

In [685]:
# Outlet types with at most this many training rows are grouped together below.
threshold1 = 1000
# Number of training rows per Outlet_Type value.
count_outlet_type = x_train_data["Outlet_Type"].value_counts()

In [686]:
# Outlet types that occur at most `threshold1` times in the training data.
is_rare_outlet_type = count_outlet_type.le(threshold1)
repl2 = count_outlet_type[is_rare_outlet_type].index

In [687]:
# Collapse every outlet type listed in `repl2` into one placeholder label.
x_train_data["Outlet_Type"] = x_train_data["Outlet_Type"].replace(
    to_replace=repl2, value="Other Outlet Type"
)

In [688]:
# Display the distinct Outlet_Type labels present in the training features.
x_train_data["Outlet_Type"].unique()

array(['Supermarket Type1', 'Other Outlet Type', 'Grocery Store'],
      dtype=object)

In [689]:
# Apply the same placeholder mapping to the test frame; `repl2` comes from the
# training counts.
test_data["Outlet_Type"] = test_data["Outlet_Type"].replace(
    to_replace=repl2, value="Other Outlet Type"
)

In [690]:
# Display the number of missing values per column of the training features.
x_train_data.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
dtype: int64

In [691]:
# Display the number of missing values per column of the test frame.
test_data.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
dtype: int64

# OUTLET LOCATION TYPE

In [692]:
#x_train_data=x_train_data.drop("Outlet_Location_Type", axis=1)

In [693]:
#test_data=test_data.drop("Outlet_Location_Type", axis=1)

# COLUMN TRANSFORMER

In [694]:
# Preprocessing for the model input:
#   - Outlet_Size is ordinal-encoded in the order Small < Medium < High,
#   - the nominal columns are one-hot encoded, dropping the first level of each,
#   - three numeric columns are standardised,
#   - every remaining column is passed through unchanged.
# NOTE(review): newer scikit-learn releases rename `sparse=` to `sparse_output=`;
# confirm which keyword the installed version expects.
transformer1 = ColumnTransformer(
    transformers=[
        (
            "x_train_ordinal",
            OrdinalEncoder(categories=[["Small", "Medium", "High"]]),
            ["Outlet_Size"],
        ),
        (
            "x_train_ohe",
            OneHotEncoder(sparse=False, drop="first"),
            [
                "Item_Identifier",
                "Item_Fat_Content",
                "Item_Type",
                "Outlet_Identifier",
                "Outlet_Location_Type",
                "Outlet_Type",
            ],
        ),
        (
            "x_train_sclaed",
            StandardScaler(),
            ["Item_Weight", "Item_MRP", "Outlet_Establishment_Year"],
        ),
    ],
    remainder="passthrough",
)

In [695]:
# Fit the preprocessing on the training features and transform them in one step.
x_train_processed = transformer1.fit_transform(x_train_data)

In [696]:
# Transform the test frame with the already-fitted preprocessing (no refit).
test_preprocessed = transformer1.transform(test_data)

In [697]:
# Fit an ordinary least-squares model on the preprocessed training matrix,
# then predict sales for the preprocessed test rows.
model = LinearRegression()
model.fit(x_train_processed, y_train_data)
y_pred = model.predict(test_preprocessed)

In [698]:
# 10-fold cross-validated R^2 of the linear model on the preprocessed training matrix.
# `cross_val_score` is imported in the notebook's top import cell rather than here.
# NOTE(review): the preprocessing was fitted on the full training set before the
# folds are split, so fold statistics leak slightly; cross-validating a Pipeline
# that includes the transformer would avoid this.
scores = cross_val_score(
    model,
    x_train_processed,
    y_train_data,
    scoring="r2",
    cv=10,
)

In [699]:
# Display the mean R^2 across the cross-validation folds.
scores.mean()

0.5584822483795915