# Evaluated Machine Learning Regression Model for Beginners

### Importing the basic Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### Data Importing as train and test

In [None]:
train = pd.read_csv("/kaggle/input/bigmart-sales-data/Train.csv")
test = pd.read_csv("/kaggle/input/bigmart-sales-data/Test.csv")

In [None]:
train.shape, test.shape

In [None]:
test.info()

**To examine the data as a whole, we concatenate the train and test sets**

In [None]:
data = pd.concat([train,test],sort=False)

In [None]:
data.head()

In [None]:
data.info() #There are missing values and we need to fill or drop them 

*Item Outlet Sales will be our label for regression algorithm*

In [None]:
# `sns.distplot` is deprecated; `histplot` with a KDE overlay is its replacement.
# `stat="density"` keeps the y-axis on the density scale that `distplot` used.
# `data` is the train+test concatenation and Test.csv has no sales column, so the
# test rows carry NaN here and are dropped before plotting.
sns.histplot(data["Item_Outlet_Sales"].dropna(), kde=True, stat="density")
plt.show()

Now, let's check categorical and numerical values

In [None]:
# Train: count the categorical (object-dtype) and the numerical columns.
# The `np.object` alias was removed in NumPy 1.24, so the dtype is named by string.
categorical = train.select_dtypes(include=["object"])
print(categorical.shape)
numerical = train.select_dtypes(include=[np.float64, np.int64, np.int32])
print(numerical.shape)

In [None]:
# Test: count the categorical (object-dtype) and the numerical columns.
# The `np.object` alias was removed in NumPy 1.24, so the dtype is named by string.
categorical = test.select_dtypes(include=["object"])
print(categorical.shape)
numerical = test.select_dtypes(include=[np.float64, np.int64, np.int32])
print(numerical.shape)

### **DATA CLEANING**
* *Missing Values*
* *Outlier Detection*
* *Feature Scaling - Standardization, - Normalization*

In [None]:
train.isna().sum()

- The "Item_Weight" column has 1463 missing values
- The "Outlet_Size" column has 2410 missing values
- Let's fix them

In [None]:
test.isna().sum() #Similar missing values exit in test data too.

In [None]:
# Fill missing weights with the mean computed on the training set only, and reuse
# that same value for the test set: both frames are then imputed identically and
# no statistic is derived from the test data.
item_weight_mean = train["Item_Weight"].mean()
train["Item_Weight"] = train["Item_Weight"].fillna(item_weight_mean)
test["Item_Weight"] = test["Item_Weight"].fillna(item_weight_mean)

In [None]:
train["Item_Weight"].isna().sum() , test["Item_Weight"].isna().sum()

We filled the missing values in the "Item_Weight" column with the column **mean**.

In [None]:
data.info()

As we see,
- The Item_Weight column is float (numerical), which is why we could fill it with the mean.
- The Outlet_Size column is of object type, so we need a different method to fill it.

In [None]:
train.Outlet_Size.value_counts()

In [None]:
test.Outlet_Size.value_counts()

For categorical NaN values we can use the "mode", i.e. fill them with the most common value.

In [None]:
# Fill missing outlet sizes with the most frequent category of the training set.
# The test column must be filled from *its own* values: the previous version assigned
# `train["Outlet_Size"]` to `test["Outlet_Size"]`, replacing the test data with
# index-aligned training values.
outlet_size_mode = train["Outlet_Size"].mode()[0]
train["Outlet_Size"] = train["Outlet_Size"].fillna(outlet_size_mode)
test["Outlet_Size"] = test["Outlet_Size"].fillna(outlet_size_mode)

In [None]:
print("Missing value quantity of train data:",train["Outlet_Size"].isna().sum())
print("Missing value quantity of test data:",test["Outlet_Size"].isna().sum())


*All missing values have now been filled*

### Exploratory Data Analysis (EDA) 

In [None]:
train.columns

In [None]:
train.info()

In [None]:
train["Item_Identifier"].value_counts() #We can not detect any irregularity for that column
#try the others and checking irregularities

In [None]:
train["Item_Fat_Content"].value_counts()

- "Low Fat", "low fat" and "LF" all denote the same category, so let's merge them
- Likewise, "reg" and "Regular" are the same category and need to be merged too.


In [None]:
# Merge the spelling variants into the two canonical labels.
# The result is assigned back to the frame: calling `replace(..., inplace=True)` on
# `train[...]` is chained assignment, which leaves `train` unchanged under pandas
# Copy-on-Write.
train['Item_Fat_Content'] = train['Item_Fat_Content'].replace(
    {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
)

In [None]:
train["Item_Fat_Content"].value_counts()

In [None]:
# Same label merge for the test set; assigned back instead of using `inplace=True` on a
# column selection, which leaves the frame unchanged under pandas Copy-on-Write.
test['Item_Fat_Content'] = test['Item_Fat_Content'].replace(
    {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
)

In [None]:
train["Outlet_Identifier"].value_counts() #There is nothing anormal

In [None]:
train["Item_Type"].value_counts() #Nothing anormal too

In [None]:
train["Outlet_Size"].value_counts()

In [None]:
train["Outlet_Location_Type"].value_counts()

In [None]:
train["Outlet_Type"].value_counts()

- Now I want to add a column that shows how many years have passed since each outlet was established
- (Note: this notebook was written in 2021, so 2021 is used as the reference year)


In [None]:
# Years elapsed between each outlet's establishment year and 2021, stored as strings
# (presumably so the plots below treat the value as a category).
for frame in (train, test):
    years_open = 2021 - frame["Outlet_Establishment_Year"]
    frame["Duration"] = years_open.astype("str")

In [None]:
train.head(10)

In [None]:
train.info()

### Visualizations to see quantities of the values - Univariate Data Analysis
- We only do this for the object columns, to see how often each value occurs

But first, which object-type columns are worth visualizing?


In [None]:
#for i in train.columns:
#    if train.columns[i].astype == 'O':
#        print("For {} column unique value amount is : {}".format(train.columns[i],train.columns[i].unique()))
#    else:
#        break

In [None]:
train.info()

In [None]:
print("For Item_Identifier:",train["Item_Identifier"].unique())
print("For Item_Fat_Content:",train["Item_Fat_Content"].unique())
print("For Item_Type:",train["Item_Type"].unique())
print("For Outlet_Identifier:",train["Outlet_Identifier"].unique())
print("For Outlet_Size :",train["Outlet_Size"].unique())
print("For Outlet_Location_Type:",train["Outlet_Location_Type"].unique())
print("For Outlet_Type:",train["Outlet_Type"].unique())
print("For Duration:",train["Duration"].unique())

**It makes sense to visualize the columns that have only a few distinct values:**
- Item_Fat_Content
- Outlet_Identifier
- Outlet_Size
- Outlet_Location_Type
- Outlet_Type
- Duration

- This means that, except for the Item_Identifier and Item_Type columns, visualizing the columns might help us analyze the data

In [None]:
# One count plot per low-cardinality categorical column, with the raw counts printed
# alongside each figure.
cols = train[["Item_Fat_Content","Outlet_Identifier","Outlet_Size","Outlet_Location_Type","Outlet_Type","Item_Type","Duration"]]
for col in cols:
    plt.figure(figsize=(22,10))
    # Pass the column as `x=`: in seaborn >= 0.12 the first positional argument is
    # `data`, so a positional Series is no longer read as the x variable.
    ax = sns.countplot(x=cols[col], palette="CMRmap")
    print(cols[col].value_counts())
    ax.set(ylabel="COUNT")

#### INFERENCES From the Univariate Visualizations
- Low Fat items are recorded more often than Regular ones
- Medium-size outlets are more common than the other sizes
- Tier 3 locations account for the majority of the outlets
- Among outlet types, Type 1 Supermarkets are the most widespread
- The most stocked item types are Fruits, Vegetables and Snacks
- And lastly, most of the outlets were established 35 years ago and are still operating

#### Outlier Detection

In [None]:
from collections import Counter
def detect_outliers(df, features, n=2):
    """Return index labels of rows that are IQR outliers in more than ``n`` features.

    A value is an outlier in a column when it lies more than 1.5 * IQR below the
    first quartile or above the third quartile of that column (Tukey's rule).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to inspect.
    features : iterable of str
        Names of the numerical columns to check.
    n : int, default 2
        A row is reported only when it is an outlier in strictly more than ``n`` of
        the given features. The default reproduces the previously hard-coded
        threshold; pass ``n=0`` to report rows that are outliers in any feature.

    Returns
    -------
    list
        Index labels of the flagged rows, in order of first detection.
    """
    outlier_counts = Counter()

    for c in features:
        # Quartiles are computed while ignoring NaN, so one missing value does not
        # turn both bounds into NaN and silently disable detection for the column.
        Q1, Q3 = np.nanpercentile(df[c], [25, 75])
        outlier_step = (Q3 - Q1) * 1.5
        # Comparisons against NaN are False, so missing values are never flagged.
        is_outlier = (df[c] < Q1 - outlier_step) | (df[c] > Q3 + outlier_step)
        outlier_counts.update(df[is_outlier].index)

    return [label for label, count in outlier_counts.items() if count > n]

In [None]:
train.loc[detect_outliers(train,["Item_Weight","Item_Visibility","Item_MRP","Outlet_Establishment_Year",
                                    "Item_Outlet_Sales"])]

In [None]:
test.loc[detect_outliers(test,["Item_Weight","Item_Visibility","Item_MRP","Outlet_Establishment_Year"])]

- No rows are flagged, but note that `detect_outliers` only returns rows that are outliers in more than two of the given features at once
- Reminder: outlier detection is only applied to numerical (int/float) columns

### Visualizations to see sales amount relations with variables- Bivariate Data Analysis
- We just do for the object columns to see how many of them affect the sales

In [None]:
# Mean sales per category for each low-cardinality categorical column.
cols = train[["Item_Fat_Content","Outlet_Identifier","Outlet_Size","Outlet_Location_Type","Outlet_Type","Item_Type","Duration"]]
for col in cols:
    plt.figure(figsize=(22,7))
    # y comes from `train`, the frame the x columns were taken from. `data` is the
    # train+test concatenation: it has repeated index labels and NaN sales for the test
    # rows, so it does not line up with `cols`. x/y are keywords because seaborn >= 0.12
    # reads the first positional argument as `data`.
    ax = sns.barplot(x=cols[col], y=train["Item_Outlet_Sales"], palette="CMRmap")

#### INFERENCES From the Bivariate Visualizations
- Low Fat items are recorded more often than Regular ones, but when we look at sales, Regular items sell more than Low Fat ones
- Medium-size outlets are the most common, but High-size outlets have better sales figures
- Tier 3 locations have the most outlets, but the best sales figures are recorded in Tier 2
- Among outlet types, Type 1 Supermarkets are the most widespread, but Type 3 Supermarkets have the best sales
- The most stocked item types are Fruits, Vegetables and Snacks, but the best seller is Starchy Foods
- And lastly, most of the outlets were established 35 years ago, and they also have the best sales

### Visualizations to see sales amount relations with multivariables- Multivariate Data Analysis
- We just do for the object columns (multivariate relationship) to see how many of them affect the sales

#### Firstly, investigating the correlation might help us

In [None]:
# Correlation between the numerical columns.
# `train` still holds object (string) columns at this point and pandas >= 2.0 no longer
# drops them silently in `DataFrame.corr()`, so the numeric columns are selected
# explicitly.
plt.figure(figsize = (20,20))
sns.heatmap(train.select_dtypes(include="number").corr(), annot=True, fmt = ".3f")
plt.show()

- Item_MRP and Item_Weight are highly correlated, so one of them needs to be dropped
- Item_MRP and sales are very highly correlated
- Item_Weight and sales are correlated too

In [None]:
# Mean sales per item type, split by fat content.
plt.figure(figsize=(25,5))
# x and y are passed as keywords: seaborn >= 0.12 reads the first positional argument as
# `data`, so positional column names no longer work.
sns.barplot(x='Item_Type', y='Item_Outlet_Sales', hue='Item_Fat_Content', data=train, palette='RdYlGn')
plt.legend()

In [None]:
# Mean sales per outlet location tier, split by outlet type.
plt.figure(figsize=(10,5))
# x and y are passed as keywords: seaborn >= 0.12 reads the first positional argument as
# `data`, so positional column names no longer work.
sns.barplot(x='Outlet_Location_Type', y='Item_Outlet_Sales', hue='Outlet_Type', data=train, palette='magma')
plt.legend()

### Feature Engineering
- Label Encoding
- Dropping useless columns
- Splitting label and train,test
- Feature Scaling

1) ***Label Encoding***
- Machine-learning models require numerical inputs, which is why we label-encode the categorical columns

In [None]:
from sklearn.preprocessing import LabelEncoder
colslabeled = ['Item_Fat_Content','Outlet_Location_Type','Outlet_Size','Outlet_Type','Item_Type']

# One encoder per column, fitted once on the categories of both frames and then applied
# to each. Refitting a fresh mapping on the test set (as before) can give the same
# category a different integer in train and test whenever their category sets differ.
encoders = {}
for i in colslabeled:
    le = LabelEncoder()
    le.fit(pd.concat([train[i], test[i]]))
    train[i] = le.transform(train[i])
    test[i] = le.transform(test[i])
    encoders[i] = le  # kept so the codes can be mapped back with inverse_transform

xc = train[colslabeled]

In [None]:
for  i in xc:
    print("For {} column, Number of unique values :{}".format(xc[i].name,xc[i].nunique()))

In [None]:
train.head()

2) *Dropping Columns*
- "Item_Identifier",
- "Outlet_Identifier",
- "Outlet_Establishment_Year" is not useful for our model that's why we need to drop them

In [None]:
# Remove the identifier columns and the establishment year from both frames, in place.
unused_columns = ["Item_Identifier","Outlet_Identifier","Outlet_Establishment_Year"]
for frame in (train, test):
    frame.drop(columns=unused_columns, inplace=True)

3) *Separating the Label and Train-Test Split*
- We'll hold out 20% of the training data as a validation set, so first;
- We need to separate our label, "Item_Outlet_Sales",
- Then we need to do the train-test split

In [None]:
# Target vector (sales) and feature matrix (every remaining column).
y = train["Item_Outlet_Sales"]
X = train.drop(columns=["Item_Outlet_Sales"])

In [None]:
X.head()

4) *Feature Scaling*

In [None]:
X.columns

In [None]:
# Collect the feature column names; each name is wrapped in its own one-element list.
features = [[column_name] for column_name in X.columns]

In [None]:
# Train-test split: hold out 20% of the rows (test_size=0.2) for evaluation.
# random_state=42 fixes the shuffle so the same split is produced on every run.
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state = 42)

In [None]:
X_train.head()

In [None]:
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_test = sc.fit_transform(X_test)
# #After the process, our data turns into numpy array

## Building Regression Models
- Linear Regression
- Random Forest Regressor

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
# `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2, where passing it
# raises a TypeError. Ordinary least-squares predictions do not depend on feature scaling
# (up to floating-point error), so the default estimator is used instead.
Lr = LinearRegression()
Lr.fit(X_train,y_train)
y_pred = Lr.predict(X_test)

*- Evaluation Metrics*
- R2
+ Adjusted R2
+ Model Score (R² as returned by `.score()`)
+ Mean Absolute Error
+ Mean Squared Error

In [None]:
#R2 Score
from sklearn.metrics import r2_score
R2 = r2_score(y_test,y_pred)
print("r2 score is :",R2)

In [None]:
#Adjusted R2 Score
def adj_r2 (X,y,model):
    """Return the adjusted R² of ``model`` evaluated on ``(X, y)``.

    The plain R² is taken from ``model.score(X, y)`` and corrected for the number of
    predictors: ``1 - (1 - R²) * (n - 1) / (n - p - 1)``, where ``n = len(y)`` and
    ``p = X.shape[1]``.
    """
    plain_r2 = model.score(X, y)
    n_samples = len(y)
    n_features = X.shape[1]
    unexplained = (1 - plain_r2) * (n_samples - 1) / (n_samples - n_features - 1)
    return 1 - unexplained

#Checking Adjusted R2 score of the train and test datas 
print("Adj. R2 of the train set",adj_r2(X_train,y_train,Lr))
print("Adj. R2 of the test set",adj_r2(X_test,y_test,Lr))

In [None]:
# R² on each split: for scikit-learn regressors `.score()` returns the coefficient of
# determination, not a classification accuracy.
print("Score of the train set",Lr.score(X_train,y_train))
print("Score of the test set",Lr.score(X_test,y_test))

In [None]:
#Mean Abs. Error and Mean Squared Error
from sklearn.metrics import mean_absolute_error,mean_squared_error
MAE = mean_absolute_error(y_test,y_pred)
MSE = mean_squared_error(y_test,y_pred)
print("Mean Absolute Error :",MAE)
print("Mean Squared Error :",MSE)

*Our model is not accurate enough, so let's try regularization techniques*
- Lasso (L1)

In [None]:
move = np.arange(0.01,0.99,0.05)
move

In [None]:
# Lasso (L1) regularization: fit one model per alpha in `move` and report its R² on the
# training data.
from sklearn.linear_model import Lasso
for alpha in move:
    lasso_model = Lasso(alpha=alpha)
    lasso_model.fit(X_train, y_train)
    train_score = lasso_model.score(X_train, y_train)
    print("Train, For alpha = {}, model score is {} ".format(alpha, train_score))

### Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
# The criterion name "mse" was renamed to "squared_error" in scikit-learn 1.0 and the old
# spelling was removed in 1.2. Squared error is the default criterion, so leaving the
# argument out selects it on every version.
Rf = RandomForestRegressor(n_estimators = 300,
                           max_depth =4,
                           n_jobs = -1,
                           random_state = 42)

In [None]:
Rf.fit(X_train,y_train)
y_predrf = Rf.predict(X_test)

*- Evaluation Metrics*
- R2
+ Adjusted R2
+ Accuracy Score
+ Mean Absolute Error
+ Mean Squared Error

In [None]:
#R2 Score
from sklearn.metrics import r2_score
R2rf = r2_score(y_test,y_predrf)
print("r2 score is :",R2rf)

In [None]:
# Adjusted R2 score of the random forest on the train and test data.
# `adj_r2` is reused from the linear-regression section; the identical second definition
# that used to sit in this cell only shadowed the first one.
print("Adj. R2 of the train set",adj_r2(X_train,y_train,Rf))
print("Adj. R2 of the test set",adj_r2(X_test,y_test,Rf))

In [None]:
#Mean Abs. Error and Mean Squared Error
from sklearn.metrics import mean_absolute_error,mean_squared_error
MAErf = mean_absolute_error(y_test,y_predrf)
MSErf = mean_squared_error(y_test,y_predrf)
print("Mean Absolute Error :",MAErf)
print("Mean Squared Error :",MSErf)

Let's compare the Results 

In [None]:
print("Linear Regression r2 score is :",R2)
print("Random Forest r2 score is :",R2rf)
print("-----------------------------------")
print("Linear Reg. Adj. R2 of the train set",adj_r2(X_train,y_train,Lr))
print("Random Forest Adj. R2 of the train set",adj_r2(X_train,y_train,Rf))
print("Linear Reg.Adj. R2 of the test set",adj_r2(X_test,y_test,Lr))
print("Random Forest Adj. R2 of the test set",adj_r2(X_test,y_test,Rf))
print("-----------------------------------")
print("Linear Regression Mean Absolute Error :",MAE)
print("Linear Regression Mean Squared Error :",MSE)
print("Random Forest Mean Absolute Error :",MAErf)
print("Random Forest Mean Squared Error :",MSErf)

### Conclusion
- **Clearly, Random Forest is more accurate than Linear Regression for this problem**
- Thanks