# Importing Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

# Loading Data

In [2]:
# Read the training and test CSV files into dataframes.
# NOTE(review): both paths are absolute and machine-specific; they must be
# adjusted before the notebook can run on another machine.
train = pd.read_csv('C:/Users/ankus/Downloads/9961_14084_bundle_archive/Train.csv')
test_data = pd.read_csv('C:/Users/ankus/Downloads/9961_14084_bundle_archive/test.csv')

# Know Your Data

In [3]:
# (rows, columns) of the training set
train.shape

(8523, 12)

In [4]:
# (rows, columns) of the test set. The test CSV is bound to `test_data` in the
# loading cell; the name `test` was never defined.
test_data.shape

In [None]:
# Column dtypes and non-null counts
train.info()

In [None]:
# Summary statistics for every column, numeric and non-numeric alike
train.describe(include='all')

In [None]:
# First rows of the training set
train.head()

# Finding Missing Values

In [None]:
# Number of missing entries in each column
train.isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row
train.duplicated().sum()

In [None]:
# Distinct values per column; dropna=False keeps a missing value counted as
# one distinct entry.
train.nunique(dropna=False)

# Numerical and Categorical Variables

In [None]:
# Partition the column names by dtype: numeric columns go to num_col,
# everything else is treated as categorical.
num_col = train.select_dtypes(include='number').columns.tolist()
cat_col = train.select_dtypes(exclude='number').columns.tolist()
print(cat_col, '\n')
print(num_col)

In [None]:
# Print the frequency table of every categorical column under a banner.
banner = '==========================='
for col in cat_col:
    print(banner)
    print('Series:', '   ', col)
    print(banner)
    print(train[col].value_counts())
    print('\n')

# Missing Values Treatment

In [None]:
# Work on a copy so the cleaning steps below leave `train` untouched
clean_data = train.copy()

In [None]:
# Normalise every column name to lower-case
new_col_names = list(map(str.lower, clean_data.columns))
clean_data.columns = new_col_names

print(clean_data.columns)

In [None]:
# Impute missing item weights with the column mean. The result is assigned
# back rather than calling fillna(inplace=True) on the selected column: that
# chained in-place form operates on a temporary under pandas Copy-on-Write
# and leaves clean_data unchanged.
clean_data['item_weight'] = clean_data['item_weight'].fillna(clean_data['item_weight'].mean())
clean_data.isnull().sum()

In [None]:
def _first_mode(values):
    '''Return the most frequent non-null value, or NaN when there is none.'''
    modes = values.mode()
    # mode() returns a Series: several entries on ties, none when every value
    # is null. An aggregation function has to hand back a single scalar.
    return modes.iat[0] if not modes.empty else np.nan


# Most frequent outlet_size for each outlet_type (one column per outlet_type).
outlet_size_mode_pt = clean_data.pivot_table(values='outlet_size', columns='outlet_type', aggfunc=_first_mode)
outlet_size_mode_pt

In [None]:
# Boolean mask of the rows whose outlet_size is missing
missing_values = clean_data['outlet_size'].isna()

In [None]:
def _modal_size_for(outlet_type):
    '''Look up the modal outlet_size recorded for the given outlet_type.'''
    return outlet_size_mode_pt[outlet_type].outlet_size


# Fill each missing outlet_size with the modal size of that row's outlet_type.
clean_data.loc[missing_values, 'outlet_size'] = (
    clean_data.loc[missing_values, 'outlet_type'].apply(_modal_size_for)
)

In [None]:
# Remaining missing entries per column after imputation
clean_data.isna().sum()

In [None]:
# Count the rows whose visibility is recorded as exactly 0. The comparison
# must be applied element-wise before summing; comparing the column total
# with 0 only ever yields a single boolean.
print((clean_data['item_visibility'] == 0).sum())

In [None]:
# Distinct labels currently present in item_fat_content
clean_data['item_fat_content'].unique()

In [None]:
# Collapse the alternative spellings onto 'Low Fat' and 'Regular'. The result
# is assigned back (as done for item_category) rather than calling
# replace(inplace=True) on the selected column, which operates on a temporary
# under pandas Copy-on-Write and leaves clean_data unchanged.
clean_data['item_fat_content'] = clean_data['item_fat_content'].replace({'low fat':'Low Fat','reg':'Regular','LF':'Low Fat'})

In [None]:
# Frequency of each item_fat_content label
clean_data['item_fat_content'].value_counts()

In [None]:
# Derive a coarse category from the two-character prefix of each item
# identifier, then spell the known prefixes out.
category_names = {'FD':'Food', 'DR':'Drink', 'NC':'Non-Consumable'}
id_prefix = clean_data['item_identifier'].apply(lambda ident: ident[:2])
clean_data['item_category'] = id_prefix.replace(category_names)
clean_data['item_category'].value_counts()

In [None]:
# First five derived categories (head() defaults to five rows)
clean_data['item_category'].head()

In [None]:
# Outlet age in years, measured against 2013.
# NOTE(review): 2013 is presumably the year the data was collected — confirm.
clean_data['outlet_age'] = 2013 - clean_data['outlet_establishment_year']
clean_data['outlet_age']

In [None]:
# Preview the frame with the derived item_category and outlet_age columns
clean_data.head()

In [None]:
# One histogram per numeric column
clean_data.hist(figsize=(10,10))

In [None]:
# plot item_fat_content: number of rows per label
plt.figure(figsize=(5,5))
sns.countplot(x='item_fat_content', data=clean_data)
plt.show()

In [None]:
# plot item_type: number of rows per item type.
# Rotate the tick labels seaborn already placed instead of overwriting them
# with a separately computed list, so the text under each bar cannot drift
# from the bar it describes.
chart = sns.countplot(x=clean_data['item_type'])
chart.tick_params(axis='x', labelrotation=90)

In [None]:
# plot outlet_identifier: number of rows per outlet.
# Rotate the tick labels seaborn already placed instead of overwriting them
# with a separately computed list, so the text under each bar cannot drift
# from the bar it describes.
chart = sns.countplot(x=clean_data['outlet_identifier'])
chart.tick_params(axis='x', labelrotation=90)

In [None]:
# plot outlet_size: number of rows per outlet size
plt.figure(figsize=(5,5))
sns.countplot(x='outlet_size', data=clean_data)
plt.show()

In [None]:
# plot outlet_location_type: number of rows per location type
plt.figure(figsize=(5,5))
sns.countplot(x='outlet_location_type', data=clean_data)
plt.show()

In [None]:
# plot item_category: number of rows per derived item category
plt.figure(figsize=(5,5))
sns.countplot(x='item_category', data=clean_data)
plt.show()

In [None]:
# plot outlet_establishment_year: number of rows per establishment year
plt.figure(figsize=(6,6))
sns.countplot(x='outlet_establishment_year', data=clean_data)
plt.show()

In [None]:
# correlation with the target column
# clean_data still holds string columns at this point, and DataFrame.corr()
# raises on those in pandas >= 2.0, so restrict to the numeric columns first.
# Older pandas ignored non-numeric columns, so the result is the same there.
corr_matrix = clean_data.select_dtypes(include='number').corr()
corr_matrix['item_outlet_sales'].sort_values(ascending=False)

In [None]:
# plot correlations as a heatmap, with each coefficient written in its cell
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')

In [None]:
# scatter plot of item_mrp (x axis) against item_outlet_sales (y axis)
sns.scatterplot(data=clean_data, x='item_mrp', y='item_outlet_sales')

# Data Pre-processing

### Label encoding

In [None]:
# Preview the frame before any encoding is applied
clean_data.head()

In [None]:
# Replace the values of the selected columns with integer codes
cols_to_encode = ['item_identifier', 'item_type', 'outlet_identifier']
encoder = LabelEncoder()

for col in cols_to_encode:
    # fit_transform refits the encoder each time, so a single instance can be
    # reused across columns
    encoded_values = encoder.fit_transform(clean_data[col])
    clean_data[col] = encoded_values

In [None]:
# Expand the remaining categorical features into indicator columns
one_hot_columns = [
    'item_fat_content',
    'outlet_size',
    'outlet_location_type',
    'outlet_type',
    'item_category',
]
clean_data = pd.get_dummies(clean_data, columns=one_hot_columns)

In [None]:
# what the data looks like after encoding
clean_data.head()

# Defining The Input Data

In [None]:
# Target is the sales column; the features are every other column except the
# establishment year (outlet_age was derived from it earlier).
target_column = 'item_outlet_sales'
y = clean_data[target_column]
X = clean_data.drop(columns=['outlet_establishment_year', target_column])

In [None]:
# Empty dataframe collecting the results; train_and_evaluate_model appends
# one row (model name, RMSE, R2 score) per evaluated model.
model_scores = pd.DataFrame(columns=['model', 'rmse', 'r2_score'])

In [None]:
def train_and_evaluate_model(model_name, model, X, y, *, test_size=0.2, random_state=42):
    '''
    Scale the features, fit the model on a training split and score it on a
    held-out validation split.

    The RMSE and R2 score on the validation split are printed and appended
    as a new row to the module-level ``model_scores`` dataframe.

    Parameters
    ----------
    model_name : label stored in ``model_scores`` and shown in the report.
    model : unfitted estimator placed after ``StandardScaler`` in the pipeline.
    X, y : features and target handed to ``train_test_split``.
    test_size : share of the rows held out for validation (default 0.2).
    random_state : seed of the split; keeping it fixed means every model is
        scored on the same validation rows (default 42).

    Returns
    -------
    None
    '''
    # split the data
    X_train, X_validate, y_train, y_validate = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # the scaler is fitted inside the pipeline, i.e. on the training rows only
    pipeline = make_pipeline(StandardScaler(), model)
    pipeline.fit(X_train, y_train)

    # predict the validation set
    y_hat = pipeline.predict(X_validate)

    # evaluate the model
    rmse = np.sqrt(mean_squared_error(y_validate, y_hat))
    model_score = r2_score(y_validate, y_hat)

    # append the error and score as the next row of the scores dataframe
    model_scores.loc[len(model_scores)] = [model_name, rmse, model_score]

    print('----------------------------------')
    print(model_name, ' Report:')
    print('----------------------------------')
    print('RMSE: ', rmse)
    print('R2 Score: ', model_score)

# Linear Regression

In [None]:
# Ordinary least squares fitted on the scaled features
linear_regression_model = LinearRegression()
train_and_evaluate_model(
    model_name='Linear Regression',
    model=linear_regression_model,
    X=X,
    y=y,
)

# Ridge Regularization

In [None]:
# Ridge regression with the estimator's default settings
ridge_model = Ridge()
train_and_evaluate_model(
    model_name='Ridge',
    model=ridge_model,
    X=X,
    y=y,
)

# Lasso Regularization

In [None]:
# Lasso regression with the estimator's default settings
lasso_model = Lasso()
train_and_evaluate_model(
    model_name='Lasso',
    model=lasso_model,
    X=X,
    y=y,
)

# SVM (Support Vector Regression)

In [None]:
# Support vector regressor with the estimator's default settings
svr_model = SVR()
train_and_evaluate_model(
    model_name='SVM',
    model=svr_model,
    X=X,
    y=y,
)

# Decision Tree

In [None]:
# Seed the tree so repeated runs report the same scores; the train/validation
# split is already seeded with 42, so use the same value here.
dtr_model = DecisionTreeRegressor(random_state=42)
train_and_evaluate_model('Decision Tree', dtr_model, X, y)

# Random Forest

In [None]:
# Seed the forest so repeated runs report the same scores; the
# train/validation split is already seeded with 42, so use the same value here.
rfr_model = RandomForestRegressor(random_state=42)
train_and_evaluate_model('Random Forest', rfr_model, X, y)

# XGBoost

In [None]:
# Pin the booster's seed explicitly so the reported scores are reproducible,
# in line with the seeded train/validation split.
xgbr_model = XGBRegressor(random_state=42)
train_and_evaluate_model('XGBoost', xgbr_model, X, y)

# Summary

In [None]:
# One row per evaluated model, in the order the model cells were run
model_scores