# Problem Statement

# Hypothesis Generation

# Loading packages and data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore") 
pd.options.mode.chained_assignment = None

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [None]:
# Read the raw train/test CSVs and tag every row with its origin so the
# combined frame can be split back apart after preprocessing.
train_df = pd.read_csv('./data/Train.csv')
test_df = pd.read_csv('./data/Test.csv')

for frame, origin in ((train_df, 'train'), (test_df, 'test')):
    frame['df_type'] = origin

df = pd.concat((train_df, test_df), ignore_index=True)

# Data Structure and Content

# Exploratory Data Analysis

### Descriptive Analysis

In [None]:
# Preview the first rows of the combined train + test frame.
df.head()

In [None]:
# (rows, columns) of the combined frame.
df.shape

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics for every column (numeric and object), transposed so
# that each original column becomes a row.
df.describe(include='all').T

**Observations**
1. Total shape of the dataset is (14204, 13)
2. The dataset has 7 object columns (excluding df_type column) and 5 numerical columns 
3. The memory usage of the dataset is 1.4+ MB.
4. 

### Checking the unique values

In [None]:
# Number of distinct values per column; NaN is counted as its own value.
df.nunique(dropna=False)

In [None]:
# Look at the first 20 identifiers to spot any naming pattern.
df['Item_Identifier'].head(20)

In [None]:
# Frequency table for each object column, skipping the item identifier and
# the train/test marker.
excluded = ('Item_Identifier', 'df_type')
cols = [name for name in df.select_dtypes('object').columns if name not in excluded]

for col in cols:
    print(f'**Column: {col}**', df[col].value_counts(), '', sep='\n')

**Observations**
1. Item_Identifier - an object variable with 1559 unique values. Based on the data shown above, the first 2 letters seem to be the category code of the item, while the remaining characters are the unique id of the item. The most common 2-letter codes are FD, DR and NC.
2. Item_Fat_Content - there are duplicate labels that can be combined later on: Low Fat, LF and low fat; Regular and reg.
3. OUT010 and OUT019 are the least frequent outlets. These outlets have a significantly smaller number of sales records.
4. Most of the outlet type, size and location type values are Supermarket Type1, Medium and Tier 3, respectively.
5. The unique count of Item_MRP is greater than the number of items, which means that an item may have a different price in different outlets.

### Missing Values

In [None]:
# Count and share of missing values per column in the combined frame.
# The percentage is taken over len(df) (train + test rows), i.e. the same frame
# the counts come from; dividing by len(train_df) overstated the share.
missingval = df.isnull().sum()
percentage_missing = 100 * missingval / len(df)
missing_table = pd.concat(
    [missingval, percentage_missing],
    axis=1,
    keys=['MissingValues', 'Percentage'],
)
missing_table.sort_values(by='MissingValues', ascending=False)

In [None]:
# Missing values per column among the rows tagged as test.
df[df['df_type']=='test'].isna().sum()

In [None]:
# Missing values per column among the rows tagged as train.
df[df['df_type']=='train'].isna().sum()

### Duplicate Values

In [None]:
# Count rows that are exact copies of an earlier row in the combined frame.
duplicated = df.duplicated().sum()
print(f'Number of duplicated rows: {duplicated}')

**Observations**
1. The missing values in Item_Outlet_Sales come from the test data (since this is the target variable).
2. Missing values are also present in Item_Weight and Outlet_Size.
3. There are no duplicated rows in the dataset.

# Univariate Analysis

In [None]:
# Default figure size and automatic tight layout for every plot below.
plt.rcParams.update({
    "figure.figsize": [12, 5],
    "figure.autolayout": True,
})

### Target Variable - Item_Outlet_Sales	

In [None]:
# Histogram of the target variable.
ax = sns.histplot(df['Item_Outlet_Sales'])
ax.set_title('Distribution of the Item Outlet Sales')
plt.show()

### Feature Variables

In [None]:
# Partition the training columns by dtype: object columns vs int64/float64 columns.
object_columns = train_df.select_dtypes(include='object').columns
numerical_columns = train_df.select_dtypes(include=['int64', 'float64']).columns

object_df = train_df.loc[:, object_columns]
numerical_df = train_df.loc[:, numerical_columns]

In [None]:
# Object columns with more than 20 distinct values.
cardinality = train_df[object_columns].nunique()
discrete_columns = cardinality[cardinality > 20].index.tolist()
object_df[discrete_columns].head()

In [None]:
# Distinct-value count for each of the columns selected above (> 20 unique values).
object_df[discrete_columns].nunique()

In [None]:
# Keep only the columns worth plotting: drop the item identifier and the
# train/test marker from the categorical view, and the target from the numeric view.
# Both frames are rebuilt from train_df rather than mutated in place, so this
# cell can be re-run without a KeyError on already-dropped columns and never
# modifies a slice of train_df.
object_df = train_df[object_columns].drop(columns=['Item_Identifier', 'df_type'])
numerical_df = train_df[numerical_columns].drop(columns='Item_Outlet_Sales')

**Categorical Variables**

In [None]:
# One horizontal count plot per categorical feature.
for col in object_df.columns:
    sns.countplot(data=object_df, y=col)
    plt.show()

**Numerical Variables**

In [None]:
# One histogram per numerical feature.
for col in numerical_df.columns:
    sns.histplot(data=numerical_df, x=col)
    plt.show()

**Observations:**

# Bivariate Analysis

### Numerical vs Numerical (Numerical Features vs Target Variable)

In [None]:
# Pairwise plots across the training frame's variables.
sns.pairplot(train_df)

In [None]:
# Correlation heatmap of the numeric training columns.
# train_df also holds object columns; DataFrame.corr() raises on those in
# pandas >= 2.0, so restrict to numeric columns explicitly.
plt.figure(figsize=(12,10))
sns.set(font_scale=1.1)
sns.heatmap(train_df.select_dtypes(include='number').corr(), annot=True, cmap="Blues")
plt.show()

### Categorical vs Numerical (Object Features vs Target Variable)

In [None]:
# Distribution of the target for every level of each categorical feature.
for col in object_df.columns:
    sns.boxplot(data=train_df, x='Item_Outlet_Sales', y=col)
    plt.show()

**Observations:**

# Data Preprocessing and Feature Engineering

Based on the EDA, the following actions need to be done:
- Fill in missing values
- Correct zero values in Item_Visibility
- Slice the first 2 letters of the Item_Identifier to get the item category
- Rename similar items in Item_Fat_Content
- Convert Outlet_Establishment_Year into years of operation
- Encode all categorical values into numerical values
- Drop unnecessary columns
- Split the updated data into train and test

### Missing Value Treatment

In [None]:
# Missing values per column before imputation.
df.isnull().sum()

In [None]:
# Distinct Outlet_Size values (NaN included) before imputation.
df['Outlet_Size'].unique()

In [None]:
# Item_Weight: fill missing values with the column mean.
df['Item_Weight'] = df['Item_Weight'].fillna(df['Item_Weight'].mean())

# Outlet_Size: fill missing values with the most frequent category.
# The imputer is given only the Outlet_Size column (as a one-column frame);
# passing the whole df returns a 2-D array of every column, which cannot be
# assigned to a single column. ravel() flattens the (n, 1) result to 1-D.
mode_imputer = SimpleImputer(strategy='most_frequent')
df['Outlet_Size'] = mode_imputer.fit_transform(df[['Outlet_Size']]).ravel()

In [None]:
# Distinct Outlet_Size values after imputation.
df['Outlet_Size'].unique()

In [None]:
# Missing values per column after imputation.
df.isnull().sum()

### Correct zero values in Item_Visibility

In [None]:
# Rows whose Item_Visibility is exactly zero.
df[df['Item_Visibility'] == 0.0]

In [None]:
# Treat a visibility of exactly 0 as missing and fill it with the mean of the
# non-zero values. Averaging over the whole column would include the zeros being
# replaced and pull the fill value downward.
zero_visibility = df['Item_Visibility'] == 0
df.loc[zero_visibility, 'Item_Visibility'] = df.loc[~zero_visibility, 'Item_Visibility'].mean()

In [None]:
# Summary statistics of Item_Visibility after the zero replacement.
df['Item_Visibility'].describe()

### Slice the first 2 letters of the Item_Identifier

In [None]:
# New feature: the first two characters of the item identifier.
df['Item_Type_Category'] = df['Item_Identifier'].astype(str).str[:2]

In [None]:
# Distinct two-character codes produced above.
df['Item_Type_Category'].unique()

### Rename similar items in Item_Fat_Content

In [None]:
# Collapse spelling variants of the fat-content labels onto 'Low Fat' / 'Regular'.
fat_content_aliases = {'low fat':'Low Fat','LF':'Low Fat', 'reg':'Regular'}
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(fat_content_aliases)

In [None]:
# Label frequencies after merging the spelling variants.
df['Item_Fat_Content'].value_counts()

In [None]:
# Fat-content labels that occur on rows whose category code is 'NC'.
df[df['Item_Type_Category'] == 'NC']['Item_Fat_Content'].unique()

Since the data contains items in the 'NC' (non-consumable) category, these items should not have a fat content. Surprisingly, most of them are tagged as 'Low Fat'.

In [None]:
# Non-consumable ('NC') items cannot have a fat content: relabel them.
# Assign through df.loc so the write lands on df itself; an in-place mask on
# the df['Item_Fat_Content'] selection is a chained update that is not
# guaranteed to propagate back to the frame.
df.loc[df['Item_Type_Category'] == 'NC', 'Item_Fat_Content'] = 'Not Edible'

In [None]:
# Fat-content labels on 'NC' rows after the relabelling.
df[df['Item_Type_Category'] == 'NC']['Item_Fat_Content'].unique()

In [None]:
# Overall label frequencies after the relabelling.
df['Item_Fat_Content'].value_counts()

### Convert outlet_establishment_year into years of operation

In [None]:
# Years in operation, measured against a hard-coded reference year of 2019.
# NOTE(review): confirm 2019 is the intended reference year for this dataset.
df['Operation_Years'] = 2019-df['Outlet_Establishment_Year']

### Encoding of Categorical Values

In [None]:
# Single encoder instance; it is re-fitted for every column in the loop below.
le = LabelEncoder()

In [None]:
# Every object column except the train/test marker will be label-encoded.
cols = df.select_dtypes('object').columns.to_list()
cols.remove('df_type')
cols

In [None]:
# Replace each selected object column with its integer label codes.
# The shared encoder is re-fitted per column, in the order given by `cols`.
df = df.assign(**{col: le.fit_transform(df[col]) for col in cols})

In [None]:
# Confirm the dtypes after encoding.
df.info()

In [None]:
# Preview the encoded frame.
df.head()

### Drop unnecessary columns

In [None]:
cols_to_drop = ['Item_Identifier', 'Outlet_Establishment_Year']

In [None]:
df.drop(columns=cols_to_drop, inplace=True)

### Normalize the Data

In [None]:
scaler = MinMaxScaler()
X_train_sc= scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)

### Split Train and Test Set

In [None]:
train = df[df['df_type']=='train']
test = df[df['df_type']=='test']

In [None]:
train.head()

In [None]:
test.head()

In [None]:
train.drop(columns='df_type', inplace=True)
test.drop(columns=['df_type','Item_Outlet_Sales'], inplace=True)

In [None]:
X = train.drop(columns='Item_Outlet_Sales')
y = train['Item_Outlet_Sales']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y,  test_size=0.2, shuffle = True, random_state = 42)

In [None]:
# Feature matrix built from the training rows (target column removed).
X

In [None]:
# Target vector: Item_Outlet_Sales for the training rows.
y