# Mini Project Notebook

## 1.0 Loading Packages and Data

In [33]:
import pandas as pd

# Path to the training split, relative to the notebook's working directory.
train_file = './Mini Project Dataset/Train.csv'
# test_file = './Mini Project Dataset/Test.csv'

# Load the training split into a DataFrame. The test split is left disabled here.
train_data = pd.read_csv(train_file)
# test_data = pd.read_csv(test_file)

## 2.0 Exploratory Data Analysis 
### 2.1 Basic Data Exploration

In [34]:
# Preview the first five rows of the training data.
train_data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [35]:
# (number of rows, number of columns) of the training data.
train_data.shape

(8523, 12)

In [36]:
# Print each column's non-null count and dtype, plus the frame's memory usage.
# Columns whose non-null count is below the row count contain missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [37]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
train_data.describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867,2181.288914
std,4.643456,0.051598,62.275067,8.37176,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,8.77375,0.026989,93.8265,1987.0,834.2474
50%,12.6,0.053931,143.0128,1999.0,1794.331
75%,16.85,0.094585,185.6437,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


### 2.2 Handling Missing Values

In [38]:
# Count the missing values in each column.
train_data.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [39]:
from sklearn.impute import SimpleImputer
import numpy as np

# Only Item_Weight and Outlet_Size contain missing values (see the null counts above),
# so only those two columns are imputed. The columns are referenced by name rather than
# by their position among same-dtype columns, which would silently select a different
# column if the schema or a dtype ever changed.
item_weight = "Item_Weight"
outlet_size = "Outlet_Size"

# Numeric column: fill the gaps with the median.
# fit_transform returns an (n_rows, 1) result, so flatten it before assigning to one column.
imputer_median = SimpleImputer(missing_values=np.nan, strategy='median')
train_data[item_weight] = np.ravel(imputer_median.fit_transform(train_data[[item_weight]]))

# Categorical column: fill the gaps with the most frequent category.
imputer_most_freq = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
train_data[outlet_size] = np.ravel(imputer_most_freq.fit_transform(train_data[[outlet_size]]))

# Confirm that no missing values remain.
train_data.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

### 2.3 Handling Duplicates

In [40]:
# Count fully duplicated rows; a result of 0 means the dataset has no duplicates.
train_data.duplicated().sum()

0

### 2.4 Handling Outliers

In [41]:
import matplotlib.pyplot as plt
import seaborn as sns

In [45]:
def detect_outliers(col):
    """Cap the outliers of a numeric column using the 1.5 * IQR rule.

    Values below ``Q1 - 1.5 * IQR`` are raised to that lower bound and values
    above ``Q3 + 1.5 * IQR`` are lowered to that upper bound; everything in
    between is left untouched. The input is not modified.

    Parameters
    ----------
    col : pandas.Series
        Numeric column to process. Missing values are ignored when the
        quartiles are computed and stay missing in the result.

    Returns
    -------
    pandas.Series
        Float series of the same length as ``col`` with the capped values.
        When ``col`` is a Series its index is kept, so the result aligns
        correctly when assigned back to the originating DataFrame.
    """
    values = col.to_numpy(dtype="float64", na_value=np.nan)
    index = col.index if isinstance(col, pd.Series) else None

    # Quartiles are undefined for an empty column; there is nothing to cap.
    if values.size == 0:
        return pd.Series(values, index=index)

    # nanquantile keeps missing values from turning both bounds into NaN.
    q1, q3 = np.nanquantile(values, [0.25, 0.75])
    iqr = q3 - q1
    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)

    # Clip both tails in one step. Applying two independent np.where calls to
    # the original data would let the second call discard the first's result.
    capped = np.clip(values, lower_bound, upper_bound)
    return pd.Series(capped, index=index)

In [49]:
# Pairwise correlation between the numeric columns. numeric_only=True excludes
# the string-valued columns explicitly instead of relying on the deprecated
# implicit dropping, which raises an error on newer pandas versions.
train_data.corr(numeric_only=True)

  train_data.corr()


Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
Item_Weight,1.0,-0.013744,0.024951,0.007739,0.010887
Item_Visibility,-0.013744,1.0,-0.001163,-0.064105,-0.120418
Item_MRP,0.024951,-0.001163,1.0,0.00502,0.574554
Outlet_Establishment_Year,0.007739,-0.064105,0.00502,1.0,-0.037133
Item_Outlet_Sales,0.010887,-0.120418,0.574554,-0.037133,1.0
