# Pre-processing the data obtained from the Karnataka Govt. website

## Importing the necessary libraries


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler

## Reading the data

In [2]:
# Load the raw Bengaluru market report; the cleaning steps below are developed on it.
bengaluru_df = pd.read_csv('krama_report_bengaluru.csv')

# Markets whose raw reports (krama_report_<market>.csv) are preprocessed and saved
# by the loop at the end of the notebook.
all_markets = ['bengaluru', 'doddaballapur', 'hubballi', 'mysuru']


## Describing the data

In [3]:
bengaluru_df.head()

Unnamed: 0,Market,Date,Variety,Grade,Arrivals,Unit,Min,Max,Modal,District
0,BENGALURU,07/02/2002,ONION,FAQ,14350,Quintal,260,360,310,Bengaluru Urban�
1,BENGALURU,07/02/2002,OTHER,FAQ,0,Quintal,150,240,195,Bengaluru Urban�
2,BENGALURU,27/03/2002,ONION,FAQ,10818,Quintal,170,280,225,Bengaluru Urban�
3,BENGALURU,28/03/2002,ONION,FAQ,8048,Quintal,190,300,245,Bengaluru Urban�
4,BENGALURU,30/03/2002,ONION,FAQ,16590,Quintal,1800,280,230,Bengaluru Urban�


In [4]:
bengaluru_df.describe()

Unnamed: 0,Arrivals,Min,Max,Modal
count,16993.0,16993.0,16993.0,16993.0
mean,8638.172895,991.195198,1414.385982,1196.057377
std,11639.161578,868.655986,3985.689866,986.092362
min,0.0,0.0,0.0,0.0
25%,978.0,460.0,650.0,570.0
50%,5099.0,800.0,1000.0,900.0
75%,12133.0,1200.0,1800.0,1500.0
max,173949.0,13000.0,500415.0,14358.0


In [5]:
bengaluru_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16993 entries, 0 to 16992
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Market    16993 non-null  object
 1   Date      16993 non-null  object
 2   Variety   16993 non-null  object
 3   Grade     16993 non-null  object
 4   Arrivals  16993 non-null  int64 
 5   Unit      16993 non-null  object
 6   Min       16993 non-null  int64 
 7   Max       16993 non-null  int64 
 8   Modal     16993 non-null  int64 
 9   District  16993 non-null  object
dtypes: int64(4), object(6)
memory usage: 1.3+ MB


## Cleaning the data

### Null values

In [6]:
def missing_values(df):
    """Report the missing values in ``df`` and drop the affected rows in place.

    Prints the per-column null counts, the total null count, and the total
    null count after dropping. ``df`` itself is modified (rows containing any
    missing value are removed); nothing is returned.
    """
    nulls_per_column = df.isna().sum()
    print("The number of missing values in each column are:", nulls_per_column)
    print("The number of missing values in the dataframe is:", nulls_per_column.sum())

    # Remove every row that has at least one missing value from the caller's frame.
    df.dropna(inplace=True)
    remaining_nulls = df.isna().sum().sum()
    print("The number of missing values after dropping rows with missing values is:", remaining_nulls)

missing_values(bengaluru_df)

The number of missing values in each column are: Market      0
Date        0
Variety     0
Grade       0
Arrivals    0
Unit        0
Min         0
Max         0
Modal       0
District    0
dtype: int64
The number of missing values in the dataframe is: 0
The number of missing values after dropping rows with missing values is: 0


### Converting Date to three columns: Day, Month, Year
We parse the Date column (format DD/MM/YYYY) with `pd.to_datetime` and extract the Day, Month, and Year components through the `.dt` accessor. We can then drop the original Date column.

In [7]:
# Converting Date to three columns: year, month, day
def convert_date(df):
    """Replace the ``Date`` column of ``df`` with ``Year``, ``Month`` and ``Day``.

    ``Date`` is parsed as day/month/year text. The three integer columns are
    appended to ``df`` in place and the ``Date`` column is removed; nothing is
    returned.
    """
    date_format = r"%d/%m/%Y"
    parsed = pd.to_datetime(df['Date'], format=date_format)

    # Assigned left to right, so the new columns are appended as Year, Month, Day.
    df['Year'], df['Month'], df['Day'] = parsed.dt.year, parsed.dt.month, parsed.dt.day

    del df['Date']

convert_date(bengaluru_df)
bengaluru_df.head()
    

Unnamed: 0,Market,Variety,Grade,Arrivals,Unit,Min,Max,Modal,District,Year,Month,Day
0,BENGALURU,ONION,FAQ,14350,Quintal,260,360,310,Bengaluru Urban�,2002,2,7
1,BENGALURU,OTHER,FAQ,0,Quintal,150,240,195,Bengaluru Urban�,2002,2,7
2,BENGALURU,ONION,FAQ,10818,Quintal,170,280,225,Bengaluru Urban�,2002,3,27
3,BENGALURU,ONION,FAQ,8048,Quintal,190,300,245,Bengaluru Urban�,2002,3,28
4,BENGALURU,ONION,FAQ,16590,Quintal,1800,280,230,Bengaluru Urban�,2002,3,30


### Checking for redundant columns

In [8]:
# We can check the number of unique values in each column
def unique_values(df):
    """Print ``<column> : <number of distinct values>`` for every column of ``df``."""
    for name, values in df.items():
        print(name, ":", values.nunique())

unique_values(bengaluru_df)

Market : 1
Variety : 6
Grade : 5
Arrivals : 9739
Unit : 1
Min : 189
Max : 275
Modal : 337
District : 1
Year : 23
Month : 12
Day : 31


We can observe that the Market, Unit and District columns each contain a single unique value, so they carry no information and are redundant. We can drop these columns.

In [9]:
def remove_redundant_columns(df, red_cols=("Market", "Unit", "District")):
    """Drop redundant columns from ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the columns are removed from this object directly.
    red_cols : str or iterable of str, optional
        Column name(s) to drop. Defaults to ``Market``, ``Unit`` and
        ``District``, the columns found to hold a single value in the
        Bengaluru report.

    Raises
    ------
    KeyError
        If any requested column is not present in ``df``.
    """
    # A bare string would otherwise be split into single characters by list().
    if isinstance(red_cols, str):
        red_cols = [red_cols]
    df.drop(columns=list(red_cols), inplace=True)

remove_redundant_columns(bengaluru_df)
bengaluru_df.head()

Unnamed: 0,Variety,Grade,Arrivals,Min,Max,Modal,Year,Month,Day
0,ONION,FAQ,14350,260,360,310,2002,2,7
1,OTHER,FAQ,0,150,240,195,2002,2,7
2,ONION,FAQ,10818,170,280,225,2002,3,27
3,ONION,FAQ,8048,190,300,245,2002,3,28
4,ONION,FAQ,16590,1800,280,230,2002,3,30


In [10]:
bengaluru_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16993 entries, 0 to 16992
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Variety   16993 non-null  object
 1   Grade     16993 non-null  object
 2   Arrivals  16993 non-null  int64 
 3   Min       16993 non-null  int64 
 4   Max       16993 non-null  int64 
 5   Modal     16993 non-null  int64 
 6   Year      16993 non-null  int32 
 7   Month     16993 non-null  int32 
 8   Day       16993 non-null  int32 
dtypes: int32(3), int64(4), object(2)
memory usage: 995.8+ KB


## Converting to a Standard format

### One-hot encoding the Variety and Grade columns

In [11]:
def one_hot_encode(df):
    """Return a copy of ``df`` with ``Variety`` and ``Grade`` one-hot encoded.

    Each of the two columns is replaced by indicator columns named
    ``<column>_<value>``. ``drop_first=True`` leaves out one indicator per
    column, so a row with all indicators off belongs to the omitted value.
    ``df`` itself is not modified.
    """
    return pd.get_dummies(
        data=df,
        columns=["Variety", "Grade"],
        drop_first=True,
    )

bengaluru_df = one_hot_encode(bengaluru_df)
bengaluru_df.head()

Unnamed: 0,Arrivals,Min,Max,Modal,Year,Month,Day,Variety_BELLARY RED,Variety_LOCAL,Variety_ONION,Variety_OTHER,Variety_PUNA,Grade_FAQ,Grade_LARGE,Grade_MEDIUM,Grade_SMALL
0,14350,260,360,310,2002,2,7,False,False,True,False,False,True,False,False,False
1,0,150,240,195,2002,2,7,False,False,False,True,False,True,False,False,False
2,10818,170,280,225,2002,3,27,False,False,True,False,False,True,False,False,False
3,8048,190,300,245,2002,3,28,False,False,True,False,False,True,False,False,False
4,16590,1800,280,230,2002,3,30,False,False,True,False,False,True,False,False,False


### Converting to float

In [12]:
# Cast every column to float so the frame is uniformly numeric; this also turns
# the True/False one-hot columns into 1.0/0.0.
bengaluru_df = bengaluru_df.astype(float)
bengaluru_df.head()

Unnamed: 0,Arrivals,Min,Max,Modal,Year,Month,Day,Variety_BELLARY RED,Variety_LOCAL,Variety_ONION,Variety_OTHER,Variety_PUNA,Grade_FAQ,Grade_LARGE,Grade_MEDIUM,Grade_SMALL
0,14350.0,260.0,360.0,310.0,2002.0,2.0,7.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,150.0,240.0,195.0,2002.0,2.0,7.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,10818.0,170.0,280.0,225.0,2002.0,3.0,27.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,8048.0,190.0,300.0,245.0,2002.0,3.0,28.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,16590.0,1800.0,280.0,230.0,2002.0,3.0,30.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


### Standardizing the data

Since the Arrivals column spans a very wide range (from 0 to 173,949 quintals in the Bengaluru data), we standardize it to zero mean and unit variance.

In [13]:
def standardize_arrivals(df):
    """Standardize the ``Arrivals`` column of ``df`` in place.

    A new ``StandardScaler`` is fitted on this frame's ``Arrivals`` values and
    the column is overwritten with the scaled result.

    Returns
    -------
    sklearn.preprocessing.StandardScaler
        The fitted scaler. Keeping it lets callers map scaled values back to
        the original units with ``scaler.inverse_transform`` or apply the same
        scaling to other data. Callers that ignore the return value behave
        exactly as before.
    """
    scaler = StandardScaler()
    # Double brackets keep the column two-dimensional, as the scaler requires.
    df[['Arrivals']] = scaler.fit_transform(df[['Arrivals']])
    return scaler

standardize_arrivals(bengaluru_df)
bengaluru_df.head()

Unnamed: 0,Arrivals,Min,Max,Modal,Year,Month,Day,Variety_BELLARY RED,Variety_LOCAL,Variety_ONION,Variety_OTHER,Variety_PUNA,Grade_FAQ,Grade_LARGE,Grade_MEDIUM,Grade_SMALL
0,0.490757,260.0,360.0,310.0,2002.0,2.0,7.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.742186,150.0,240.0,195.0,2002.0,2.0,7.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.187289,170.0,280.0,225.0,2002.0,3.0,27.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.050707,190.0,300.0,245.0,2002.0,3.0,28.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.683216,1800.0,280.0,230.0,2002.0,3.0,30.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


## Complete preprocessing for all datasets

### Helper function

In [14]:
def preprocess(df):
    """Run the full cleaning pipeline on a raw market report and return the result.

    Steps, in order: drop rows with missing values, split ``Date`` into
    ``Year``/``Month``/``Day``, drop the redundant columns, one-hot encode the
    categorical columns, cast everything to float, and standardize
    ``Arrivals``. Progress messages are printed for each step.

    The input frame is copied first, so the caller's ``df`` is left untouched
    and the function can be called again on the same raw frame.

    Returns
    -------
    pandas.DataFrame
        The fully preprocessed, all-float frame.
    """
    # The first three steps modify their argument in place. Without a copy the
    # caller's frame would be left half-processed (no Date column), and a
    # second call on it would fail with KeyError: 'Date'.
    df = df.copy()

    print("Preprocessing the dataframe")
    print()
    print("Handling missing values")
    missing_values(df)
    print()
    print("Converting Date to three columns: year, month, day")
    convert_date(df)
    print()
    print("Removing redundant columns")
    remove_redundant_columns(df)
    print()
    print("One hot encoding categorical columns")
    df = one_hot_encode(df)
    print()
    print("Converting to float")
    df = df.astype(float)
    print()
    print("Standardizing Arrivals")
    standardize_arrivals(df)
    print()
    return df

# Reload the raw CSV: the bengaluru_df above has already been transformed step by
# step, while preprocess expects the original columns (Date, Market, Unit, District).
bengaluru_df = pd.read_csv('krama_report_bengaluru.csv')
bengaluru_df = preprocess(bengaluru_df)
bengaluru_df.head()

Preprocessing the dataframe

Handling missing values
The number of missing values in each column are: Market      0
Date        0
Variety     0
Grade       0
Arrivals    0
Unit        0
Min         0
Max         0
Modal       0
District    0
dtype: int64
The number of missing values in the dataframe is: 0
The number of missing values after dropping rows with missing values is: 0

Converting Date to three columns: year, month, day

Removing redundant columns

One hot encoding categorical columns

Converting to float

Standardizing Arrivals



Unnamed: 0,Arrivals,Min,Max,Modal,Year,Month,Day,Variety_BELLARY RED,Variety_LOCAL,Variety_ONION,Variety_OTHER,Variety_PUNA,Grade_FAQ,Grade_LARGE,Grade_MEDIUM,Grade_SMALL
0,0.490757,260.0,360.0,310.0,2002.0,2.0,7.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.742186,150.0,240.0,195.0,2002.0,2.0,7.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.187289,170.0,280.0,225.0,2002.0,3.0,27.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.050707,190.0,300.0,245.0,2002.0,3.0,28.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.683216,1800.0,280.0,230.0,2002.0,3.0,30.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


### Cleaning the data and saving it

In [15]:
# Run the same pipeline on every market's raw report and save each result.
# preprocess fits a new scaler on each call, so Arrivals is standardized per market.
# NOTE(review): the one-hot columns presumably depend on the Variety/Grade values
# present in each file, so the saved CSVs may not share a column set - confirm.
for market in all_markets:
    df = pd.read_csv(f'krama_report_{market}.csv')
    df = preprocess(df)
    print(f"Saving preprocessed dataframe for {market}")
    # index=False: the row index is not written to the output file.
    df.to_csv(f'preprocessed_krama_report_{market}.csv', index=False)
    

Preprocessing the dataframe

Handling missing values
The number of missing values in each column are: Market      0
Date        0
Variety     0
Grade       0
Arrivals    0
Unit        0
Min         0
Max         0
Modal       0
District    0
dtype: int64
The number of missing values in the dataframe is: 0
The number of missing values after dropping rows with missing values is: 0

Converting Date to three columns: year, month, day

Removing redundant columns

One hot encoding categorical columns

Converting to float

Standardizing Arrivals

Saving preprocessed dataframe for bengaluru
Preprocessing the dataframe

Handling missing values
The number of missing values in each column are: Market      0
Date        0
Variety     0
Grade       0
Arrivals    0
Unit        0
Min         0
Max         0
Modal       0
District    0
dtype: int64
The number of missing values in the dataframe is: 0
The number of missing values after dropping rows with missing values is: 0

Converting Date to three co