# EDA

In [1]:
import warnings
# Suppress every warning for the rest of the session (no category/module filter is given).
# NOTE(review): this also hides deprecation and dtype warnings raised by the libraries
# used below; consider narrowing the filter once the noisy warning is identified.
warnings.filterwarnings(action='ignore')

In [2]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf

In [3]:
from ydata_profiling import ProfileReport

In [4]:
%%time
# Load the raw sales extract; the path is relative to the notebook's working directory.
# NOTE(review): no dtype/parse_dates arguments are passed, so 'Date' is read as a
# plain object column (see the df.info() output).
df = pd.read_csv("../Data/Raw.csv")

CPU times: user 9.27 s, sys: 2.02 s, total: 11.3 s
Wall time: 11.5 s


In [5]:
# Show 10 randomly chosen rows; no random_state is set, so the selection changes on every run.
df.sample(10)

Unnamed: 0,Location ID,City,State,Country,Latitude,Longitude,Product ID,Product Category,Sales Volume,Sales Revenue,Date
7940657,7940658,Los Angeles,CA,USA,34.0522,-118.2437,FOOD6001,Organic Snacks,96.0,1959.44,2024-05-20
357245,357246,Delhi,Delhi,India,28.7041,77.1025,TOYS4003,Board Game,62.0,4293.47,2023-02-05
9426917,9426918,Toronto,,Canada,43.651,-79.347,HOME3003,Bed,123.2,117302.82,2023-12-23
3189129,3189130,New York,NY,USA,40.7128,-74.006,ELEC1001,Smartphone,36.0,47936.37,2024-03-02
7207366,7207367,Toronto,,Canada,43.651,-79.347,ELEC1002,Laptop,39.0,95881.48,2022-01-13
5920420,5920421,New York,NY,USA,40.7128,-74.006,HOME3001,Sofa,105.0,90203.02,2024-02-22
1023145,1023146,Chennai,Tamil Nadu,India,13.0827,80.2707,FOOD6005,Energy Drinks,106.8,2819.35,2023-06-25
4579862,4579863,Chicago,IL,USA,41.8781,-87.6298,TOYS4004,Puzzle,69.0,917.07,2022-06-08
9175405,9175406,Los Angeles,CA,USA,34.0522,-118.2437,BOOK5001,Novel,53.0,1719.19,2023-03-06
5336227,5336228,Kolkata,West Bengal,India,22.5726,88.3639,HOME3004,Microwave,98.0,15818.45,2023-05-22


In [6]:
# Column dtypes, row count and memory footprint of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 11 columns):
 #   Column            Dtype  
---  ------            -----  
 0   Location ID       int64  
 1   City              object 
 2   State             object 
 3   Country           object 
 4   Latitude          float64
 5   Longitude         float64
 6   Product ID        object 
 7   Product Category  object 
 8   Sales Volume      float64
 9   Sales Revenue     float64
 10  Date              object 
dtypes: float64(4), int64(1), object(6)
memory usage: 839.2+ MB


In [7]:
# Number of missing values per column.
df.isna().sum()

Location ID               0
City                      0
State               3706818
Country                   0
Latitude                  0
Longitude                 0
Product ID                0
Product Category          0
Sales Volume              0
Sales Revenue             0
Date                      0
dtype: int64

In [8]:
# `isnull` is pandas' alias for `isna`, so this repeats the missing-value count from the previous cell.
df.isnull().sum()

Location ID               0
City                      0
State               3706818
Country                   0
Latitude                  0
Longitude                 0
Product ID                0
Product Category          0
Sales Volume              0
Sales Revenue             0
Date                      0
dtype: int64

In [9]:
# Percentage of missing values per column: the mean of the boolean
# missing-mask is the missing fraction, scaled here to 0-100.
missing_fraction = df.isna().mean()
null_per = missing_fraction * 100
null_per

Location ID          0.00000
City                 0.00000
State               37.06818
Country              0.00000
Latitude             0.00000
Longitude            0.00000
Product ID           0.00000
Product Category     0.00000
Sales Volume         0.00000
Sales Revenue        0.00000
Date                 0.00000
dtype: float64

In [10]:
# Number of rows that are exact duplicates (across all columns) of an earlier row.
df.duplicated().sum()

0

In [11]:
# Distinct country names present in the data.
df['Country'].unique()

array(['Japan', 'India', 'Canada', 'USA', 'UK', 'Germany', 'France',
       'Australia'], dtype=object)

In [12]:
# Distinct State values, including NaN for the rows where State is missing.
df['State'].unique()

array([nan, 'Telangana', 'Delhi', 'IL', 'NY', 'Tamil Nadu', 'Maharashtra',
       'Karnataka', 'TX', 'CA', 'West Bengal', 'FL'], dtype=object)

In [13]:
# Distinct city names present in the data.
df['City'].unique()

array(['Tokyo', 'Hyderabad', 'Delhi', 'Toronto', 'Chicago', 'New York',
       'Chennai', 'Mumbai', 'Bangalore', 'Houston', 'London', 'Berlin',
       'Paris', 'Sydney', 'Los Angeles', 'Kolkata', 'Miami'], dtype=object)

pip install geopy

%%time
from geopy.geocoders import Nominatim
import time  # NOTE(review): not referenced anywhere in the visible cells

# Module-level Nominatim client that get_state_from_coords calls for each lookup.
geolocator = Nominatim(user_agent="geoapi")

def get_state_from_coords(lat, lon):
    """Reverse-geocode a coordinate pair and return its state/region name.

    Calls the module-level ``geolocator`` (``reverse`` with ``language='en'``)
    and reads the ``'state'`` entry of the result's ``raw['address']`` mapping.

    Parameters
    ----------
    lat, lon
        Coordinates, passed to the geocoder as a ``(lat, lon)`` tuple.

    Returns
    -------
    The ``'state'`` value of the returned address, or ``None`` when the
    geocoder returns no result, the address has no ``'state'`` entry, or the
    lookup raises an ordinary error (best-effort behaviour).
    """
    try:
        location = geolocator.reverse((lat, lon), language='en')
        if location is None:
            # No match for these coordinates.
            return None
        address = location.raw.get('address', {})
        return address.get('state')
    except Exception:
        # Best-effort: a failed lookup yields a missing value rather than
        # aborting the whole fill. Catching Exception (not a bare except)
        # lets KeyboardInterrupt / SystemExit through, so a long-running
        # geocoding pass can still be stopped.
        return None

# Apply only to rows where State is missing.
# The data contains only a handful of distinct cities, so each distinct
# (Latitude, Longitude) pair is reverse-geocoded once and the result is joined
# back to the rows, instead of issuing one geocoder request per missing row.
missing_state = df['State'].isna()
missing_coords = df.loc[missing_state, ['Latitude', 'Longitude']]

coord_states = missing_coords.drop_duplicates().copy()
coord_states['State'] = [
    get_state_from_coords(lat, lon)
    for lat, lon in zip(coord_states['Latitude'], coord_states['Longitude'])
]

# A left merge preserves the row order of `missing_coords`, so the looked-up
# states can be written back positionally (to_numpy avoids index alignment,
# because merge resets the index).
df.loc[missing_state, 'State'] = missing_coords.merge(
    coord_states, on=['Latitude', 'Longitude'], how='left'
)['State'].to_numpy()

In [14]:
# City -> State/region label, used to fill rows whose State is missing.
# Entries commented "new label" use a value that does not occur among the State
# values already present in the data (see df['State'].unique()); the uncommented
# entries reuse the labels that already appear there.
city_to_state = {
    'Tokyo': 'Tokyo Prefecture',       # new label (Japan)
    'Hyderabad': 'Telangana',
    'Delhi': 'Delhi',
    'Toronto': 'Ontario',              # new label (Canadian province)
    'Chicago': 'IL',
    'New York': 'NY',
    'Chennai': 'Tamil Nadu',
    'Mumbai': 'Maharashtra',
    'Bangalore': 'Karnataka',
    'Houston': 'TX',
    'London': 'England',               # new label (UK)
    'Berlin': 'Berlin State',         # new label (Germany)
    'Paris': 'Île-de-France',         # new label (France)
    'Sydney': 'New South Wales',      # new label (Australia)
    'Los Angeles': 'CA',
    'Kolkata': 'West Bengal',
    'Miami': 'FL'
}

In [15]:
%%time
df['State'] = df.apply(
    lambda row: city_to_state.get(row['City']) if pd.isnull(row['State']) else row['State'],
    axis=1
)

CPU times: user 56.1 s, sys: 1.85 s, total: 58 s
Wall time: 58 s


In [None]:
# Write the frame (with the filled State column) to disk; index=False leaves the RangeIndex out of the file.
df.to_csv("../Data/processed.csv",index=False)

In [18]:
# Build the ydata-profiling report for the full DataFrame and keep it in `profile`.
profile = ProfileReport(df, title="My Dataset Report", explorative=True)

# Export the report to an HTML file.
profile.to_file("../reports/eda_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                       | 0/11 [00:00<?, ?it/s][A
  9%|████▎                                          | 1/11 [01:10<11:44, 70.42s/it][A
100%|██████████████████████████████████████████████| 11/11 [01:12<00:00,  6.57s/it][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [19]:
# Display the report inline in this notebook.
profile.to_notebook_iframe()