## 1. Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import os

## 2. Load the data

In [2]:
# Location of the sales CSV. Defaults to the original local path; set the
# SALES_DATA_PATH environment variable to run the notebook on another machine
# without editing this cell.
file_path = os.environ.get(
    "SALES_DATA_PATH",
    r"C:\Users\subhi.gupta\Downloads\Case studies\Sales\datasets\sales-data.csv",
)
# Parse `timestamp` while reading so the column is loaded as datetimes.
df = pd.read_csv(file_path, parse_dates=["timestamp"])
df

Unnamed: 0,timestamp,product_id,category,price,quantity,region,customer_age,channel
0,2024-01-01 00:00:00,P007,sports,41393.19,4,Mumbai,56,retail
1,2024-01-01 01:00:00,P004,grocery,38249.94,4,Delhi,29,retail
2,2024-01-01 02:00:00,P005,food,28719.09,2,Coimbatore,22,online
3,2024-01-01 03:00:00,P007,sports,47806.75,1,Chennai,19,online
4,2024-01-01 04:00:00,P003,clothing,10103.68,1,Salem,33,retail
...,...,...,...,...,...,...,...,...
4995,2024-07-27 03:00:00,P005,food,4009.08,1,Hyderabad,43,retail
4996,2024-07-27 04:00:00,P006,beauty,3721.48,1,Coimbatore,50,online
4997,2024-07-27 05:00:00,P003,clothing,42530.85,1,Mumbai,53,retail
4998,2024-07-27 06:00:00,P002,electronics,10721.22,4,Coimbatore,21,retail


## 3. Data profiling

### 3.1 Identify missing, anomalous, or inconsistent values.

In [3]:
# Summarise the frame: per-column non-null counts and dtypes, plus memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   timestamp     5000 non-null   datetime64[ns]
 1   product_id    5000 non-null   object        
 2   category      5000 non-null   object        
 3   price         5000 non-null   float64       
 4   quantity      5000 non-null   int64         
 5   region        5000 non-null   object        
 6   customer_age  5000 non-null   int64         
 7   channel       5000 non-null   object        
dtypes: datetime64[ns](1), float64(1), int64(2), object(4)
memory usage: 312.6+ KB


In [4]:
# Range check: print the minimum and maximum of each numeric column so that
# negative or otherwise implausible values stand out.
for numeric_col in ("price", "quantity", "customer_age"):
    column = df[numeric_col]
    print(column.min(), column.max())

100.58 49977.93
1 4
18 59


### 3.2 Check for unique values for features with object dtype & check their data type too

In [5]:
# Distinct product IDs, plus the Python type of the first entry.
# `.iloc[0]` picks the first row by position, so the check does not rely on
# the index containing the label 0.
df["product_id"].unique(), type(df["product_id"].iloc[0])

(array(['P007', 'P004', 'P005', 'P003', 'P008', 'P002', 'P006', 'P001'],
       dtype=object),
 str)

In [6]:
# Distinct categories, plus the Python type of the first entry.
# `.iloc[0]` picks the first row by position, so the check does not rely on
# the index containing the label 0.
df["category"].unique(), type(df["category"].iloc[0])

(array(['sports', 'grocery', 'food', 'clothing', 'home', 'electronics',
        'beauty'], dtype=object),
 str)

In [7]:
# Distinct regions, plus the Python type of the first entry.
# `.iloc[0]` picks the first row by position, so the check does not rely on
# the index containing the label 0.
df["region"].unique(), type(df["region"].iloc[0])

(array(['Mumbai', 'Delhi', 'Coimbatore', 'Chennai', 'Salem', 'Hyderabad',
        'Kochi', 'Bangalore'], dtype=object),
 str)

In [8]:
# Distinct sales channels, plus the Python type of the first entry.
# `.iloc[0]` picks the first row by position, so the check does not rely on
# the index containing the label 0.
df["channel"].unique(), type(df["channel"].iloc[0])

(array(['retail', 'online'], dtype=object), str)

### 3.3 Fix data types

In [9]:
# Check the data type of the datetime column. The first entry is selected by
# position (`.iloc[0]`) so the check does not rely on the index containing
# the label 0.
type(df["timestamp"].iloc[0])

pandas._libs.tslibs.timestamps.Timestamp

In [10]:
# Kept for reference only: no conversion is needed because `timestamp` is
# already parsed as datetime by `parse_dates` in the `pd.read_csv` call.
# df['timestamp'] = pd.to_datetime(df['timestamp'])
# print(type(df["timestamp"][0]))

### 3.4 Check for duplicate rows

In [11]:
# Flag every row that exactly repeats an earlier row, then count the flags.
duplicate_mask = df.duplicated()
duplicate_mask.sum()

0

## Conclusion through basic data profiling
- Data has no null values.
- Data has no duplicate rows.
- Data has no inconsistent values.