In [1]:
# import libraries
!pip install kaggle



In [2]:
# download dataset using kaggle api
import kaggle
!kaggle datasets download ankitbansal06/retail-orders -f orders.csv #Dataset is imported

Dataset URL: https://www.kaggle.com/datasets/ankitbansal06/retail-orders
License(s): CC0-1.0
orders.csv: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
# read data from the file and handle null values
import pandas as pd

# treat the strings 'Not Available' and 'unknown' as missing values (NaN) -- IMPORTANT
df = pd.read_csv('orders.csv', na_values=['Not Available', 'unknown'])

# Normalise the headers (trim, lower-case, spaces -> underscores) immediately after
# loading. In file order the notebook indexes df['ship_mode'] before its dedicated
# rename cell, which raises KeyError on a fresh "Restart & Run All" if the raw
# headers are not already snake_case. Applying the same normalisation again later
# is a no-op, so the rename cell further down stays valid.
df.columns = [name.strip().lower().replace(' ', '_') for name in df.columns]

In [6]:
# sanity check: list the distinct ship_mode values (NaN included) to confirm
# that the placeholder strings were read as missing values
df.loc[:, 'ship_mode'].unique()

array(['Second Class', 'Standard Class', nan, 'First Class', 'Same Day'],
      dtype=object)

In [5]:
# standardise the column names: trim surrounding whitespace, lower-case them,
# and replace spaces with underscores
df.columns = df.columns.map(lambda label: label.strip().lower().replace(' ', '_'))

In [7]:
# derive new columns: discount, sale_price and profit
# discount   = list_price * discount_percent * 0.01
#              (0.01 presumably converts a whole-number percentage to a fraction -- confirm)
# sale_price = list_price - discount
# profit     = sale_price - cost_price
df['discount'] = df['list_price'].mul(df['discount_percent']).mul(0.01)
df['sale_price'] = df['list_price'].sub(df['discount'])
df['profit'] = df['sale_price'].sub(df['cost_price'])

In [8]:
# convert order_date from object (string) dtype to datetime;
# the explicit format means values are parsed strictly as YYYY-MM-DD
df['order_date'] = pd.to_datetime(
    df['order_date'],
    format='%Y-%m-%d',
)

In [9]:
# remove cost_price, list_price and discount_percent from df in place;
# the derived discount / sale_price / profit columns have already been computed from them
df.drop(
    ['cost_price', 'list_price', 'discount_percent'],
    axis='columns',
    inplace=True,
)

In [None]:
# remove fully duplicated rows in place: all columns are compared and the
# first occurrence of each duplicate group is kept
df.drop_duplicates(subset=None, keep='first', inplace=True)

In [17]:
# preview 10 randomly chosen rows of the cleaned frame
# (no random_state is set, so the rows shown change on every run)
df.sample(n=10)

Unnamed: 0,order_id,ship_mode,segment,country,city,state,postal_code,region,category,sub_category,product_id,quantity,discount,sale_price,profit
3796,3797,Standard Class,Consumer,United States,San Diego,California,92037,West,Office Supplies,Art,OFF-AR-10003179,4,1.6,38.4,-1.6
1050,1051,First Class,Corporate,United States,Philadelphia,Pennsylvania,19134,East,Furniture,Furnishings,FUR-FU-10000293,2,5.1,164.9,24.9
1647,1648,First Class,Home Office,United States,Philadelphia,Pennsylvania,19140,East,Office Supplies,Fasteners,OFF-FA-10001135,2,0.0,0.0,0.0
5337,5338,Standard Class,Corporate,United States,Scottsdale,Arizona,85254,West,Office Supplies,Paper,OFF-PA-10000062,7,12.4,297.6,7.6
5869,5870,Standard Class,Home Office,United States,Cheyenne,Wyoming,82001,West,Furniture,Chairs,FUR-CH-10001215,4,64.0,1536.0,156.0
9309,9310,Standard Class,Consumer,United States,San Francisco,California,94110,West,Office Supplies,Art,OFF-AR-10000034,6,1.5,28.5,-1.5
2729,2730,Standard Class,Consumer,United States,Seattle,Washington,98105,West,Office Supplies,Art,OFF-AR-10001547,3,0.3,9.7,-0.3
5940,5941,Second Class,Consumer,United States,Los Angeles,California,90049,West,Technology,Accessories,TEC-AC-10001465,12,8.8,431.2,61.2
3182,3183,Second Class,Corporate,United States,Columbia,Maryland,21044,East,Technology,Accessories,TEC-AC-10004666,3,16.6,813.4,153.4
1028,1029,Standard Class,Corporate,United States,Hackensack,New Jersey,7601,East,Office Supplies,Paper,OFF-PA-10000474,5,9.0,171.0,11.0
