In [1]:
import pandas as pd


In [2]:
df = pd.read_csv("Global_Superstore_Sample.csv")

print("First 5 Rows:")
print(df.head())

print("\nDataset Info:")
print(df.info())


First 5 Rows:
   Order ID  Order Date Customer Name   Region         Category       Sales  \
0      1001  2023-01-01       Priya K    South  Office Supplies   58.737199   
1      1002  2023-01-02       David L  Central        Furniture  146.397966   
2      1003  2023-01-03      John Doe     East       Technology  680.326681   
3      1004  2023-01-04       David L    South        Furniture   54.808505   
4      1005  2023-01-05       David L    South        Furniture  202.767649   

   Quantity      Profit  
0         2  177.271437  
1         3 -148.438066  
2         1  251.276453  
3         8   52.626186  
4         3  213.228733  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Order ID       101 non-null    int64  
 1   Order Date     101 non-null    object 
 2   Customer Name  101 non-null    object 
 3   Region     

In [3]:
print("\nMissing Values:")
print(df.isnull().sum())



Missing Values:
Order ID         0
Order Date       0
Customer Name    0
Region           0
Category         0
Sales            1
Quantity         0
Profit           1
dtype: int64


In [4]:
df["Sales"].fillna(df["Sales"].mean(), inplace=True)
df["Profit"].fillna(df["Profit"].mean(), inplace=True)

print("\nMissing Values After Handling:")
print(df.isnull().sum())



Missing Values After Handling:
Order ID         0
Order Date       0
Customer Name    0
Region           0
Category         0
Sales            0
Quantity         0
Profit           0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Sales"].fillna(df["Sales"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Profit"].fillna(df["Profit"].mean(), inplace=True)


In [5]:
print("\nDuplicate Rows:", df.duplicated().sum())

df.drop_duplicates(inplace=True)

print("Duplicates After Removal:", df.duplicated().sum())



Duplicate Rows: 1
Duplicates After Removal: 0


In [6]:
df["Order Date"] = pd.to_datetime(df["Order Date"])

print("\nUpdated Data Types:")
print(df.dtypes)



Updated Data Types:
Order ID                  int64
Order Date       datetime64[ns]
Customer Name            object
Region                   object
Category                 object
Sales                   float64
Quantity                  int64
Profit                  float64
dtype: object


In [7]:
df.to_csv("Cleaned_Global_Superstore.csv", index=False)

print("Cleaned dataset exported successfully!")


Cleaned dataset exported successfully!


In [8]:
print("\nSummary Statistics:")
print(df.describe())



Summary Statistics:
          Order ID           Order Date       Sales    Quantity      Profit
count   100.000000                  100  100.000000  100.000000  100.000000
mean   1050.500000  2023-02-19 12:00:00  542.280106    4.580000   59.009947
min    1001.000000  2023-01-01 00:00:00   54.808505    1.000000 -194.581174
25%    1025.750000  2023-01-25 18:00:00  320.844078    2.000000  -59.966156
50%    1050.500000  2023-02-19 12:00:00  572.236801    5.000000   62.699738
75%    1075.250000  2023-03-16 06:00:00  755.089349    7.000000  195.054184
max    1100.000000  2023-04-10 00:00:00  990.551158    9.000000  296.482398
std      29.011492                  NaN  275.325189    2.559119  156.542351
