# Data Cleaning
## Step 1: Loading the Dataset

In [3]:
import pandas as pd

# Location of the raw Online Retail workbook, relative to the notebook.
file_path = 'data/Online_Retail.xlsx'

# Read the workbook (pandas loads the first sheet by default) into a DataFrame.
original_data = pd.read_excel(io=file_path)

# Preview the first five rows to sanity-check the load.
original_data.head(n=5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [4]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
# The non-null counts are the quickest way to spot which columns have gaps.
original_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [11]:
# Count the missing values in each column.
original_data.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [12]:
# Collect every row that has at least one missing value in any column.
missing_rows = original_data[original_data.isnull().any(axis=1)]

# Leave the frame as the cell's last expression so Jupyter renders it as a
# truncated table, rather than the wrapped plain-text dump print() produces.
missing_rows

       InvoiceNo StockCode                      Description  Quantity  \
622       536414     22139                              NaN        56   
1443      536544     21773  DECORATIVE ROSE BATHROOM BOTTLE         1   
1444      536544     21774  DECORATIVE CATS BATHROOM BOTTLE         2   
1445      536544     21786               POLKADOT RAIN HAT          4   
1446      536544     21787            RAIN PONCHO RETROSPOT         2   
...          ...       ...                              ...       ...   
541536    581498    85099B          JUMBO BAG RED RETROSPOT         5   
541537    581498    85099C   JUMBO  BAG BAROQUE BLACK WHITE         4   
541538    581498     85150    LADIES & GENTLEMEN METAL SIGN         1   
541539    581498     85174                S/4 CACTI CANDLES         1   
541540    581498       DOT                   DOTCOM POSTAGE         1   

               InvoiceDate  UnitPrice  CustomerID         Country  
622    2010-12-01 11:52:00       0.00         NaN  Unit

In [44]:
# Show only the rows whose Description is missing.
original_data.loc[original_data['Description'].isna()]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
1970,536545,21134,,1,2010-12-01 14:32:00,0.0,,United Kingdom
1987,536549,85226A,,1,2010-12-01 14:34:00,0.0,,United Kingdom
1988,536550,85044,,1,2010-12-01 14:34:00,0.0,,United Kingdom
2024,536552,20950,,1,2010-12-01 14:34:00,0.0,,United Kingdom
2026,536554,84670,,23,2010-12-01 14:35:00,0.0,,United Kingdom
...,...,...,...,...,...,...,...,...
280754,561498,21610,,-14,2011-07-27 14:10:00,0.0,,United Kingdom
281615,561555,37477B,,-11,2011-07-28 10:21:00,0.0,,United Kingdom
281616,561557,37477C,,-31,2011-07-28 10:21:00,0.0,,United Kingdom
346849,567207,35592T,,4,2011-09-19 11:01:00,0.0,,United Kingdom


In [43]:
# Inspect every row recorded for StockCode 37509 (compared as an integer).
original_data.loc[original_data['StockCode'] == 37509]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
1972,536547,37509,NEW ENGLAND MUG W GIFT BOX,1,2010-12-01 14:33:00,0.0,,United Kingdom
9332,537199,37509,NEW ENGLAND MUG W GIFT BOX,1,2010-12-05 14:06:00,2.55,15894.0,United Kingdom
11603,537254,37509,NEW ENGLAND MUG W GIFT BOX,1,2010-12-06 11:01:00,2.55,15898.0,United Kingdom
21785,538136,37509,NEW ENGLAND MUG W GIFT BOX,2,2010-12-09 15:57:00,0.0,,United Kingdom


In [66]:
# Tally the Description values (NaN included) recorded for StockCode 22139.
original_data.loc[original_data['StockCode'] == 22139, 'Description'].value_counts(dropna=False)

Description
RETROSPOT TEA SET CERAMIC 11 PC    993
Name: count, dtype: int64

In [42]:
# Use the StockCode to fill in missing values in Description.
#
# For every StockCode, take the first non-missing Description that appears in
# the data (this assumes a StockCode always maps to the same product name) and
# use it wherever that StockCode's Description is missing. StockCodes that
# never have a Description are left as NaN.
#
# One lookup table plus Series.map replaces rescanning the whole frame once
# per StockCode, which is needlessly slow on a frame of this size.
description_by_stock_code = (
    original_data
    .dropna(subset=['Description'])
    .drop_duplicates(subset='StockCode', keep='first')
    .set_index('StockCode')['Description']
)

# Only rows that are currently missing a Description are touched.
missing_description = original_data['Description'].isnull()
original_data.loc[missing_description, 'Description'] = (
    original_data.loc[missing_description, 'StockCode'].map(description_by_stock_code)
)

In [65]:
# Overwrite every non-null Description that is at most 10 characters long and
# not fully upper-case with a placeholder label.
# NOTE(review): presumably these short entries are ad-hoc stock notes rather
# than product names — confirm the 10-character cut-off against the data.
description_text = original_data['Description'].astype(str)
is_upper = description_text.str.isupper()
is_short = description_text.str.len() <= 10
# astype(str) turns NaN into the string 'nan', so exclude real nulls explicitly.
is_present = ~original_data['Description'].isnull()

original_data.loc[~is_upper & is_short & is_present, 'Description'] = 'Invalid Description Naming'

In [57]:
"""
array(['Discount', nan, 'Manual', 'Amazon', '?', 'Check', 'Damages',
       'Faulty', 'Found', 'Counted', 'Given Away', 'Dotcom', 'Showroom',
       'Adjustment', 'Dotcom Set', 'Broken', 'Throw Away', '?Lost',
       'Damages?', 'Cracked', 'Damaged', 'Display', 'Missing', 'Returned',
       'Wrong Code', 'Adjust', 'Crushed', 'Samples', 'Mailout ',
       'Mailout', 'Wet/Rusty', 'Smashed', 'Ebay', '?Display?',
       'Sold As 1', '?Missing', 'Test', '??', 'Found Box', 'Dagamed',
       "Can'T Find", 'Mouldy', 'Sale Error', 'Breakages', 'Missing?',
       'Wet Rusty', '???Lost', 'John Lewis', 'Check?', '?? Missing',
       'Wet Pallet', '???Missing', 'Wet?', 'Lost??', '???', 'Wet',
       'Wet Boxes', 'Mixed Up', 'Lost'], dtype=object)
"""

array(['Discount', nan, 'Manual', 'Amazon', '?', 'Check', 'Damages',
       'Faulty', 'Found', 'Counted', 'Given Away', 'Dotcom', 'Showroom',
       'Adjustment', 'Dotcom Set', 'Broken', 'Throw Away', '?Lost',
       'Damages?', 'Cracked', 'Damaged', 'Display', 'Missing', 'Returned',
       'Wrong Code', 'Adjust', 'Crushed', 'Samples', 'Mailout ',
       'Mailout', 'Wet/Rusty', 'Smashed', 'Ebay', '?Display?',
       'Sold As 1', '?Missing', 'Test', '??', 'Found Box', 'Dagamed',
       "Can'T Find", 'Mouldy', 'Sale Error', 'Breakages', 'Missing?',
       'Wet Rusty', '???Lost', 'John Lewis', 'Check?', '?? Missing',
       'Wet Pallet', '???Missing', 'Wet?', 'Lost??', '???', 'Wet',
       'Wet Boxes', 'Mixed Up', 'Lost'], dtype=object)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  not_uppercase['Description'] = 'Invalid Description naming'
