# HSN Code Data Cleaning

This notebook cleans the HSN codes dataset by removing rows that contain the word 'other' (in any case) in the description column.

In [1]:
# Import required libraries
import pandas as pd
import numpy as np

## 1. Load the Data

In [3]:
# Read the CSV file
df = pd.read_csv('HSN codes.csv')

# Display basic information about the dataset
print("Dataset Shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nFirst few rows:")
df.head()

Dataset Shape: (21568, 2)

Columns: ['\nHSNCode', 'Description']

First few rows:


Unnamed: 0,\nHSNCode,Description
0,1,LIVE ANIMALS
1,101,"LIVE HORSES, ASSES, MULES AND HINNIES."
2,1011010,"LIVE HORSES, ASSES, MULES AND HINNIES PURE-BRE..."
3,1011020,"LIVE HORSES, ASSES, MULESANDHINNIES PURE-BRED ..."
4,1011090,"LIVE HORSES, ASSES, MULES AND HINNIES PURE-BRE..."


## 2. Data Cleaning

In [4]:
# Count rows before cleaning
print("Number of rows before cleaning:", len(df))

# Remove rows containing 'other' in description (case-insensitive)
df_cleaned = df[~df['Description'].str.contains('other', case=False, na=False)]

# Count rows after cleaning
print("Number of rows after cleaning:", len(df_cleaned))
print("\nNumber of rows removed:", len(df) - len(df_cleaned))

Number of rows before cleaning: 21568
Number of rows after cleaning: 14071

Number of rows removed: 7497


## 3. Verify the Cleaning

In [5]:
# Check if any rows still contain 'other'
remaining_other = df_cleaned[df_cleaned['Description'].str.contains('other', case=False, na=False)]
print("Number of rows still containing 'other':", len(remaining_other))

# Display sample of cleaned data
print("\nSample of cleaned data:")
df_cleaned.head()

Number of rows still containing 'other': 0

Sample of cleaned data:


Unnamed: 0,\nHSNCode,Description
0,1,LIVE ANIMALS
1,101,"LIVE HORSES, ASSES, MULES AND HINNIES."
2,1011010,"LIVE HORSES, ASSES, MULES AND HINNIES PURE-BRE..."
3,1011020,"LIVE HORSES, ASSES, MULESANDHINNIES PURE-BRED ..."
5,10121,PURE-BRED BREEDING ANIMALS


## 4. Save Cleaned Data

In [6]:
# Save the cleaned dataset
df_cleaned.to_csv('HSN_codes_cleaned.csv', index=False)
print("Cleaned data saved to 'HSN_codes_cleaned.csv'")

Cleaned data saved to 'HSN_codes_cleaned.csv'
