In [56]:
# Import necessary libraries
import pandas as pd

In [57]:
# ### Read CSV Files
# Both splits are read from the working directory with pandas' default parser
# settings; the first file becomes `train_data`, the second `test_data`.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [58]:
# ### Check for Duplicates
# Stack the two splits into one frame (with a fresh 0..n-1 index) so that a
# row repeated inside either file, or shared between them, is caught in a
# single pass.
combined_data = pd.concat([train_data, test_data], ignore_index=True)

# Boolean mask: True for every row that repeats an earlier row in all columns
# (the first occurrence itself is not flagged).
duplicates = combined_data.duplicated()

# Report the outcome; when repeats exist, also print the offending rows.
if not duplicates.any():
    print("There are no duplicated rows.")
else:
    print("There are duplicated rows.")
    print(combined_data.loc[duplicates])

There are no duplicated rows.


In [59]:
# ### Handle Missing Values in 'Item_Weight'
# 'Item_Weight' contains nulls. Summarise the recorded weights per
# 'Item_Identifier' first, so the spread within each item can be inspected
# before a replacement value is chosen. Nulls are skipped by these
# aggregations, so an item with no recorded weight gets NaN statistics.
weight_statistics = train_data.groupby('Item_Identifier').agg(
    mean=('Item_Weight', 'mean'),
    std=('Item_Weight', 'std'),
    min=('Item_Weight', 'min'),
    q25=('Item_Weight', lambda x: x.quantile(0.25)),
    q50=('Item_Weight', 'median'),
    q75=('Item_Weight', lambda x: x.quantile(0.75)),
    max=('Item_Weight', 'max')
)

# ### Remove Irrelevant Records
# An item whose weight is null in *every* one of its rows has no per-item mean
# to impute from, so its rows are dropped. The identifiers are derived from the
# data rather than hard-coded, so the step keeps working if the input changes
# and no unfillable item is silently left with NaN weights.
# NOTE(review): this notebook previously listed 'FDN52', 'FDK57', 'FDE52',
# 'FDQ60' by hand (described as single-record items with a null weight); the
# derived list is expected to match on the original train.csv - confirm.
known_weights_per_row = train_data.groupby('Item_Identifier')['Item_Weight'].transform('count')
records_to_remove = (
    train_data.loc[known_weights_per_row.eq(0), 'Item_Identifier'].unique().tolist()
)
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so the column assignment below cannot raise SettingWithCopyWarning.
train_data = train_data[~train_data['Item_Identifier'].isin(records_to_remove)].copy()

# ### Fill Missing Values with Means
# transform('mean') broadcasts each item's mean weight back onto its rows, so
# the result is index-aligned with train_data and can be passed to fillna.
# Only null entries are replaced; recorded weights are left untouched.
mean_weight_per_item = train_data.groupby('Item_Identifier')['Item_Weight'].transform('mean')
train_data['Item_Weight'] = train_data['Item_Weight'].fillna(mean_weight_per_item)

In [61]:
# ### Standardize 'Item_Fat_Content' Categories
# Several spellings denote the same fat-content level; map each variant onto
# one canonical label. Values that are not keys of the mapping pass through
# unchanged.
category_replacements = {
    **dict.fromkeys(('LF', 'low fat'), 'Low Fat'),
    'reg': 'Regular',
}
train_data['Item_Fat_Content'] = train_data['Item_Fat_Content'].replace(
    to_replace=category_replacements
)

In [62]:
# ### Consolidate Rare Categories in 'Item_Type'
# These item types are treated as under-represented (the notebook's stated
# threshold is fewer than 200 records), so they are folded into 'Others'.
rare_item_types = ['Starchy Foods', 'Breakfast', 'Seafood']
is_rare_item_type = train_data['Item_Type'].isin(rare_item_types)
train_data.loc[is_rare_item_type, 'Item_Type'] = 'Others'
