In [None]:
# Import necessary libraries
import pandas as pd
from google.colab import files

# Upload the dataset. files.upload() returns a dict keyed by the name each
# file was saved under in the working directory, which may differ from the
# expected name (e.g. "online_retail (1).csv" after a repeated upload).
uploaded = files.upload()

# Load the dataset from the file that was actually uploaded; fall back to the
# conventional name so a file already present in the working directory still
# works when the upload dialog is cancelled.
csv_name = next(iter(uploaded), 'online_retail.csv')
data = pd.read_csv(csv_name)

# Checking for missing values
print("Missing values per column:")
print(data.isnull().sum())

# Remove fully duplicated rows; the count printed afterwards is a sanity
# check and is expected to be 0.
data = data.drop_duplicates()
print(f"Number of duplicate rows after cleaning: {data.duplicated().sum()}")

# Handle missing values by dropping rows with missing CustomerID or Description
data = data.dropna(subset=['CustomerID', 'Description'])
print("Missing values after removing incomplete rows:")
print(data.isnull().sum())

# Remove rows with invalid data (Quantity <= 0 or UnitPrice <= 0).
# Take an explicit copy so the column assignments below operate on an
# independent frame and cannot trigger SettingWithCopyWarning.
valid_rows = (data['Quantity'] > 0) & (data['UnitPrice'] > 0)
data = data[valid_rows].copy()
print(f"Rows after removing invalid data: {data.shape[0]}")

# Convert InvoiceDate to datetime format
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
print(f"Data type of InvoiceDate: {data['InvoiceDate'].dtype}")

# Add a new column for total price (line total = quantity * unit price)
data['TotalPrice'] = data['Quantity'] * data['UnitPrice']
print(data.head())

# Save the cleaned dataset to a CSV file (without the index column) and
# offer it as a browser download.
cleaned_name = 'cleaned_online_retail.csv'
data.to_csv(cleaned_name, index=False)
files.download(cleaned_name)
