In [5]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# --- PART 1: GENERATE DATA ---
# Seed NumPy's global RNG so every run yields the same synthetic data.
# NOTE: the random draws below must stay in this order to keep the seeded
# output reproducible.
np.random.seed(42)
n_rows = 1000
sales_data = {
    'transaction_id': [f'TRX-{i}' for i in range(n_rows)],
    # Customer ids are drawn from 100..149 (upper bound is exclusive).
    'customer_id': np.random.randint(100, 150, n_rows),
    # Each order falls 0..59 days after 2025-12-01.
    'order_date': [datetime(2025, 12, 1) + timedelta(days=np.random.randint(0, 60)) for _ in range(n_rows)],
    'revenue': np.random.uniform(20.0, 500.0, n_rows),
    'product_category': np.random.choice(['Electronics', 'Home', 'Apparel', 'Beauty'], n_rows),
    'shipping_status': np.random.choice(['Delivered', 'Shipped', 'Delayed', 'Cancelled'], n_rows, p=[0.7, 0.15, 0.1, 0.05])
}
df_sales = pd.DataFrame(sales_data)

# Inject messiness
# Blank out revenue on the first 10 rows. The column position is looked up
# by name so that adding or reordering columns cannot silently redirect the
# NaNs to a different field.
df_sales.iloc[0:10, df_sales.columns.get_loc('revenue')] = np.nan
# Append copies of the first 5 rows to simulate duplicated transactions
# (the copies keep their original index labels 0..4).
df_sales = pd.concat([df_sales, df_sales.iloc[0:5]])

# 8 base reviews repeated 3 times -> 24 reviews. The customer_id column is
# sized from the list itself, so the two columns can never disagree in length.
review_list = ["Love it, but late.", "Broke immediately.", "Great value.", "Damaged box.", "Great support!", "Long wait time.", "High quality.", "Third time delayed!"] * 3
df_reviews = pd.DataFrame({
    'customer_id': np.random.randint(100, 150, len(review_list)),
    'review_text': review_list
})

# --- PART 2: CLEAN DATA (The Senior Analyst Way) ---
# 1. Deduplicate: keep the first row seen for each transaction_id.
df_sales = df_sales.drop_duplicates(subset=['transaction_id'])

# 2. Impute Nulls by Category Median
# Compute each row's category median with the built-in 'median' aggregation
# (no per-group Python lambda), then fill only the missing revenue values.
# Filling the original column, rather than replacing it with the transform
# result, means a row whose product_category is itself missing keeps its
# known revenue: such rows belong to no group, so a transform-based
# overwrite may blank them out.
category_median = df_sales.groupby('product_category')['revenue'].transform('median')
df_sales['revenue'] = df_sales['revenue'].fillna(category_median)

# 3. Quick Sentiment Tagging
def quick_tag(text):
    """Assign a coarse issue category to a review via keyword matching.

    Keywords are matched case-insensitively against the start of each word,
    so 'late' tags "late" and "later" but not "chocolate", and 'broke' also
    tags "broken". Shipping keywords take precedence over quality keywords.

    Args:
        text: Review text. Non-string values (e.g. None or NaN standing in
            for a missing review) fall through to the default category
            instead of raising.

    Returns:
        'Shipping/Service', 'Product Quality', or 'General/Positive'.
    """
    if not isinstance(text, str):
        return 'General/Positive'
    # Turn punctuation into spaces so "late." and "delayed!" yield clean words.
    words = ''.join(ch if ch.isalnum() else ' ' for ch in text.lower()).split()
    if any(word.startswith(('late', 'delayed', 'wait')) for word in words):
        return 'Shipping/Service'
    if any(word.startswith(('broke', 'quality')) for word in words):
        return 'Product Quality'
    return 'General/Positive'

# Tag every review with its coarse issue category.
df_reviews['issue_category'] = df_reviews['review_text'].apply(quick_tag)

print("✅ Success! Data generated and cleaned.")
print(f"Sales Data Shape: {df_sales.shape}")
print(df_reviews[['review_text', 'issue_category']].head())


# Save the cleaned data to your computer
# (relative paths: the files land in the current working directory;
# index=False keeps the pandas row index out of the CSVs).
df_sales.to_csv('transactions_cleaned.csv', index=False)
df_reviews.to_csv('reviews_cleaned.csv', index=False)

print("📂 Success! 'transactions_cleaned.csv' and 'reviews_cleaned.csv' are now saved in your folder.")

✅ Success! Data generated and cleaned.
Sales Data Shape: (1000, 6)
          review_text    issue_category
0  Love it, but late.  Shipping/Service
1  Broke immediately.   Product Quality
2        Great value.  General/Positive
3        Damaged box.  General/Positive
4      Great support!  General/Positive
📂 Success! 'transactions_cleaned.csv' and 'reviews_cleaned.csv' are now saved in your folder.
