In [3]:
import pandas as pd
import matplotlib.pyplot as plt

# Load your data
file_path = r'C:\Users\vishw\Downloads\retail_sales_dataset.csv'  # Raw string notation to handle backslashes
print(f"Attempting to load file from: {file_path}")

try:
    df = pd.read_csv(file_path)
    print("Columns in the dataset:", df.columns)

    # Handle missing values (e.g., drop rows with missing values or fill them with appropriate values)
    df = df.dropna()

    # Convert the 'purchase_date' column to datetime
    if 'purchase_date' in df.columns:
        df['purchase_date'] = pd.to_datetime(df['purchase_date'])

    # Display the updated dataframe
    print(df.info())
    print(df.head())

    # Analyze age distribution
    age_distribution = df['age'].describe()
    print("Age Distribution:\n", age_distribution)

    # Analyze gender distribution
    gender_distribution = df['gender'].value_counts()
    print("Gender Distribution:\n", gender_distribution)

    # Analyze location distribution
    location_distribution = df['location'].value_counts()
    print("Location Distribution:\n", location_distribution)

    # Analyze product preferences
    product_preferences = df['product'].value_counts()
    print("Product Preferences:\n", product_preferences)

    # Analyze purchase frequency
    purchase_frequency = df['customer_id'].value_counts()
    print("Purchase Frequency:\n", purchase_frequency)

    # Analyze average purchase amount
    average_purchase_amount = df['purchase_amount'].mean()
    print("Average Purchase Amount:\n", average_purchase_amount)

    # Plot age distribution
    plt.figure(figsize=(10, 6))
    df['age'].hist(bins=20)
    plt.title('Age Distribution')
    plt.xlabel('Age')
    plt.ylabel('Frequency')
    plt.show()

    # Plot gender distribution
    plt.figure(figsize=(10, 6))
    df['gender'].value_counts().plot(kind='bar')
    plt.title('Gender Distribution')
    plt.xlabel('Gender')
    plt.ylabel('Count')
    plt.show()

    # Plot location distribution
    plt.figure(figsize=(10, 6))
    df['location'].value_counts().plot(kind='bar')
    plt.title('Location Distribution')
    plt.xlabel('Location')
    plt.ylabel('Count')
    plt.show()

    # Plot product preferences
    plt.figure(figsize=(10, 6))
    df['product'].value_counts().plot(kind='bar')
    plt.title('Product Preferences')
    plt.xlabel('Product')
    plt.ylabel('Count')
    plt.show()

    # Plot purchase frequency
    plt.figure(figsize=(10, 6))
    df['customer_id'].value_counts().plot(kind='bar')
    plt.title('Purchase Frequency')
    plt.xlabel('Customer ID')
    plt.ylabel('Frequency')
    plt.show()

except FileNotFoundError:
    print(f"File not found: {file_path}. Please check the file path and try again.")
except ValueError as ve:
    print(f"ValueError: {ve}")
except Exception as e:
    print(f"An error occurred: {e}")

Attempting to load file from: C:\Users\vishw\Downloads\retail_sales_dataset.csv
Columns in the dataset: Index(['Transaction ID', 'Date', 'Customer ID', 'Gender', 'Age',
       'Product Category', 'Quantity', 'Price per Unit', 'Total Amount'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Transaction ID    1000 non-null   int64 
 1   Date              1000 non-null   object
 2   Customer ID       1000 non-null   object
 3   Gender            1000 non-null   object
 4   Age               1000 non-null   int64 
 5   Product Category  1000 non-null   object
 6   Quantity          1000 non-null   int64 
 7   Price per Unit    1000 non-null   int64 
 8   Total Amount      1000 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 70.4+ KB
None
   Transaction ID        Date Customer ID  Gender  Age Product Categ