In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the Amazon CSV export into a DataFrame. The location is an absolute
# path on the author's machine; adjust it when running elsewhere.
file_path = r'C:\Users\dell\OneDrive\Desktop\DataAnalysis\amazon.csv'
df = pd.read_csv(file_path)

# Preview the top of the table (DataFrame.head() returns the first 5 rows
# by default).
print("First few rows of the dataset:", df.head(), sep="\n")

# Data Exploration

# Count the null entries in every column.
print("\nMissing values in each column:")
print(df.isnull().sum())

# Column names, dtypes, non-null counts and memory usage.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line
# after the report.
print("\nBasic information about the dataset:")
df.info()

# Descriptive statistics as produced by DataFrame.describe() with its defaults.
print("\nSummary statistics of the dataset:")
print(df.describe())

# Summary statistics for a single column.
# The block is skipped entirely unless the dataset has a 'Sales' column; swap
# in the real column name if the dataset calls it something else.
# NOTE(review): the captured head() output in this file shows columns such as
# product_id, discounted_price and rating, with no 'Sales' column among the
# visible ones — confirm the intended column.
if 'Sales' in df.columns:
    mean_sales, median_sales, std_sales = (
        df['Sales'].mean(),
        df['Sales'].median(),
        df['Sales'].std(),
    )
    print(
        f"\nMean Sales: {mean_sales}",
        f"Median Sales: {median_sales}",
        f"Standard Deviation of Sales: {std_sales}",
        sep="\n",
    )

# Visualization

# Distribution of 'Sales' as a 20-bin histogram with black bar outlines.
# Nothing is drawn when the column is absent.
if 'Sales' in df.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(df['Sales'], bins=20, edgecolor='k')
    ax.set_title('Distribution of Sales')
    ax.set_xlabel('Sales')
    ax.set_ylabel('Frequency')
    plt.show()

# Sales plotted against time as a scatter chart.
# Requires both a 'Date' and a 'Sales' column; rename to match the dataset if
# needed. Note that this block rewrites df['Date'] in place as datetimes.
if all(column in df.columns for column in ('Date', 'Sales')):
    # Parse the column so the x-axis is a real time axis (no effect on the
    # values if the column is already datetime).
    df['Date'] = pd.to_datetime(df['Date'])
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.scatter(df['Date'], df['Sales'])
    ax.set_title('Sales over Time')
    ax.set_xlabel('Date')
    ax.set_ylabel('Sales')
    plt.show()

# Grouping Data
# Total 'Sales' per distinct 'Product' value. Runs only when both columns
# exist; substitute the dataset's actual column names if they differ.
if all(column in df.columns for column in ('Product', 'Sales')):
    grouped_data = df.groupby('Product')['Sales'].agg('sum')
    print("\nTotal Sales by Product:", grouped_data, sep="\n")

# Filtering Data
# Keep only the rows whose 'Product' equals the literal 'Product A'; replace
# the literal with a real product name from the dataset.
if 'Product' in df.columns:
    is_product_a = df['Product'].eq('Product A')
    product_a_sales = df.loc[is_product_a]
    print("\nSales records for Product A:", product_a_sales, sep="\n")

# Sorting Data
# Order the rows chronologically by 'Date' and show the earliest ones. Adjust
# the column name if the dataset uses a different one.
if 'Date' in df.columns:
    # Sort on parsed datetimes rather than on the raw column values: 'Date' is
    # only converted to datetime earlier in the script when a 'Sales' column
    # also exists, and sorting unparsed date strings is lexicographic, not
    # chronological. The key only affects ordering; the values stored in
    # sorted_data are left as they are in df. errors='coerce' turns
    # unparseable entries into NaT (sorted last) instead of raising.
    sorted_data = df.sort_values(
        by='Date',
        key=lambda dates: pd.to_datetime(dates, errors='coerce'),
    )
    print("\nSales records sorted by Date:")
    print(sorted_data.head())


First few rows of the dataset:
   product_id                                       product_name  \
0  B07JW9H4J1  Wayona Nylon Braided USB to Lightning Fast Cha...   
1  B098NS6PVG  Ambrane Unbreakable 60W / 3A Fast Charging 1.5...   
2  B096MSW6CT  Sounce Fast Phone Charging Cable & Data Sync U...   
3  B08HDJ86NZ  boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...   
4  B08CF3B7N1  Portronics Konnect L 1.2M Fast Charging 3A 8 P...   

                                            category discounted_price  \
0  Computers&Accessories|Accessories&Peripherals|...             ₹399   
1  Computers&Accessories|Accessories&Peripherals|...             ₹199   
2  Computers&Accessories|Accessories&Peripherals|...             ₹199   
3  Computers&Accessories|Accessories&Peripherals|...             ₹329   
4  Computers&Accessories|Accessories&Peripherals|...             ₹154   

  actual_price discount_percentage rating rating_count  \
0       ₹1,099                 64%    4.2       24,269   
1    