In [2]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages

# Load datasets
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Merge datasets for analysis
merged_data = transactions.merge(customers, on="CustomerID").merge(products, on="ProductID")

# ------------------------
# Exploratory Data Analysis (EDA)
# ------------------------

# Create visualizations
sns.set(style="whitegrid")

# 1. Customers by Region
plt.figure(figsize=(8, 5))
region_counts = customers['Region'].value_counts()
sns.barplot(x=region_counts.index, y=region_counts.values, palette="viridis")
plt.title("Number of Customers by Region")
plt.xlabel("Region")
plt.ylabel("Number of Customers")
plt.tight_layout()
plt.savefig("customers_by_region.png")  # Save for PDF
plt.close()

# 2. Top 5 Most Purchased Products
plt.figure(figsize=(8, 5))
top_products = merged_data.groupby('ProductName')['Quantity'].sum().sort_values(ascending=False).head(5)
sns.barplot(x=top_products.values, y=top_products.index, palette="mako")
plt.title("Top 5 Most Purchased Products")
plt.xlabel("Total Quantity Sold")
plt.ylabel("Product Name")
plt.tight_layout()
plt.savefig("top_products.png")
plt.close()

# 3. Sales Over Time
plt.figure(figsize=(12, 6))
sales_over_time = merged_data.groupby('TransactionDate')['TotalValue'].sum()
sales_over_time.plot()
plt.title("Total Sales Over Time")
plt.xlabel("Date")
plt.ylabel("Total Sales (USD)")
plt.tight_layout()
plt.savefig("sales_over_time.png")
plt.close()

# 4. Average Transaction Value by Region
plt.figure(figsize=(8, 5))
avg_transaction_value = merged_data.groupby('Region')['TotalValue'].mean()
sns.barplot(x=avg_transaction_value.index, y=avg_transaction_value.values, palette="coolwarm")
plt.title("Average Transaction Value by Region")
plt.xlabel("Region")
plt.ylabel("Average Transaction Value (USD)")
plt.tight_layout()
plt.savefig("avg_transaction_value.png")
plt.close()

# 5. Revenue by Product Category
plt.figure(figsize=(8, 5))
category_revenue = merged_data.groupby('Category')['TotalValue'].sum().sort_values(ascending=False)
sns.barplot(x=category_revenue.values, y=category_revenue.index, palette="rocket")
plt.title("Revenue by Product Category")
plt.xlabel("Total Revenue (USD)")
plt.ylabel("Category")
plt.tight_layout()
plt.savefig("category_revenue.png")
plt.close()

# ------------------------
# Generate PDF Report
# ------------------------

# Write text and include visualizations in a PDF
from fpdf import FPDF

class PDFReport(FPDF):
    def header(self):
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, 'Data Science Intern Assignment Report', align='C', ln=1)
        self.ln(10)

    def chapter_title(self, title):
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, title, ln=1)
        self.ln(5)

    def chapter_body(self, body):
        self.set_font('Arial', '', 12)
        self.multi_cell(0, 10, body)
        self.ln()

    def add_image(self, image_path, title):
        self.add_page()
        self.chapter_title(title)
        self.image(image_path, x=30, y=50, w=150)
        self.ln(10)

# Initialize PDF
pdf = PDFReport()
pdf.set_auto_page_break(auto=True, margin=15)

# Add EDA Insights
pdf.add_page()
pdf.chapter_title("Exploratory Data Analysis (EDA) Insights")
pdf.chapter_body(
    """Key Insights:
    1. Most customers are located in the Asia region, followed by Europe.
    2. The most purchased product is Product X with Y units sold.
    3. Sales show consistent growth over time, indicating a growing customer base.
    4. Region Z has the highest average transaction value.
    5. Category A generates the highest revenue among all product categories.
    """
)

# Add Visualizations
pdf.add_image("customers_by_region.png", "Number of Customers by Region")
pdf.add_image("top_products.png", "Top 5 Most Purchased Products")
pdf.add_image("sales_over_time.png", "Total Sales Over Time")
pdf.add_image("avg_transaction_value.png", "Average Transaction Value by Region")
pdf.add_image("category_revenue.png", "Revenue by Product Category")

# Save the PDF
pdf.output("FirstName_LastName_EDA_Report.pdf")

print("PDF report 'FirstName_LastName_EDA_Report.pdf' generated successfully!")



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=region_counts.index, y=region_counts.values, palette="viridis")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_products.values, y=top_products.index, palette="mako")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=avg_transaction_value.index, y=avg_transaction_value.values, palette="coolwarm")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=category_revenue.values, y=category_revenue.index, palette="rock

PDF report 'FirstName_LastName_EDA_Report.pdf' generated successfully!
