Install Required Libraries

In [None]:
pip install pandas matplotlib seaborn fpdf

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=4fa5b6b75bb980095cca4125cb3b2ed5b341c006ac4f780d515ce75a603b96d1
  Stored in directory: /root/.cache/pip/wheels/f9/95/ba/f418094659025eb9611f17cbcaf2334236bf39a0c3453ea455
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


Import Required Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from scipy import stats
from fpdf import FPDF
from mpl_toolkits.mplot3d import Axes3D  # Import for 3D plotting

Load the Data

In [None]:
import pandas as pd

# Read the customer records from the CSV in the working directory,
# then confirm the load and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='mall_customers.csv')
print("Data loaded successfully!", data.head(), sep="\n")


Data loaded successfully!
   CustomerID  Gender  Age  Annual Income (k$)  Spending Score (1-100)
0           1    Male   19                  15                      39
1           2    Male   21                  15                      81
2           3  Female   20                  16                       6
3           4  Female   23                  16                      77
4           5  Female   31                  17                      40


Data Cleaning

In [None]:
# Tally the null entries in every column before any cleaning is applied
print("\nMissing values in each column:", data.isna().sum(), sep="\n")


Missing values in each column:
CustomerID                0
Gender                    0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64


In [None]:
# Discard every row holding at least one missing value, then re-check the per-column counts
data.dropna(axis='index', how='any', inplace=True)
print("\nMissing values after cleaning:", data.isna().sum(), sep="\n")


Missing values after cleaning:
CustomerID                0
Gender                    0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64


In [None]:
# Count rows that repeat an earlier row exactly, report the total, then keep only first occurrences
duplicates = data.duplicated(keep='first').sum()
print(f"\nNumber of duplicate rows: {duplicates}")
data.drop_duplicates(keep='first', inplace=True)



Number of duplicate rows: 0


In [None]:
# Store Gender as a categorical column instead of free-form labels
data['Gender'] = pd.Categorical(data['Gender'])

In [None]:
# Convert 'Gender' to numeric values
gender_codes = {'Male': 1, 'Female': 0}
# Only encode while raw labels are still present. Without this guard, running the cell a
# second time would look up the already-encoded 0/1 values in the mapping, find nothing,
# and silently replace the whole column with NaN.
if data['Gender'].isin(list(gender_codes)).any():
    data['Gender'] = data['Gender'].map(gender_codes)

In [None]:
# IQR-based outlier filter, applied below to Age and Annual Income
def remove_outliers(df, column):
    """Return the rows of `df` whose `column` value lies within the 1.5 * IQR fences.

    Args:
        df: pandas DataFrame to filter; it is not modified.
        column: name of the numeric column to test.

    Returns:
        A DataFrame containing only the rows where
        Q1 - 1.5 * IQR <= value <= Q3 + 1.5 * IQR (both ends inclusive),
        with the original index labels preserved.
    """
    values = df[column]
    first_quartile, third_quartile = values.quantile(0.25), values.quantile(0.75)
    fence_width = 1.5 * (third_quartile - first_quartile)
    # Series.between is inclusive on both ends by default, matching >= / <= fences.
    within_fences = values.between(first_quartile - fence_width, third_quartile + fence_width)
    return df[within_fences]

# Filter each column exactly once and report how many rows that pass dropped.
# Running an extra, unreported pass first would remove the outliers before they
# are counted, so the messages below would always claim 0 rows were removed.
original_size = len(data)
data = remove_outliers(data, 'Age')
age_outliers_removed = original_size - len(data)
print(f"Removed {age_outliers_removed} outliers from Age.")

# Income fences are computed on the frame that has already been filtered by Age.
original_size = len(data)
data = remove_outliers(data, 'Annual Income (k$)')
income_outliers_removed = original_size - len(data)
print(f"Removed {income_outliers_removed} outliers from Annual Income.")


Removed 0 outliers from Age.
Removed 0 outliers from Annual Income.


In [None]:
# Display basic information about the cleaned dataset
print("\nDataset Info after cleaning:")
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
data.info()


Dataset Info after cleaning:
<class 'pandas.core.frame.DataFrame'>
Index: 198 entries, 0 to 197
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   CustomerID              198 non-null    int64   
 1   Gender                  198 non-null    category
 2   Age                     198 non-null    int64   
 3   Annual Income (k$)      198 non-null    int64   
 4   Spending Score (1-100)  198 non-null    int64   
dtypes: category(1), int64(4)
memory usage: 8.0 KB
None


Statistical Analysis

In [None]:
def perform_statistical_analysis(data):
    """Compute the summary statistics that feed the PDF report.

    Args:
        data: pandas DataFrame with 'Age', 'Annual Income (k$)' and
            'Spending Score (1-100)' columns.

    Returns:
        dict mapping a human-readable label to a number, in this order:
        mean and standard deviation of Age; t-statistic and p-value of a
        one-sample t-test of the spending score against a mean of 50; the
        correlation between annual income and spending score; and the slope
        and intercept of a linear regression of spending score on income.
    """
    ages = data['Age']
    spending = data['Spending Score (1-100)']

    # One-sample t-test: does the mean spending score differ from 50?
    t_statistic, p_value = stats.ttest_1samp(spending, popmean=50)

    # Association (not causation) between income and spending.
    income_spending_corr = data['Annual Income (k$)'].corr(spending)

    # Single-feature regression; the feature must stay 2-D, hence the double brackets.
    regression = LinearRegression()
    regression.fit(data[['Annual Income (k$)']], spending)

    return {
        'Mean of Age': ages.mean(),
        'Standard Deviation of Age': ages.std(),
        'T-Test t-statistic': t_statistic,
        'T-Test p-value': p_value,
        'Correlation between Annual Income and Spending Score': income_spending_corr,
        'Linear Regression Coefficient': regression.coef_[0],
        'Linear Regression Intercept': regression.intercept_,
    }

Visualizations

In [None]:
def _save_and_close(filename):
    """Write the current matplotlib figure to `filename` and release it."""
    plt.savefig(filename)
    plt.close()


def create_visualizations(data):
    """Render the exploratory charts for the customer dataset and save each as a PNG.

    The frame may use either the raw CSV headers ('Age', 'Gender',
    'Annual Income (k$)', 'Spending Score (1-100)') or the short names
    ('age', 'gender', 'annual_income', 'spending_score'). An 'age_group'
    column is used when present; otherwise it is derived from age.
    The caller's frame is never modified.

    Args:
        data: pandas DataFrame of customer records.

    Returns:
        None. Ten PNG files are written to the current working directory.
    """
    # Map the raw CSV headers onto the short names the plotting calls use.
    # A header is only renamed when the short name is not already taken.
    raw_to_short = {
        'Age': 'age',
        'Gender': 'gender',
        'Annual Income (k$)': 'annual_income',
        'Spending Score (1-100)': 'spending_score',
    }
    renames = {
        raw: short
        for raw, short in raw_to_short.items()
        if raw in data.columns and short not in data.columns
    }
    # rename() returns a new frame, so the column added below stays local.
    plot_data = data.rename(columns=renames)

    if 'age_group' in plot_data.columns:
        plot_data['age_group'] = plot_data['age_group'].astype('category')
    else:
        # Derive age bands so the grouped charts work on the raw dataset too.
        plot_data['age_group'] = pd.cut(
            plot_data['age'],
            bins=[0, 25, 35, 45, 55, 65, float('inf')],
            labels=['Under 25', '25-34', '35-44', '45-54', '55-64', '65+'],
            right=False,
        )

    # Distribution of Age
    plt.figure(figsize=(10, 6))
    sns.histplot(plot_data['age'], bins=30, kde=True)
    plt.title('Age Distribution')
    plt.xlabel('Age')
    plt.ylabel('Frequency')
    _save_and_close('age_distribution.png')

    # Boxplot of Spending Score by Age Group
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='age_group', y='spending_score', data=plot_data)
    plt.title('Boxplot of Spending Score by Age Group')
    plt.xlabel('Age Group')
    plt.ylabel('Spending Score')
    _save_and_close('boxplot_spending_score.png')

    # Scatter Plot: Annual Income vs. Spending Score
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x='annual_income', y='spending_score', data=plot_data)
    plt.title('Scatter Plot: Annual Income vs. Spending Score')
    plt.xlabel('Annual Income')
    plt.ylabel('Spending Score')
    _save_and_close('income_vs_spending_score.png')

    # Gender Distribution
    plt.figure(figsize=(10, 6))
    sns.countplot(x='gender', data=plot_data)
    plt.title('Gender Distribution')
    plt.xlabel('Gender')
    plt.ylabel('Count')
    _save_and_close('gender_distribution.png')

    # Count of Customers by Age Group
    plt.figure(figsize=(10, 6))
    sns.countplot(x='age_group', data=plot_data)
    plt.title('Count of Customers by Age Group')
    plt.xlabel('Age Group')
    plt.ylabel('Count')
    _save_and_close('age_group_counts.png')

    # Boxplot of Annual Income by Gender
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='gender', y='annual_income', data=plot_data)
    plt.title('Boxplot of Annual Income by Gender')
    plt.xlabel('Gender')
    plt.ylabel('Annual Income')
    _save_and_close('income_by_gender.png')

    # Correlation Matrix Heatmap
    plt.figure(figsize=(10, 6))
    # Restrict to numeric columns: categorical/text columns (gender, age_group)
    # have no correlation coefficient and make corr() raise in current pandas.
    corr = plot_data.select_dtypes(include='number').corr()
    sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm')
    plt.title('Correlation Matrix Heatmap')
    _save_and_close('correlation_matrix_heatmap.png')

    # Pair Plot of Key Features (pairplot creates its own figure, which becomes current)
    sns.pairplot(plot_data[['annual_income', 'spending_score', 'age']])
    _save_and_close('pairplot.png')

    # Customer Segmentation (2D)
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x='annual_income', y='spending_score', hue='age_group', data=plot_data, palette='deep')
    plt.title('Customer Segmentation (2D)')
    plt.xlabel('Annual Income')
    plt.ylabel('Spending Score')
    _save_and_close('customer_segmentation_2D.png')

    # Customer Segmentation (3D), coloured by the integer code of each age group
    fig = plt.figure(figsize=(10, 6))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(
        plot_data['annual_income'],
        plot_data['spending_score'],
        plot_data['age'],
        c=plot_data['age_group'].cat.codes,
        cmap='viridis',
    )
    ax.set_xlabel('Annual Income')
    ax.set_ylabel('Spending Score')
    ax.set_zlabel('Age')
    plt.title('Customer Segmentation (3D)')
    _save_and_close('customer_segmentation_3D.png')


Insights and Conclusions

In [None]:
# Headline figures computed from the cleaned dataset
insights = {
    "Gender Distribution": data['Gender'].value_counts(),
    "Average Age": data['Age'].mean(),
    "Average Income": data['Annual Income (k$)'].mean(),
    "Average Spending Score": data['Spending Score (1-100)'].mean(),
}

# No cell visible in this notebook assigns a 'Cluster' column, so reading it
# unconditionally raises KeyError on a fresh kernel. Report cluster sizes only
# when a clustering step has actually produced the column.
if 'Cluster' in data.columns:
    insights["Cluster Sizes"] = data['Cluster'].value_counts()
else:
    print("Warning: no 'Cluster' column found; run the clustering step to include cluster sizes.")

print("\nInsights:")
for key, value in insights.items():
    print(f"{key}:\n{value}\n")



Insights:
Gender Distribution:
Gender
Female    112
Male       86
Name: count, dtype: int64

Average Age:
38.92929292929293

Average Income:
59.78787878787879

Average Spending Score:
50.196969696969695

Cluster Sizes:
Cluster
0    65
2    57
1    39
3    37
Name: count, dtype: int64



In [None]:
import os
from fpdf import FPDF
from google.colab import files

def create_pdf_report(statistical_summary, output_path="customer_segmentation_statistical_analysis_report.pdf", insights=None):
    """Assemble the analysis report as a PDF and write it to disk.

    The report contains a title, a short introduction, every chart image that
    exists in the working directory, the statistical results, and a list of
    insight statements.

    Args:
        statistical_summary: dict of label -> value. Python int/float values
            (including subclasses) are printed with two decimals; anything
            else is printed with its default string form.
        output_path: file path the PDF is written to.
        insights: optional list of strings for the "Insights and Conclusions"
            section. When omitted, the built-in statements below are used.

    Returns:
        None. Progress, skipped images and save errors are reported via print().
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()

    # Title
    pdf.set_font("Arial", 'B', 16)
    pdf.cell(0, 10, 'Customer Segmentation Statistical Analysis Report', ln=True, align='C')

    # Section: Introduction
    pdf.set_font("Arial", size=12)
    pdf.multi_cell(0, 10, "This report presents the findings from the statistical analysis performed on the customer segmentation dataset.")

    # Section: Visualizations
    pdf.set_font("Arial", 'B', 14)
    pdf.cell(0, 10, 'Visualizations', ln=True)

    # Add each visualization to the PDF only if it exists
    visualizations = [
        ('Distribution of Age', 'age_distribution.png'),
        ('Boxplot of Spending Score by Age', 'boxplot_spending_score.png'),
        ('Scatter Plot: Annual Income vs. Spending Score', 'income_vs_spending_score.png'),
        ('Gender Distribution', 'gender_distribution.png'),
        ('Count of Customers by Age Group', 'age_group_counts.png'),
        ('Boxplot of Annual Income by Gender', 'income_by_gender.png'),
        ('Correlation Matrix Heatmap', 'correlation_matrix_heatmap.png'),
        ('Pair Plot of Key Features', 'pairplot.png'),
        ('Customer Segmentation (2D)', 'customer_segmentation_2D.png'),
        ('Customer Segmentation (3D)', 'customer_segmentation_3D.png'),
    ]

    pdf.set_font("Arial", size=12)
    for title, img_path in visualizations:
        if os.path.exists(img_path):
            pdf.cell(0, 10, title, ln=True)
            pdf.image(img_path, x=30, w=150)  # Adjust x and w as needed
        else:
            print(f"Warning: {img_path} does not exist and will be skipped.")

    # Section: Statistical Analysis
    pdf.set_font("Arial", 'B', 14)
    pdf.cell(0, 10, 'Statistical Analysis Results', ln=True)

    pdf.set_font("Arial", size=12)
    for key, value in statistical_summary.items():
        pdf.multi_cell(0, 10, f"{key}: {value:.2f}" if isinstance(value, (int, float)) else f"{key}: {value}")

    # Section: Insights and Conclusions
    pdf.set_font("Arial", 'B', 14)
    pdf.cell(0, 10, 'Insights and Conclusions', ln=True)

    if insights is None:
        # Built-in statements. They are fixed text, not derived from the data,
        # so they must be kept in sync with the notebook's results by hand.
        # Statement 5 says "four" to match the four clusters in the notebook's
        # recorded cluster-size output.
        insights = [
            "1. The customer age distribution shows that the majority of customers fall within the age groups of 25-35 and 35-45.",
            "2. The spending score distribution indicates that a significant portion of customers has a high spending score (above 70).",
            "3. There is a positive correlation between annual income and spending score.",
            "4. The gender distribution shows a fairly balanced ratio between male and female customers.",
            "5. The KMeans clustering has successfully identified four distinct customer segments.",
        ]

    pdf.set_font("Arial", size=12)
    for insight in insights:
        pdf.multi_cell(0, 10, insight)

    # Save the PDF
    try:
        pdf.output(output_path)
        if os.path.exists(output_path):
            print(f"PDF report saved successfully as: {output_path}")
        else:
            print("Error: PDF report was not saved.")
    except Exception as e:
        # Best-effort save: report the failure and let the caller check for the file.
        print(f"An error occurred while saving the PDF: {e}")

# Build the report from the statistics, then offer it for download if it was written
report_file = "customer_segmentation_statistical_analysis_report.pdf"

statistical_summary = perform_statistical_analysis(data)
create_pdf_report(statistical_summary, output_path=report_file)  # Create the PDF

if not os.path.exists(report_file):
    print("PDF was not created.")
else:
    print("PDF was created successfully.")
    # Attempt to download the PDF
    try:
        files.download(report_file)
    except FileNotFoundError as e:
        print(f"Download failed: {e}")


PDF report saved successfully as: customer_segmentation_statistical_analysis_report.pdf
PDF was created successfully.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>