In [1]:
from fpdf import FPDF

# Initialize PDF
pdf = FPDF()
pdf.set_auto_page_break(auto=True, margin=15)
pdf.add_page()

# Title
pdf.set_font("Arial", "B", 17)
pdf.multi_cell(0, 10, "Customer Segmentation And Predicting Behavior", align="C")
pdf.ln(5)

def clean_text(text):
    return (
        text.replace("“", '"')
            .replace("”", '"')
            .replace("’", "'")
            .replace("–", "-")
            .replace("—", "-")
            .replace("•", "-")
            .replace("…", "...")
            .replace("•", "-")
    )

# Define structured content for each section
sections = [
    ("1. Business Situation",
     "CRISA is an Asian market research agency that specializes in tracking consumer purchase behavior in consumer goods (both durable and nondurable). In one major research project, CRISA tracks numerous consumer product categories (e.g., “detergents”), and, within each category, perhaps dozens of brands. To track purchase behavior, CRISA constituted household panels in over 100 cities and towns in India, covering most of the Indian urban market. The households were carefully selected using stratified sampling to ensure a representative sample; a subset of 600 records is analyzed here. The strata were defined on the basis of socioeconomic status and the market (a collection of cities)."),
    
    ("2. Key Problems and Objective",
     "CRISA has traditionally segmented markets on the basis of purchaser demographics. They would now like to segment the market based on two key sets of variables more directly related to the purchase process and to brand loyalty:\n1- Purchase behavior (volume, frequency, susceptibility to discounts, and brand loyalty)\n2- Basis of purchase (price, selling proposition)\nDoing so would allow CRISA to gain information about what demographic attributes are associated with different purchase behaviors and degrees of brand loyalty, and thus deploy promotion budgets more effectively."),

    ("3. Dataset Overview",
     "CRISA has both transaction data (each row is a transaction) and household data (each row is a household), and for the household data it maintains the following information:\n- Demographics of the households (updated annually)\n- Possession of durable goods (car, washing machine, etc., updated annually).\n- an “affluence index” is computed from this information)\n- Purchase data of product categories and brands (updated monthly)"),
    
    ("4. Tools and Techniques Used",
     "- Languages: Python\n- Libraries: pandas, NumPy, scikit-learn, XGBoost, matplotlib, seaborn, joblib\n- Environment: Jupyter Notebook\n- Deployment: Flask API for real-time customer input and prediction"),
    
    ("5. Data Preprocessing",
     "- Categorical variables encoded using one-hot and label encoding\n- Numerical features standardized using StandardScaler\n- Train-test split performed with stratification to preserve segment distribution"),
    
    ("6. Exploratory Data Analysis",
     "- Cluster profiling showed distinct differences in Promo Usage, Purchase Volume, and Loyalty\n- Visualizations helped validate cluster labeling"),
    
    ("7. Model Building",
     "- K-means clustering to identify the ideal number of clusters and assign label to each cluster\n- Interpret segments based on brand loyalty, and purchase behavior\n- Clustering evaluation using silhouette scores and visualization (2D plots using PCA)\n- Supervised classification based on only demographic inputs to classify new or households, using Logistic Regression, Random Forest, and XGBoost\n- Classification evaluation using accuracy, precision, recall, and confusion matrix\n- Feature importance used for explainability"),
    
    ("8. Evaluation Metrics",
     "Best model: XGBoost Classifier\nAccuracy: 0.57\nPrecision by Segment:\n- Loyalists: 0.49\n- Variety Seekers: 0.33\n- Promo Shoppers: 0.64"),
    
    ("9. Key Takeaways",
     "- XGBoost outperformed other models in overall accuracy and segment-specific precision\n- Feature importance revealed that Affluence Index, Food Eating Habits and number of children were key drivers\n- Flask app successfully deployed for real-time labeling of new households"),
    
    ("10. Resources", "")
]

# Add sections with formatting
for title, content in sections:
    pdf.set_font("Arial", "B", 14)
    pdf.cell(0, 10, title, ln=True)
    pdf.set_font("Arial", "", 12)
    pdf.multi_cell(0, 8, clean_text(content))
    pdf.ln(2)

# Add clickable links
pdf.set_font("Arial", "B", 12)
pdf.set_text_color(0, 0, 255)
pdf.cell(0, 10, "GitHub Repo", ln=1, link="https://github.com/Sahnoun-A/Customer-Clustering-And-Predicting-Behavior")
pdf.cell(0, 10, "Kaggle Notebook", ln=1, link="https://www.kaggle.com/code/abdelkabirsahnoun/customer-clustering-and-predicting-behavior")
pdf.cell(0, 10, "Flask App Demo", ln=1, link="http://3.149.15.43:8080")

# Reset text color
pdf.set_text_color(0, 0, 0)

# Save PDF
pdf_path = "predicting_customer_behavior_summary.pdf"
pdf.output(pdf_path)

# Generate README content
readme_content = """# Customer Segmentation And Predicting Behavior

## 1. Business Situation
CRISA is an Asian market research agency that specializes in tracking consumer purchase behavior
in consumer goods (both durable and nondurable). In one major research project, CRISA tracks
numerous consumer product categories (e.g., "detergents"), and, within each category, perhaps
dozens of brands. To track purchase behavior, CRISA constituted household panels in over 100
cities and towns in India, covering most of the Indian urban market. The households were carefully
selected using stratified sampling to ensure a representative sample; a subset of 600 records is
analyzed here. The strata were defined on the basis of socioeconomic status and the market (a
collection of cities).

## 2. Key Problems and Objective
CRISA has traditionally segmented markets on the basis of purchaser demographics. They would
now like to segment the market based on two key sets of variables more directly related to the
purchase process and to brand loyalty:
1- Purchase behavior (volume, frequency, susceptibility to discounts, and brand loyalty)
2- Basis of purchase (price, selling proposition)
Doing so would allow CRISA to gain information about what demographic attributes are associated
with different purchase behaviors and degrees of brand loyalty, and thus deploy promotion budgets
more effectively.

## 3. Dataset Overview
CRISA has both transaction data (each row is a transaction) and household data (each row is a
household), and for the household data it maintains the following information:
- Demographics of the households (updated annually)
- Possession of durable goods (car, washing machine, etc., updated annually).
- an "affluence index" is computed from this information)
- Purchase data of product categories and brands (updated monthly)

## 4. Tools and Techniques
- **Languages**: Python
- **Libraries**: pandas, NumPy, scikit-learn, XGBoost, matplotlib, seaborn
- **Environment**: Jupyter Notebook
- **Deployment**: Flask API for real-time predictions

## 5. Data Preprocessing
- Categorical variables encoded using one-hot and label encoding
- Numerical features standardized using StandardScaler
- Train-test split performed with stratification to preserve segment distribution

## 6. Exploratory Data Analysis
- Cluster profiling showed distinct differences in Promo Usage, Purchase Volume, and Loyalty
- Visualizations helped validate cluster labeling

## 7. Model Building
- K-means clustering to identify the ideal number of clusters and assign label to each cluster
- Interpret segments based on brand loyalty, and purchase behavior
- Clustering evaluation using silhouette scores and visualization (2D plots using PCA)
- Supervised classification based on only demographic inputs to classify new or households, using
Logistic Regression, Random Forest, and XGBoost
- Classification evaluation using accuracy, precision, recall, and confusion matrix
- Feature importance used for explainability

## 8. Evaluation Metrics

| Segment           | Precision |
|------------------|-----------|
| Loyalists        | 0.49      |
| Variety Seekers  | 0.33      |
| Promo Shoppers   | 0.64      |

**Overall Accuracy:** 0.57

## 9. Key Takeaways
- XGBoost outperformed other models in overall accuracy and segment-specific precision
- Feature importance revealed that Affluence Index, Food Eating Habits and number of children
were key drivers
- Flask app successfully deployed for real-time labeling of new households

## 10. Resources
- 🗃 [**GitHub Repo**](https://github.com/Sahnoun-A/Customer-Clustering-And-Predicting-Behavior)
- 📘 [**Kaggle Notebook**](https://www.kaggle.com/code/abdelkabirsahnoun/customer-clustering-and-predicting-behavior)
- 🌐 [**Flask App Demo**](http://3.149.15.43:8080)
"""

# Save README
readme_path = "README.md"
with open(readme_path, "w", encoding="utf-8") as f:
    f.write(readme_content)

pdf_path, readme_path


  pdf.set_font("Arial", "B", 17)
  pdf.set_font("Arial", "B", 14)
  pdf.cell(0, 10, title, ln=True)
  pdf.set_font("Arial", "", 12)
  pdf.set_font("Arial", "B", 12)
  pdf.cell(0, 10, "GitHub Repo", ln=1, link="https://github.com/Sahnoun-A/Customer-Clustering-And-Predicting-Behavior")
  pdf.cell(0, 10, "Kaggle Notebook", ln=1, link="https://www.kaggle.com/code/abdelkabirsahnoun/customer-clustering-and-predicting-behavior")
  pdf.cell(0, 10, "Flask App Demo", ln=1, link="http://3.149.15.43:8080")


('predicting_customer_behavior_summary.pdf', 'README.md')