In [154]:
from fpdf import FPDF

# Start the document: enable automatic page breaks with a bottom margin of 15
# (in the document's unit) and open the first page so content can be placed.
pdf = FPDF()
pdf.set_auto_page_break(True, margin=15)
pdf.add_page()

# Report title: Arial, style "B", size 17, centred via align="C".
# multi_cell is used so the long title can wrap onto several lines.
pdf.set_font("Arial", style="B", size=17)
pdf.multi_cell(
    0,
    10,
    "Detecting Fraudulent Transactions Using Machine Learning: "
    "A Supervised Learning Approach",
    align="C",
)
pdf.ln(5)  # vertical gap between the title and the first section

# Report body, in print order: each entry is a (heading, body text) pair.
# Multi-line bodies (bullet lists, the metrics table) are assembled from one
# string per output line and joined with "\n".
sections = [
    (
        "1. Problem Statement",
        "Fraudulent transactions cost businesses billions annually. "
        "The goal of this project is to build a machine learning model that "
        "can accurately classify whether a transaction is fraudulent based "
        "on behavioral and transactional features.",
    ),
    (
        "2. Dataset Overview",
        "The dataset includes 150,000 e-commerce transactions with features such "
        "as user Id, sign up date, transaction date and time, transaction amount, "
        "device Id, store, browser, sex, age and IP location. "
        "The target variable is a binary indicator of whether the transaction was "
        "fraudulent.",
    ),
    (
        "3. Tools and Techniques Used",
        "\n".join((
            "- Languages: Python",
            "- Libraries: pandas, NumPy, scikit-learn, XGBoost, matplotlib, seaborn, joblib, geoip2, shap",
            "- Environment: Jupyter Notebook, Anaconda",
            "- Deployment: Flask API and AWS EC2 instance",
        )),
    ),
    (
        "4. Data Preprocessing",
        "\n".join((
            "- Mapping IP addresses to Countries using GeoLite2 IP geolocation database",
            "- No missing values in the dataset",
            "- Categorical variables encoded (one-hot and frequency encoding)",
            "- Feature scaling using StandardScaler",
            "- Outlier checking",
        )),
    ),
    (
        "5. Exploratory Data Analysis",
        "\n".join((
            "- Target variable distribution (The dataset is imbalanced)",
            "- No significant difference in amount variability between Fraudulent and legitimate transactions",
            "- Fraudulent transactions are concentrated in specific countries",
            "- Fraud rates vary across browsers and store categories",
            "- Correlation analysis helped detecting multicollinearity",
        )),
    ),
    (
        "6. Model Building",
        "\n".join((
            "- Baseline Models: Logistic Regression",
            "- Advanced Models: Random Forest, XGBoost",
            "- Deep Learning: Autoencoder (unsupervised anomaly detection)",
            "- Used GridSearchCV for hyperparameter tuning and StratifiedKFold for evaluation",
        )),
    ),
    (
        "7. Evaluation Metrics",
        # Plain-text table; the column padding is hand-written, so the row
        # strings must be kept exactly as they are.
        "\n".join((
            "Model                  | Precision | Recall | AUC-ROC",
            "-----------------------|--------------|----------|---------",
            "Logistic Reg.       | 0.00         | 0.00    | 0.51",
            "Random Forest   | 1.00         | 0.53    | 0.77",
            "XGBoost             | 0.98          | 0.53    | 0.77",
            "Autoencoder       | 0.14          | 0.08    | 0.63",
            "",  # blank line between the table and the summary sentence
            "Random Forest and XGBoost performed best overall. Autoencoder was useful as an anomaly detector but had lower performance.",
        )),
    ),
    (
        "8. Key Takeaways",
        "\n".join((
            "- XGBoost and Random Forest provided the highest fraud detection rate with minimal false positives",
            "- Feature importance analysis helped explain model behavior to stakeholders",
            "- SHAP values gave more interpretable, instance-level explanations",
            "- Demonstrated feasibility of deploying ML for real-time fraud scoring",
        )),
    ),
    # Heading only: the body is left empty here.
    ("9. Resources", ""),
]
# Write the sections in order. Each heading goes on its own line in Arial
# style "B", size 14; the body follows in regular Arial, size 12, through
# multi_cell so long text wraps, with a small gap before the next section.
for heading, body in sections:
    # Heading (ln=True moves to the next line after the cell).
    pdf.set_font("Arial", "B", 14)
    pdf.cell(0, 10, heading, ln=True)

    # Body text.
    pdf.set_font("Arial", "", 12)
    pdf.multi_cell(0, 8, body)
    pdf.ln(2)
# Resource links: text colour (0, 0, 255) and font style 'U', one labelled
# cell per line, each carrying its target URL in the `link` argument.
pdf.set_text_color(0, 0, 255)
pdf.set_font("Arial", 'U', 12)
for link_label, link_url in (
    ("GitHub Repo", "https://github.com/Sahnoun-A/Fraud-Prevention-ML-Project"),
    ("Kaggle Notebook", "https://www.kaggle.com/code/abdelkabirsahnoun/fraud-prevention"),
    ("Flask API Demo", "http://ec2-3-17-9-133.us-east-2.compute.amazonaws.com:8080/"),
):
    pdf.cell(0, 10, link_label, ln=1, link=link_url)

# Switch back to black, regular text so the link styling does not carry over.
pdf.set_text_color(0, 0, 0)
pdf.set_font("Arial", '', 12)

# Write the finished document to disk.
pdf_path = "fraud_detection_project_summary.pdf"
pdf.output(pdf_path)

# README.md body that mirrors the PDF summary, section by section.
# The three bullets under "9. Resources" start with emoji. They were
# previously stored as mojibake (UTF-8 bytes decoded as Mac Roman) and are
# restored here to the intended characters: card file box, blue book, and
# globe with meridians.
readme_content = """# Detecting Fraudulent Transactions Using Machine Learning

## 1. Problem Statement
Fraudulent transactions cost businesses billions annually. The goal of this project is to build a machine learning model that can accurately classify whether a transaction is fraudulent based on behavioral and transactional features.

## 2. Dataset Overview
The dataset includes 150,000 e-commerce transactions with features such as user Id, sign up date, transaction date and time, transaction amount, device Id, store, browser, sex, age and IP location. The target variable is a binary indicator of whether the transaction was fraudulent.

## 3. Tools and Techniques Used
- **Languages:** Python
- **Libraries:** pandas, NumPy, scikit-learn, XGBoost, matplotlib, seaborn, joblib, geoip2, shap
- **Environment:** Jupyter Notebook, Anaconda
- **Deployment:** Flask API and AWS EC2 instance

## 4. Data Preprocessing
- Mapped IP addresses to Countries using GeoLite2
- Confirmed no missing values
- Encoded categorical variables (one-hot and frequency encoding)
- Scaled numerical features using StandardScaler
- Outlier detection performed

## 5. Exploratory Data Analysis
- Dataset is imbalanced (few fraudulent cases)
- Fraud concentrated in specific countries
- Some browsers and store categories had higher fraud rates
- Correlation matrix used to detect multicollinearity

## 6. Model Building
- **Baseline Models:** Logistic Regression
- **Advanced Models:** Random Forest, XGBoost
- **Deep Learning:** Autoencoder
- Used GridSearchCV and StratifiedKFold

## 7. Evaluation Metrics

| Model          | Precision | Recall | AUC-ROC |
|----------------|-----------|--------|---------|
| Logistic Reg.  | 0.00      | 0.00   | 0.51    |
| Random Forest  | 1.00      | 0.53   | 0.77    |
| XGBoost        | 0.98      | 0.53   | 0.77    |
| Autoencoder    | 0.14      | 0.08   | 0.63    |

## 8. Key Takeaways
- XGBoost and Random Forest had the best fraud detection performance
- Feature importance analysis helped explain model behavior to stakeholders
- SHAP values used for interpretability
- Demonstrated real-time scoring feasibility via Flask

## 9. Resources
- 🗃 [**GitHub Repo**](https://github.com/Sahnoun-A/Fraud-Prevention-ML-Project)
- 📘 [**Kaggle Notebook**](https://www.kaggle.com/code/abdelkabirsahnoun/fraud-prevention)
- 🌐 [**Flask API Demo**](http://ec2-3-17-9-133.us-east-2.compute.amazonaws.com:8080/)
"""

# Write the README text to disk, encoded as UTF-8.
readme_path = "README.md"
with open(readme_path, mode="w", encoding="utf-8") as readme_file:
    readme_file.write(readme_content)

# Last expression of the cell: the two output file paths.
pdf_path, readme_path


('fraud_detection_project_summary.pdf', 'README.md')