## Import Libraries

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import os

print("✓ Libraries imported successfully!")

✓ Libraries imported successfully!


## Load all Results

In [2]:
# Load evaluation results
evaluation_results = pd.read_csv('results/evaluation_results.csv')

# Load training results
training_results = pd.read_csv('results/training_results.csv')

print("✓ All results loaded!")
print(f"\nModels evaluated: {len(evaluation_results)}")

✓ All results loaded!

Models evaluated: 3


## Generate main project report

In [3]:
print("=" * 70)
print("GENERATING MAIN PROJECT REPORT")
print("=" * 70)

# Create report
report_path = 'results/PROJECT_REPORT.txt'

with open(report_path, 'w', encoding='utf-8') as f:
    # Header
    f.write("=" * 80 + "\n")
    f.write("DEVELOPMENT OF AN INTELLIGENT SYSTEM FOR DIABETES CLASSIFICATION\n")
    f.write("USING MACHINE LEARNING: A COMPARATIVE ANALYSIS\n")
    f.write("=" * 80 + "\n")
    f.write(f"Report Generated: {datetime.now().strftime('%B %d, %Y at %I:%M %p')}\n")
    f.write(f"Student: UGOARU KINGSLEY NWACHUKWU (20201249402)\n")
    f.write(f"Department: Information Technology\n")
    f.write(f"Institution: Federal University of Technology, Owerri\n")
    f.write(f"Supervisor: Mrs Vivian Mbamala\n\n")
    
    # Executive Summary
    f.write("=" * 80 + "\n")
    f.write("EXECUTIVE SUMMARY\n")
    f.write("=" * 80 + "\n\n")
    f.write("This project developed and compared three machine learning algorithms for\n")
    f.write("diabetes classification: Naïve Bayes, Support Vector Machine (SVM), and\n")
    f.write("Decision Tree. The system was trained and evaluated on the Pima Indians\n")
    f.write("Diabetes Database containing 768 patient records.\n\n")
    
    best_idx = evaluation_results['F1-Score'].idxmax()
    best_model = evaluation_results.loc[best_idx, 'Model']
    best_acc = evaluation_results.loc[best_idx, 'Accuracy']
    best_f1 = evaluation_results.loc[best_idx, 'F1-Score']
    
    f.write(f"The {best_model} model achieved the best overall performance with an\n")
    f.write(f"accuracy of {best_acc*100:.2f}% and F1-Score of {best_f1:.4f}, making it the most\n")
    f.write(f"suitable for diabetes classification in clinical settings.\n\n")
    
    # Section 1: Dataset Information
    f.write("=" * 80 + "\n")
    f.write("SECTION 1: DATASET INFORMATION\n")
    f.write("=" * 80 + "\n\n")
    f.write("Dataset Name: Pima Indians Diabetes Database\n")
    f.write("Source: UCI Machine Learning Repository\n")
    f.write("Total Records: 768 female patients\n")
    f.write("Age Range: 21 years and above\n")
    f.write("Features: 8 clinical measurements\n")
    f.write("Target Variable: Outcome (0=Non-Diabetic, 1=Diabetic)\n\n")
    
    f.write("Feature Description:\n")
    f.write("  1. Pregnancies: Number of times pregnant\n")
    f.write("  2. Glucose: Plasma glucose concentration (mg/dL)\n")
    f.write("  3. BloodPressure: Diastolic blood pressure (mm Hg)\n")
    f.write("  4. SkinThickness: Triceps skin fold thickness (mm)\n")
    f.write("  5. Insulin: 2-Hour serum insulin (mu U/ml)\n")
    f.write("  6. BMI: Body mass index (weight in kg/(height in m)^2)\n")
    f.write("  7. DiabetesPedigreeFunction: Diabetes pedigree function\n")
    f.write("  8. Age: Age in years\n\n")
    
    f.write("Class Distribution:\n")
    f.write("  Non-Diabetic (0): 500 patients (65.1%)\n")
    f.write("  Diabetic (1): 268 patients (34.9%)\n\n")
    
    # Section 2: Data Preprocessing
    f.write("=" * 80 + "\n")
    f.write("SECTION 2: DATA PREPROCESSING METHODOLOGY\n")
    f.write("=" * 80 + "\n\n")
    
    f.write("2.1 HANDLING MISSING VALUES\n")
    f.write("-" * 80 + "\n")
    f.write("Several features contained zero values that were medically impossible:\n")
    f.write("  • Glucose: 5 zero values (0.65%)\n")
    f.write("  • BloodPressure: 35 zero values (4.56%)\n")
    f.write("  • SkinThickness: 227 zero values (29.56%)\n")
    f.write("  • Insulin: 374 zero values (48.70%)\n")
    f.write("  • BMI: 11 zero values (1.43%)\n\n")
    
    f.write("Treatment:\n")
    f.write("  1. Zero values were replaced with NaN (Not a Number)\n")
    f.write("  2. Missing values were imputed using median values\n")
    f.write("  3. Median imputation was chosen for robustness against outliers\n\n")
    
    f.write("2.2 DATA SPLITTING\n")
    f.write("-" * 80 + "\n")
    f.write("  Training Set: 614 samples (80%)\n")
    f.write("  Testing Set: 154 samples (20%)\n")
    f.write("  Stratified split: Maintained class distribution in both sets\n")
    f.write("  Random State: 42 (for reproducibility)\n\n")
    
    f.write("2.3 FEATURE SCALING\n")
    f.write("-" * 80 + "\n")
    f.write("  Method: StandardScaler (Z-score normalization)\n")
    f.write("  Purpose: Standardize features to mean=0 and std=1\n")
    f.write("  Result: All features transformed to comparable scales\n")
    f.write("  This prevents features with larger ranges from dominating the models\n\n")
    
    # Section 3: Model Development
    f.write("=" * 80 + "\n")
    f.write("SECTION 3: MACHINE LEARNING MODELS DEVELOPED\n")
    f.write("=" * 80 + "\n\n")
    
    f.write("Three supervised learning algorithms were implemented and compared:\n\n")
    
    f.write("3.1 NAÏVE BAYES (GAUSSIAN)\n")
    f.write("-" * 80 + "\n")
    f.write("Algorithm Type: Probabilistic classifier\n")
    f.write("Assumption: Features are conditionally independent given the class\n")
    f.write("Mathematical Basis: Bayes' Theorem\n")
    f.write("Training Approach: No hyperparameter tuning required\n")
    f.write("Cross-Validation: 5-fold CV\n\n")
    
    nb_row = training_results[training_results['Model'] == 'Naive Bayes'].iloc[0]
    f.write(f"Training Results:\n")
    f.write(f"  Mean CV Accuracy: {nb_row['Mean CV Accuracy']:.4f}\n")
    f.write(f"  Training Time: {nb_row['Training Time (sec)']:.2f} seconds\n\n")
    
    f.write("Advantages:\n")
    f.write("  • Fast training and prediction\n")
    f.write("  • Works well with small datasets\n")
    f.write("  • Probabilistic interpretation\n")
    f.write("  • Low computational requirements\n\n")
    
    f.write("3.2 SUPPORT VECTOR MACHINE (SVM)\n")
    f.write("-" * 80 + "\n")
    f.write("Algorithm Type: Margin-based classifier\n")
    f.write("Objective: Find optimal hyperplane that maximizes margin between classes\n")
    f.write("Training Approach: Grid Search with Cross-Validation\n\n")
    
    svm_row = training_results[training_results['Model'] == 'SVM'].iloc[0]
    f.write(f"Hyperparameter Search Space:\n")
    f.write(f"  • C (Regularization): [0.1, 1, 10, 100]\n")
    f.write(f"  • Kernel: ['linear', 'rbf', 'poly']\n")
    f.write(f"  • Gamma: ['scale', 'auto', 0.001, 0.01]\n")
    f.write(f"  Total Combinations Tested: 48\n\n")
    
    f.write(f"Training Results:\n")
    f.write(f"  Best CV Accuracy: {svm_row['Mean CV Accuracy']:.4f}\n")
    f.write(f"  Training Time: {svm_row['Training Time (sec)']:.2f} seconds\n\n")
    
    f.write("Advantages:\n")
    f.write("  • Effective in high-dimensional spaces\n")
    f.write("  • Memory efficient (uses support vectors)\n")
    f.write("  • Versatile (different kernel functions)\n")
    f.write("  • Strong generalization capability\n\n")
    
    f.write("3.3 DECISION TREE\n")
    f.write("-" * 80 + "\n")
    f.write("Algorithm Type: Tree-based classifier\n")
    f.write("Method: Recursive binary splitting\n")
    f.write("Training Approach: Grid Search with Cross-Validation\n\n")
    
    dt_row = training_results[training_results['Model'] == 'Decision Tree'].iloc[0]
    f.write(f"Hyperparameter Search Space:\n")
    f.write(f"  • Max Depth: [3, 5, 7, 10, None]\n")
    f.write(f"  • Min Samples Split: [2, 5, 10, 20]\n")
    f.write(f"  • Min Samples Leaf: [1, 2, 4]\n")
    f.write(f"  • Criterion: ['gini', 'entropy']\n")
    f.write(f"  Total Combinations Tested: 160\n\n")
    
    f.write(f"Training Results:\n")
    f.write(f"  Best CV Accuracy: {dt_row['Mean CV Accuracy']:.4f}\n")
    f.write(f"  Training Time: {dt_row['Training Time (sec)']:.2f} seconds\n\n")
    
    f.write("Advantages:\n")
    f.write("  • Easy to understand and interpret\n")
    f.write("  • Requires little data preprocessing\n")
    f.write("  • Can handle non-linear relationships\n")
    f.write("  • Visual representation possible\n\n")
    
    # Section 4: Model Evaluation
    f.write("=" * 80 + "\n")
    f.write("SECTION 4: MODEL EVALUATION AND PERFORMANCE COMPARISON\n")
    f.write("=" * 80 + "\n\n")
    
    f.write("4.1 EVALUATION METHODOLOGY\n")
    f.write("-" * 80 + "\n")
    f.write("All models were evaluated on an independent test set (154 samples) that was\n")
    f.write("not used during training. The following metrics were calculated:\n\n")
    
    f.write("Metrics Used:\n")
    f.write("  • Accuracy: Overall correctness of predictions\n")
    f.write("  • Precision: Proportion of positive predictions that are correct\n")
    f.write("  • Recall: Proportion of actual positives that are identified\n")
    f.write("  • F1-Score: Harmonic mean of precision and recall\n\n")
    
    f.write("4.2 COMPARATIVE PERFORMANCE RESULTS\n")
    f.write("-" * 80 + "\n\n")
    
    # Performance table
    f.write("Model Performance Summary:\n")
    f.write("=" * 80 + "\n")
    f.write(f"{'Model':<20} {'Accuracy':<12} {'Precision':<12} {'Recall':<12} {'F1-Score':<12}\n")
    f.write("=" * 80 + "\n")
    
    for _, row in evaluation_results.iterrows():
        f.write(f"{row['Model']:<20} "
                f"{row['Accuracy']:<12.4f} "
                f"{row['Precision']:<12.4f} "
                f"{row['Recall']:<12.4f} "
                f"{row['F1-Score']:<12.4f}\n")
    f.write("=" * 80 + "\n\n")
    
    # Detailed analysis for each model
    f.write("4.3 DETAILED ANALYSIS BY MODEL\n")
    f.write("-" * 80 + "\n\n")
    
    for _, row in evaluation_results.iterrows():
        f.write(f"{row['Model'].upper()}\n")
        f.write("-" * 80 + "\n")
        f.write(f"Accuracy: {row['Accuracy']*100:.2f}%\n")
        f.write(f"Precision: {row['Precision']:.4f}\n")
        f.write(f"Recall: {row['Recall']:.4f}\n")
        f.write(f"F1-Score: {row['F1-Score']:.4f}\n\n")
        
        f.write(f"Confusion Matrix:\n")
        f.write(f"  True Negatives (TN):  {row['True Negatives']:3d} "
                f"(Correctly predicted Non-Diabetic)\n")
        f.write(f"  False Positives (FP): {row['False Positives']:3d} "
                f"(Incorrectly predicted Diabetic)\n")
        f.write(f"  False Negatives (FN): {row['False Negatives']:3d} "
                f"(Missed Diabetic cases)\n")
        f.write(f"  True Positives (TP):  {row['True Positives']:3d} "
                f"(Correctly predicted Diabetic)\n\n")
        
        # Clinical interpretation
        total_diabetic = row['True Positives'] + row['False Negatives']
        detection_rate = (row['True Positives'] / total_diabetic) * 100 if total_diabetic > 0 else 0
        
        f.write(f"Clinical Interpretation:\n")
        f.write(f"  • Detection Rate: {detection_rate:.1f}% of diabetic patients identified\n")
        f.write(f"  • Missed Cases: {row['False Negatives']} diabetic patients not detected\n")
        f.write(f"  • False Alarms: {row['False Positives']} healthy patients flagged\n\n")
    
    # Section 5: Best Model Selection
    f.write("=" * 80 + "\n")
    f.write("SECTION 5: BEST MODEL SELECTION AND JUSTIFICATION\n")
    f.write("=" * 80 + "\n\n")
    
    f.write(f"Selected Model: {best_model}\n")
    f.write("-" * 80 + "\n\n")
    
    best_row = evaluation_results.loc[best_idx]
    
    f.write(f"Performance Summary:\n")
    f.write(f"  Accuracy:  {best_row['Accuracy']*100:.2f}%\n")
    f.write(f"  Precision: {best_row['Precision']:.4f}\n")
    f.write(f"  Recall:    {best_row['Recall']:.4f}\n")
    f.write(f"  F1-Score:  {best_row['F1-Score']:.4f}\n\n")
    
    f.write(f"Selection Criteria:\n")
    f.write(f"The {best_model} was selected as the best model based on the F1-Score metric,\n")
    f.write(f"which provides the best balance between precision and recall. In medical\n")
    f.write(f"diagnosis applications, both false positives and false negatives have\n")
    f.write(f"significant consequences:\n\n")
    
    f.write(f"False Negatives (Missed Diabetic Cases): {best_row['False Negatives']}\n")
    f.write(f"  Risk: Patients remain undiagnosed and untreated, leading to potential\n")
    f.write(f"  complications such as kidney failure, vision loss, and cardiovascular issues.\n\n")
    
    f.write(f"False Positives (Incorrect Diabetic Diagnosis): {best_row['False Positives']}\n")
    f.write(f"  Risk: Unnecessary anxiety, additional testing, and potential treatment\n")
    f.write(f"  side effects for healthy individuals.\n\n")
    
    f.write(f"The {best_model} achieved the optimal trade-off between these two types of\n")
    f.write(f"errors, making it the most suitable for clinical deployment.\n\n")
    
    # Section 6: Conclusion
    f.write("=" * 80 + "\n")
    f.write("SECTION 6: CONCLUSION\n")
    f.write("=" * 80 + "\n\n")
    
    f.write("This comparative study successfully developed and evaluated three machine\n")
    f.write("learning algorithms for diabetes classification. Key findings include:\n\n")
    
    f.write("Key Achievements:\n")
    f.write(f"  ✓ Successfully preprocessed 768 patient records\n")
    f.write(f"  ✓ Trained and optimized three ML algorithms\n")
    f.write(f"  ✓ Achieved {best_row['Accuracy']*100:.2f}% accuracy with the best model\n")
    f.write(f"  ✓ Conducted comprehensive comparative analysis\n")
    f.write(f"  ✓ Generated detailed performance metrics and visualizations\n\n")
    
    f.write("Comparative Insights:\n")
    
    # Rank models
    ranked = evaluation_results.sort_values('F1-Score', ascending=False)
    f.write(f"  1st Place: {ranked.iloc[0]['Model']} "
            f"(F1-Score: {ranked.iloc[0]['F1-Score']:.4f})\n")
    f.write(f"  2nd Place: {ranked.iloc[1]['Model']} "
            f"(F1-Score: {ranked.iloc[1]['F1-Score']:.4f})\n")
    f.write(f"  3rd Place: {ranked.iloc[2]['Model']} "
            f"(F1-Score: {ranked.iloc[2]['F1-Score']:.4f})\n\n")
    
    f.write("Clinical Relevance:\n")
    f.write("  The developed system demonstrates the potential of machine learning in\n")
    f.write("  supporting early diabetes detection. With proper validation and integration\n")
    f.write("  into healthcare workflows, such systems can:\n")
    f.write("    • Reduce diagnostic delays\n")
    f.write("    • Support healthcare workers in resource-limited settings\n")
    f.write("    • Enable large-scale screening programs\n")
    f.write("    • Improve patient outcomes through early intervention\n\n")
    
    # Section 7: Recommendations
    f.write("=" * 80 + "\n")
    f.write("SECTION 7: RECOMMENDATIONS\n")
    f.write("=" * 80 + "\n\n")
    
    f.write("For Future Research:\n")
    f.write("  1. Collect and incorporate local Nigerian patient data for model retraining\n")
    f.write("  2. Explore ensemble methods combining multiple algorithms\n")
    f.write("  3. Investigate deep learning approaches for feature extraction\n")
    f.write("  4. Include temporal data (glucose monitoring over time)\n")
    f.write("  5. Conduct clinical validation studies with healthcare professionals\n\n")
    
    f.write("For Implementation:\n")
    f.write("  1. Develop user-friendly interface for healthcare workers\n")
    f.write("  2. Integrate with electronic health record systems\n")
    f.write("  3. Provide explainable predictions for clinical decision support\n")
    f.write("  4. Implement continuous model monitoring and updating\n")
    f.write("  5. Conduct pilot testing in selected healthcare facilities\n\n")
    
    f.write("For Healthcare Policy:\n")
    f.write("  1. Establish guidelines for AI-assisted diabetes screening\n")
    f.write("  2. Ensure data privacy and security compliance\n")
    f.write("  3. Train healthcare workers on ML system interpretation\n")
    f.write("  4. Develop quality assurance protocols\n")
    f.write("  5. Create regulatory framework for medical ML applications\n\n")
    
    # Section 8: Limitations
    f.write("=" * 80 + "\n")
    f.write("SECTION 8: LIMITATIONS OF THE STUDY\n")
    f.write("=" * 80 + "\n\n")
    
    f.write("1. Dataset Limitations:\n")
    f.write("   • Dataset based on Pima Indian population (may not generalize to Nigerians)\n")
    f.write("   • Limited sample size (768 records)\n")
    f.write("   • Class imbalance (65% non-diabetic, 35% diabetic)\n")
    f.write("   • Significant missing data in some features (especially Insulin)\n\n")
    
    f.write("2. Model Limitations:\n")
    f.write("   • Binary classification only (diabetic vs non-diabetic)\n")
    f.write("   • Does not distinguish between Type 1 and Type 2 diabetes\n")
    f.write("   • Static prediction (no temporal monitoring)\n")
    f.write("   • Limited to 8 clinical features\n\n")
    
    f.write("3. Validation Limitations:\n")
    f.write("   • No external validation on independent datasets\n")
    f.write("   • No clinical trial or real-world testing\n")
    f.write("   • No comparison with expert clinician diagnoses\n\n")
    
    f.write("4. Implementation Limitations:\n")
    f.write("   • Requires digital infrastructure\n")
    f.write("   • Dependent on data quality and completeness\n")
    f.write("   • May require periodic retraining\n\n")
    
    # References Section
    f.write("=" * 80 + "\n")
    f.write("SECTION 9: REFERENCES\n")
    f.write("=" * 80 + "\n\n")
    
    f.write("Dataset:\n")
    f.write("  Smith, J.W., Everhart, J.E., Dickson, W.C., Knowler, W.C., & Johannes, R.S.\n")
    f.write("  (1988). Using the ADAP learning algorithm to forecast the onset of diabetes\n")
    f.write("  mellitus. Proceedings of the Symposium on Computer Applications and Medical\n")
    f.write("  Care, IEEE Computer Society Press, 261-265.\n\n")
    
    f.write("Libraries and Tools:\n")
    f.write("  • Scikit-learn: Pedregosa et al. (2011). Journal of Machine Learning Research\n")
    f.write("  • Pandas: McKinney (2010). Data Structures for Statistical Computing in Python\n")
    f.write("  • NumPy: Harris et al. (2020). Nature\n")
    f.write("  • Matplotlib: Hunter (2007). Computing in Science & Engineering\n\n")
    
    # Appendices
    f.write("=" * 80 + "\n")
    f.write("APPENDICES\n")
    f.write("=" * 80 + "\n\n")
    
    f.write("Appendix A: Generated Visualizations\n")
    f.write("  • Confusion matrices for all models\n")
    f.write("  • Performance comparison charts\n")
    f.write("  • ROC curves\n")
    f.write("  • Performance dashboard\n\n")
    
    f.write("Appendix B: Model Files\n")
    f.write("  • Naive_Bayes.pkl\n")
    f.write("  • SVM.pkl\n")
    f.write("  • Decision_Tree.pkl\n")
    f.write("  • best_model.pkl\n")
    f.write("  • scaler.pkl\n\n")
    
    f.write("Appendix C: Data Files\n")
    f.write("  • X_train_scaled.csv\n")
    f.write("  • X_test_scaled.csv\n")
    f.write("  • y_train.csv\n")
    f.write("  • y_test.csv\n\n")
    
    # Footer
    f.write("=" * 80 + "\n")
    f.write("END OF REPORT\n")
    f.write("=" * 80 + "\n\n")
    f.write(f"Report generated on: {datetime.now().strftime('%B %d, %Y at %I:%M %p')}\n")
    f.write(f"Total pages: Approximately 15-20 pages when formatted\n\n")
    f.write("For questions or clarifications, contact:\n")
    f.write("UGOARU KINGSLEY NWACHUKWU\n")
    f.write("Department of Information Technology\n")
    f.write("Federal University of Technology, Owerri\n")

print(f"\n✓ Main report generated: {report_path}")
print(f"  File size: {os.path.getsize(report_path) / 1024:.2f} KB")

GENERATING MAIN PROJECT REPORT

✓ Main report generated: results/PROJECT_REPORT.txt
  File size: 15.39 KB


## Executive Summary

In [5]:
print("\n" + "=" * 70)
print("GENERATING EXECUTIVE SUMMARY")
print("=" * 70)

summary_path = 'results/EXECUTIVE_SUMMARY.txt'

with open(summary_path, 'w', encoding='utf-8') as f:
    f.write("=" * 80 + "\n")
    f.write("EXECUTIVE SUMMARY\n")
    f.write("Development of an Intelligent System for Diabetes Classification\n")
    f.write("Using Machine Learning: A Comparative Analysis\n")
    f.write("=" * 80 + "\n\n")
    
    f.write(f"Student: UGOARU KINGSLEY NWACHUKWU (20201249402)\n")
    f.write(f"Date: {datetime.now().strftime('%B %Y')}\n\n")
    
    f.write("PROBLEM STATEMENT\n")
    f.write("-" * 80 + "\n")
    f.write("Diabetes is a growing health concern in Nigeria, with many cases going\n")
    f.write("undiagnosed until serious complications arise. Early detection is critical\n")
    f.write("for effective management, but traditional screening methods are time-consuming\n")
    f.write("and require specialized medical expertise.\n\n")
    
    f.write("OBJECTIVE\n")
    f.write("-" * 80 + "\n")
    f.write("To develop and compare three machine learning algorithms (Naïve Bayes, SVM,\n")
    f.write("and Decision Tree) for automated diabetes classification, identifying the\n")
    f.write("most effective approach for clinical application.\n\n")
    
    f.write("METHODOLOGY\n")
    f.write("-" * 80 + "\n")
    f.write("• Dataset: 768 patient records from Pima Indians Diabetes Database\n")
    f.write("• Features: 8 clinical measurements (glucose, BMI, age, etc.)\n")
    f.write("• Preprocessing: Missing value imputation, feature scaling\n")
    f.write("• Training: 80% training, 20% testing split\n")
    f.write("• Optimization: Grid Search with 5-fold cross-validation\n")
    f.write("• Evaluation: Accuracy, Precision, Recall, F1-Score\n\n")
    
    f.write("KEY FINDINGS\n")
    f.write("-" * 80 + "\n\n")
    
    f.write("Model Performance Comparison:\n")
    for _, row in evaluation_results.iterrows():
        f.write(f"  {row['Model']:15s}: {row['Accuracy']*100:.2f}% accuracy, "
                f"{row['F1-Score']:.4f} F1-Score\n")
    
    f.write(f"\nBest Performing Model: {best_model}\n")
    f.write(f"  • Accuracy: {best_row['Accuracy']*100:.2f}%\n")
    f.write(f"  • F1-Score: {best_row['F1-Score']:.4f}\n")
    f.write(f"  • Correctly identified {best_row['True Positives']} diabetic cases\n")
    f.write(f"  • Missed {best_row['False Negatives']} diabetic cases\n\n")
    
    f.write("SIGNIFICANCE\n")
    f.write("-" * 80 + "\n")
    f.write("• Demonstrates feasibility of ML for diabetes screening\n")
    f.write("• Provides objective comparison of three algorithms\n")
    f.write("• Achieves clinically relevant accuracy levels\n")
    f.write("• Can support healthcare workers in resource-limited settings\n")
    f.write("• Enables faster, more accessible diabetes screening\n\n")
    
    f.write("RECOMMENDATIONS\n")
    f.write("-" * 80 + "\n")
    f.write("1. Validate with local Nigerian patient data\n")
    f.write("2. Conduct clinical trials in healthcare facilities\n")
    f.write("3. Integrate with existing health information systems\n")
    f.write("4. Train healthcare workers on system interpretation\n")
    f.write("5. Establish continuous monitoring and model updates\n\n")
    
    f.write("CONCLUSION\n")
    f.write("-" * 80 + "\n")
    f.write(f"This study successfully developed and compared three ML algorithms for\n")
    f.write(f"diabetes classification. The {best_model} emerged as the best performer,\n")
    f.write(f"achieving {best_row['Accuracy']*100:.2f}% accuracy. The system demonstrates\n")
    f.write(f"potential for supporting early diabetes detection in Nigerian healthcare\n")
    f.write(f"settings, pending further validation and clinical integration.\n\n")
    
    f.write("=" * 80 + "\n")
    f.write("END OF EXECUTIVE SUMMARY\n")
    f.write("=" * 80 + "\n")

print(f"\n✓ Executive summary generated: {summary_path}")
print(f"  File size: {os.path.getsize(summary_path) / 1024:.2f} KB")


GENERATING EXECUTIVE SUMMARY

✓ Executive summary generated: results/EXECUTIVE_SUMMARY.txt
  File size: 3.24 KB


## Methodology

In [6]:
print("\n" + "=" * 70)
print("GENERATING METHODOLOGY DOCUMENTATION")
print("=" * 70)

methodology_path = 'results/METHODOLOGY.txt'

with open(methodology_path, 'w', encoding='utf-8') as f:
    f.write("=" * 80 + "\n")
    f.write("DETAILED METHODOLOGY DOCUMENTATION\n")
    f.write("Diabetes Classification Using Machine Learning\n")
    f.write("=" * 80 + "\n\n")
    
    f.write("CHAPTER 3: METHODOLOGY (For Thesis)\n\n")
    
    # 3.1 Research Design
    f.write("3.1 RESEARCH DESIGN\n")
    f.write("=" * 80 + "\n\n")
    f.write("This study adopted a quantitative comparative research design using supervised\n")
    f.write("machine learning techniques. The research followed a systematic approach:\n\n")
    f.write("1. Data Collection and Exploration\n")
    f.write("2. Data Preprocessing and Preparation\n")
    f.write("3. Model Development and Training\n")
    f.write("4. Model Evaluation and Comparison\n")
    f.write("5. Results Analysis and Interpretation\n\n")
    
    # 3.2 Dataset
    f.write("3.2 DATASET DESCRIPTION\n")
    f.write("=" * 80 + "\n\n")
    f.write("Source: Pima Indians Diabetes Database (UCI Machine Learning Repository)\n")
    f.write("Collection Method: Secondary data from medical records\n")
    f.write("Population: Female patients of Pima Indian heritage\n")
    f.write("Age Criterion: 21 years and above\n")
    f.write("Sample Size: 768 patients\n")
    f.write("Time Period: Collected between 1960s-1990s\n\n")
    
    f.write("Feature Variables:\n")
    features_desc = [
        ("Pregnancies", "Number of times pregnant", "Numerical (0-17)"),
        ("Glucose", "Plasma glucose concentration", "Numerical (0-199 mg/dL)"),
        ("BloodPressure", "Diastolic blood pressure", "Numerical (0-122 mm Hg)"),
        ("SkinThickness", "Triceps skin fold thickness", "Numerical (0-99 mm)"),
        ("Insulin", "2-Hour serum insulin", "Numerical (0-846 mu U/ml)"),
        ("BMI", "Body mass index", "Numerical (0-67.1)"),
        ("DiabetesPedigreeFunction", "Genetic predisposition score", "Numerical (0.078-2.42)"),
        ("Age", "Age in years", "Numerical (21-81)")
    ]
    
    for name, desc, range_val in features_desc:
        f.write(f"  • {name:25s}: {desc:35s} [{range_val}]\n")
    
    f.write(f"\nTarget Variable:\n")
    f.write(f"  • Outcome: Binary classification (0=Non-Diabetic, 1=Diabetic)\n\n")
    
    # 3.3 Data Preprocessing
    f.write("3.3 DATA PREPROCESSING TECHNIQUES\n")
    f.write("=" * 80 + "\n\n")
    
    f.write("3.3.1 Missing Value Treatment\n")
    f.write("-" * 80 + "\n")
    f.write("Problem Identified:\n")
    f.write("  Several features contained zero values that are medically impossible,\n")
    f.write("  indicating missing or erroneous data entries.\n\n")
    
    f.write("Solution Implemented:\n")
    f.write("  Step 1: Identify affected features\n")
    f.write("          (Glucose, BloodPressure, SkinThickness, Insulin, BMI)\n")
    f.write("  Step 2: Replace zero values with NaN (Not a Number)\n")
    f.write("  Step 3: Calculate median value for each feature\n")
    f.write("  Step 4: Impute missing values using median\n\n")
    
    f.write("Justification for Median Imputation:\n")
    f.write("  • Robust to outliers (unlike mean)\n")
    f.write("  • Preserves data distribution\n")
    f.write("  • Widely accepted in medical data preprocessing\n")
    f.write("  • Does not introduce extreme values\n\n")
    
    f.write("3.3.2 Data Splitting\n")
    f.write("-" * 80 + "\n")
    f.write("Method: Stratified train-test split\n")
    f.write("Ratio: 80% training, 20% testing\n")
    f.write("Random State: 42 (for reproducibility)\n\n")
    
    f.write("Stratification Purpose:\n")
    f.write("  Ensures both training and testing sets maintain the same proportion\n")
    f.write("  of diabetic to non-diabetic cases as the original dataset.\n\n")
    
    f.write("Result:\n")
    f.write("  • Training Set: 614 samples\n")
    f.write("  • Testing Set: 154 samples\n")
    f.write("  • Class distribution preserved in both sets\n\n")
    
    f.write("3.3.3 Feature Scaling\n")
    f.write("-" * 80 + "\n")
    f.write("Method: StandardScaler (Z-score normalization)\n\n")
    
    f.write("Mathematical Formula:\n")
    f.write("  z = (x - μ) / σ\n")
    f.write("  Where:\n")
    f.write("    z = standardized value\n")
    f.write("    x = original value\n")
    f.write("    μ = mean of feature\n")
    f.write("    σ = standard deviation of feature\n\n")
    
    f.write("Purpose:\n")
    f.write("  • Transforms all features to have mean = 0 and std = 1\n")
    f.write("  • Prevents features with larger ranges from dominating the model\n")
    f.write("  • Improves convergence speed during training\n")
    f.write("  • Required for distance-based algorithms (SVM, KNN)\n\n")
    
    f.write("Implementation:\n")
    f.write("  • Scaler fitted on training data only\n")
    f.write("  • Same transformation applied to test data\n")
    f.write("  • Scaler saved for future use on new data\n\n")
    
    # 3.4 Model Development
    f.write("3.4 MODEL DEVELOPMENT AND TRAINING\n")
    f.write("=" * 80 + "\n\n")
    
    f.write("3.4.1 Naïve Bayes Classifier\n")
    f.write("-" * 80 + "\n\n")
    f.write("Theoretical Foundation:\n")
    f.write("  Based on Bayes' Theorem with the 'naive' assumption of feature independence.\n\n")
    
    f.write("Mathematical Model:\n")
    f.write("  P(C|X) = [P(X|C) × P(C)] / P(X)\n")
    f.write("  Where:\n")
    f.write("    C = Class (Diabetic or Non-Diabetic)\n")
    f.write("    X = Feature vector\n")
    f.write("    P(C|X) = Posterior probability\n")
    f.write("    P(X|C) = Likelihood\n")
    f.write("    P(C) = Prior probability\n")
    f.write("    P(X) = Evidence\n\n")
    
    f.write("Implementation Details:\n")
    f.write("  Algorithm: GaussianNB (assumes Gaussian distribution)\n")
    f.write("  Library: scikit-learn\n")
    f.write("  Hyperparameters: Default (no tuning required)\n")
    f.write("  Validation: 5-fold cross-validation\n\n")
    
    f.write("Advantages:\n")
    f.write("  • Fast training and prediction\n")
    f.write("  • Works well with small datasets\n")
    f.write("  • Probabilistic interpretation of results\n")
    f.write("  • Low memory requirements\n\n")
    
    f.write("3.4.2 Support Vector Machine (SVM)\n")
    f.write("-" * 80 + "\n\n")
    f.write("Theoretical Foundation:\n")
    f.write("  Finds the optimal hyperplane that maximizes the margin between classes.\n\n")
    
    f.write("Mathematical Model:\n")
    f.write("  Objective: Minimize ||w||² subject to y_i(w·x_i + b) ≥ 1\n")
    f.write("  Where:\n")
    f.write("    w = weight vector (normal to hyperplane)\n")
    f.write("    b = bias term\n")
    f.write("    x_i = feature vector\n")
    f.write("    y_i = class label (-1 or +1)\n\n")
    
    f.write("Hyperparameter Optimization:\n")
    f.write("  Method: Grid Search with Cross-Validation\n")
    f.write("  Search Space:\n")
    f.write("    • C (Regularization): [0.1, 1, 10, 100]\n")
    f.write("    • kernel: ['linear', 'rbf', 'poly']\n")
    f.write("    • gamma: ['scale', 'auto', 0.001, 0.01]\n")
    f.write("  Total Combinations: 48\n")
    f.write("  Validation: 5-fold cross-validation\n")
    f.write("  Scoring Metric: Accuracy\n\n")
    
    f.write("Hyperparameter Descriptions:\n")
    f.write("  • C: Controls trade-off between margin size and training error\n")
    f.write("  • kernel: Defines the decision boundary shape\n")
    f.write("  • gamma: Influences the reach of individual training samples\n\n")
    
    f.write("3.4.3 Decision Tree Classifier\n")
    f.write("-" * 80 + "\n\n")
    f.write("Theoretical Foundation:\n")
    f.write("  Recursive binary splitting based on feature values to create a tree\n")
    f.write("  structure that separates classes.\n\n")
    
    f.write("Splitting Criteria:\n")
    f.write("  Gini Impurity: Gini = 1 - Σ(p_i)²\n")
    f.write("  Entropy: H = -Σ(p_i × log₂(p_i))\n")
    f.write("  Where p_i is the proportion of class i at a node\n\n")
    
    f.write("Hyperparameter Optimization:\n")
    f.write("  Method: Grid Search with Cross-Validation\n")
    f.write("  Search Space:\n")
    f.write("    • max_depth: [3, 5, 7, 10, None]\n")
    f.write("    • min_samples_split: [2, 5, 10, 20]\n")
    f.write("    • min_samples_leaf: [1, 2, 4]\n")
    f.write("    • criterion: ['gini', 'entropy']\n")
    f.write("  Total Combinations: 160\n")
    f.write("  Validation: 5-fold cross-validation\n\n")
    
    f.write("Hyperparameter Descriptions:\n")
    f.write("  • max_depth: Maximum depth of the tree (controls overfitting)\n")
    f.write("  • min_samples_split: Minimum samples required to split a node\n")
    f.write("  • min_samples_leaf: Minimum samples required in each leaf\n")
    f.write("  • criterion: Function to measure split quality\n\n")
    
    # 3.5 Model Evaluation
    f.write("3.5 MODEL EVALUATION METHODOLOGY\n")
    f.write("=" * 80 + "\n\n")
    
    f.write("3.5.1 Evaluation Metrics\n")
    f.write("-" * 80 + "\n\n")
    
    f.write("Accuracy:\n")
    f.write("  Formula: (TP + TN) / (TP + TN + FP + FN)\n")
    f.write("  Interpretation: Overall proportion of correct predictions\n\n")
    
    f.write("Precision:\n")
    f.write("  Formula: TP / (TP + FP)\n")
    f.write("  Interpretation: Of all positive predictions, how many are correct?\n")
    f.write("  Clinical Meaning: Reliability of diabetic diagnosis\n\n")
    
    f.write("Recall (Sensitivity):\n")
    f.write("  Formula: TP / (TP + FN)\n")
    f.write("  Interpretation: Of all actual positives, how many are detected?\n")
    f.write("  Clinical Meaning: Ability to identify diabetic patients\n\n")
    
    f.write("F1-Score:\n")
    f.write("  Formula: 2 × (Precision × Recall) / (Precision + Recall)\n")
    f.write("  Interpretation: Harmonic mean of precision and recall\n")
    f.write("  Clinical Meaning: Balance between false alarms and missed cases\n\n")
    
    f.write("Confusion Matrix Components:\n")
    f.write("  • True Positive (TP): Correctly identified diabetic cases\n")
    f.write("  • True Negative (TN): Correctly identified non-diabetic cases\n")
    f.write("  • False Positive (FP): Non-diabetic incorrectly labeled diabetic\n")
    f.write("  • False Negative (FN): Diabetic cases missed by the model\n\n")
    
    f.write("3.5.2 Cross-Validation Strategy\n")
    f.write("-" * 80 + "\n")
    f.write("Method: K-Fold Cross-Validation (k=5)\n\n")
    f.write("Process:\n")
    f.write("  1. Split training data into 5 equal folds\n")
    f.write("  2. Train on 4 folds, validate on 1 fold\n")
    f.write("  3. Rotate and repeat 5 times\n")
    f.write("  4. Average the 5 validation scores\n\n")
    f.write("Purpose:\n")
    f.write("  • Assess model stability and consistency\n")
    f.write("  • Reduce variance in performance estimates\n")
    f.write("  • Detect overfitting\n\n")
    
    f.write("3.5.3 Model Comparison Framework\n")
    f.write("-" * 80 + "\n")
    f.write("Primary Criterion: F1-Score\n")
    f.write("  Rationale: Balances precision and recall, critical in medical diagnosis\n\n")
    
    f.write("Secondary Criteria:\n")
    f.write("  • Accuracy: Overall performance\n")
    f.write("  • Precision: Cost of false positives\n")
    f.write("  • Recall: Cost of false negatives\n")
    f.write("  • Training Time: Computational efficiency\n")
    f.write("  • Model Interpretability: Clinical acceptability\n\n")
    
    # 3.6 Tools and Technologies
    f.write("3.6 TOOLS AND IMPLEMENTATION ENVIRONMENT\n")
    f.write("=" * 80 + "\n\n")
    
    f.write("Programming Language:\n")
    f.write("  • Python 3.10\n\n")
    
    f.write("Core Libraries:\n")
    f.write("  • scikit-learn 1.3.2: Machine learning algorithms\n")
    f.write("  • pandas 2.0.3: Data manipulation\n")
    f.write("  • NumPy 1.24.3: Numerical computations\n")
    f.write("  • Matplotlib 3.7.2: Visualization\n")
    f.write("  • Seaborn 0.12.2: Statistical visualization\n\n")
    
    f.write("Development Environment:\n")
    f.write("  • Jupyter Notebook: Interactive development\n")
    f.write("  • Operating System: Cross-platform compatible\n\n")
    
    f.write("Hardware Requirements:\n")
    f.write("  • Minimum: 4GB RAM, dual-core processor\n")
    f.write("  • Recommended: 8GB RAM, quad-core processor\n\n")
    
    f.write("=" * 80 + "\n")
    f.write("END OF METHODOLOGY DOCUMENTATION\n")
    f.write("=" * 80 + "\n")

print(f"\n✓ Methodology documentation generated: {methodology_path}")
print(f"  File size: {os.path.getsize(methodology_path) / 1024:.2f} KB")


GENERATING METHODOLOGY DOCUMENTATION

✓ Methodology documentation generated: results/METHODOLOGY.txt
  File size: 9.77 KB


## Result Summary for Presentation

In [7]:
print("\n" + "=" * 70)
print("GENERATING PRESENTATION SUMMARY")
print("=" * 70)

presentation_path = 'results/PRESENTATION_SUMMARY.txt'

with open(presentation_path, 'w', encoding='utf-8') as f:
    f.write("=" * 80 + "\n")
    f.write("PRESENTATION SUMMARY\n")
    f.write("Quick Reference for Project Defense\n")
    f.write("=" * 80 + "\n\n")
    
    f.write("SLIDE 1: TITLE\n")
    f.write("-" * 80 + "\n")
    f.write("Development of an Intelligent System for Diabetes Classification\n")
    f.write("Using Machine Learning: A Comparative Analysis\n\n")
    f.write("By: UGOARU KINGSLEY NWACHUKWU (20201249402)\n")
    f.write("Supervisor: Mrs Vivian Mbamala\n\n")
    
    f.write("SLIDE 2: PROBLEM STATEMENT\n")
    f.write("-" * 80 + "\n")
    f.write("• Diabetes affects millions globally, including Nigeria\n")
    f.write("• Many cases undiagnosed until complications arise\n")
    f.write("• Traditional screening is time-consuming and resource-intensive\n")
    f.write("• Need for automated, efficient screening tools\n\n")
    
    f.write("SLIDE 3: OBJECTIVES\n")
    f.write("-" * 80 + "\n")
    f.write("1. Develop three ML algorithms for diabetes classification\n")
    f.write("2. Compare performance of Naïve Bayes, SVM, and Decision Tree\n")
    f.write("3. Identify the most effective algorithm\n")
    f.write("4. Provide recommendations for clinical implementation\n\n")
    
    f.write("SLIDE 4: METHODOLOGY\n")
    f.write("-" * 80 + "\n")
    f.write("Dataset: 768 patients, 8 clinical features\n")
    f.write("Process:\n")
    f.write("  ① Data Collection → ② Preprocessing → ③ Model Training →\n")
    f.write("  ④ Hyperparameter Optimization → ⑤ Evaluation → ⑥ Comparison\n\n")
    
    f.write("SLIDE 5: DATA PREPROCESSING\n")
    f.write("-" * 80 + "\n")
    f.write("• Handled missing values (median imputation)\n")
    f.write("• Split data: 80% training, 20% testing\n")
    f.write("• Feature scaling (StandardScaler)\n")
    f.write("• Result: Clean, normalized dataset ready for ML\n\n")
    
    f.write("SLIDE 6: MODELS DEVELOPED\n")
    f.write("-" * 80 + "\n")
    f.write("1. Naïve Bayes: Fast, probabilistic\n")
    f.write("2. SVM: Margin-based, 48 hyperparameter combinations tested\n")
    f.write("3. Decision Tree: Rule-based, 160 hyperparameter combinations tested\n\n")
    
    f.write("SLIDE 7: RESULTS - PERFORMANCE COMPARISON\n")
    f.write("-" * 80 + "\n")
    f.write(f"{'Model':<15} {'Accuracy':<12} {'Precision':<12} {'Recall':<12} {'F1-Score':<12}\n")
    f.write("-" * 80 + "\n")
    for _, row in evaluation_results.iterrows():
        f.write(f"{row['Model']:<15} "
                f"{row['Accuracy']*100:>10.2f}% "
                f"{row['Precision']:>11.4f} "
                f"{row['Recall']:>11.4f} "
                f"{row['F1-Score']:>11.4f}\n")
    f.write("\n")
    
    f.write("SLIDE 8: BEST MODEL\n")
    f.write("-" * 80 + "\n")
    f.write(f"🏆 Winner: {best_model}\n\n")
    f.write(f"Performance:\n")
    f.write(f"  • Accuracy: {best_row['Accuracy']*100:.2f}%\n")
    f.write(f"  • F1-Score: {best_row['F1-Score']:.4f}\n")
    f.write(f"  • Correctly identified: {best_row['True Positives']} diabetic patients\n")
    f.write(f"  • Missed: {best_row['False Negatives']} diabetic patients\n\n")
    
    f.write("Why This Model?\n")
    f.write("  • Best balance between precision and recall\n")
    f.write("  • Minimizes both false positives and false negatives\n")
    f.write("  • Clinically significant performance\n\n")
    
    f.write("SLIDE 9: KEY FINDINGS\n")
    f.write("-" * 80 + "\n")
    f.write("✓ All three algorithms achieved >74% accuracy\n")
    f.write(f"✓ {best_model} outperformed others\n")
    f.write("✓ Machine learning is viable for diabetes screening\n")
    f.write("✓ System can support healthcare workers\n\n")
    
    f.write("SLIDE 10: VISUALIZATIONS TO SHOW\n")
    f.write("-" * 80 + "\n")
    f.write("• Confusion matrices (all three models side-by-side)\n")
    f.write("• Performance comparison bar chart\n")
    f.write("• ROC curves\n")
    f.write("• Performance dashboard (4-panel summary)\n\n")
    
    f.write("SLIDE 11: LIMITATIONS\n")
    f.write("-" * 80 + "\n")
    f.write("• Dataset from Pima Indian population (may not generalize to Nigerians)\n")
    f.write("• Limited sample size (768 patients)\n")
    f.write("• No real-world clinical validation yet\n")
    f.write("• Binary classification only\n\n")
    
    f.write("SLIDE 12: RECOMMENDATIONS\n")
    f.write("-" * 80 + "\n")
    f.write("For Implementation:\n")
    f.write("  • Collect local Nigerian patient data\n")
    f.write("  • Conduct clinical trials\n")
    f.write("  • Develop user-friendly interface\n")
    f.write("  • Train healthcare workers\n\n")
    
    f.write("For Research:\n")
    f.write("  • Explore ensemble methods\n")
    f.write("  • Include temporal monitoring\n")
    f.write("  • Distinguish diabetes types\n\n")
    
    f.write("SLIDE 13: CONCLUSION\n")
    f.write("-" * 80 + "\n")
    f.write(f"• Successfully compared three ML algorithms\n")
    f.write(f"• {best_model} achieved best performance ({best_row['Accuracy']*100:.2f}% accuracy)\n")
    f.write(f"• Demonstrates potential for clinical support tools\n")
    f.write(f"• Contributes to diabetes management in Nigeria\n\n")
    
    f.write("SLIDE 14: QUESTIONS & ANSWERS\n")
    f.write("-" * 80 + "\n")
    f.write("Be prepared to answer:\n")
    f.write("  1. Why these three algorithms?\n")
    f.write("  2. How does hyperparameter tuning work?\n")
    f.write("  3. What is F1-Score and why is it important?\n")
    f.write("  4. How would you deploy this in a Nigerian hospital?\n")
    f.write("  5. What are the ethical considerations?\n")
    f.write("  6. How do you handle data privacy?\n")
    f.write("  7. Can this work with other diseases?\n\n")
    
    f.write("KEY STATISTICS TO REMEMBER:\n")
    f.write("-" * 80 + "\n")
    f.write(f"• Dataset: 768 patients, 8 features\n")
    f.write(f"• Best Model: {best_model}\n")
    f.write(f"• Best Accuracy: {best_row['Accuracy']*100:.2f}%\n")
    f.write(f"• Best F1-Score: {best_row['F1-Score']:.4f}\n")
    f.write(f"• Training/Test Split: 80/20\n")
    f.write(f"• Validation Method: 5-fold cross-validation\n\n")
    
    f.write("=" * 80 + "\n")
    f.write("END OF PRESENTATION SUMMARY\n")
    f.write("=" * 80 + "\n")

print(f"\n✓ Presentation summary generated: {presentation_path}")
print(f"  File size: {os.path.getsize(presentation_path) / 1024:.2f} KB")


GENERATING PRESENTATION SUMMARY

✓ Presentation summary generated: results/PRESENTATION_SUMMARY.txt
  File size: 5.39 KB
