Graphs visualizing the distribution of the classifications generated by the antivirus engines

In [25]:
import csv
import matplotlib.pyplot as plt
import os
from collections import Counter
from adjustText import adjust_text

# Per-row detection-count columns of the input CSV, mapped to the label shown
# for each on the detection-count bar chart. Insertion order is the bar order.
_DETECTION_COLUMNS = {
    'Phishing_Count': 'Phishing',
    'Trojan_Count': 'Trojan',
    'Spam_Count': 'Spam',
    'Other_Count': 'Other',
    'None_Count': 'None',
}


def _read_category_data(input_csv):
    """Read the CSV and return ``(category_counts, detection_totals)``.

    ``category_counts`` is a Counter of the ``Category`` column;
    ``detection_totals`` maps each column in ``_DETECTION_COLUMNS`` to the sum
    of its integer values over all rows.

    Raises:
        FileNotFoundError: if ``input_csv`` does not exist.
        ValueError: if a required column is missing from the header or a
            count cell cannot be parsed as an integer.
    """
    category_counts = Counter()
    detection_totals = dict.fromkeys(_DETECTION_COLUMNS, 0)

    # newline='' is what the csv module asks for so that newlines embedded in
    # quoted fields are handled by the parser rather than by the file object.
    with open(input_csv, 'r', encoding='utf-8', newline='') as csvfile:
        reader = csv.DictReader(csvfile)

        # Check the header once so a missing column is reported by name
        # instead of surfacing as a bare KeyError on the first row.
        header = reader.fieldnames or []
        missing = [name for name in ('Category', *_DETECTION_COLUMNS)
                   if name not in header]
        if missing:
            raise ValueError(f"missing required column(s): {', '.join(missing)}")

        for row in reader:
            category_counts[row['Category']] += 1
            for column in _DETECTION_COLUMNS:
                try:
                    detection_totals[column] += int(row[column])
                except (TypeError, ValueError) as e:
                    # TypeError covers short rows, where DictReader fills the
                    # missing cell with None.
                    raise ValueError(
                        f"line {reader.line_num}: invalid {column} "
                        f"value {row[column]!r}"
                    ) from e

    return category_counts, detection_totals


def _plot_category_pie(category_counts, total_samples,
                       output_path='category_pie_chart.png'):
    """Save a pie chart of the category distribution with a legend."""
    def autopct_format(pct):
        # Suppress in-wedge labels for slices of 1% or less; the legend still
        # lists them.
        return f'{pct:.1f}%' if pct > 1 else ''

    values = list(category_counts.values())
    # Pull slices under 5% slightly out of the pie.
    explode = [0.1 if value / total_samples < 0.05 else 0 for value in values]

    fig = plt.figure(figsize=(10, 6))
    try:
        wedges, _, autotexts = plt.pie(
            values,
            labels=None,  # Categories are shown in the legend instead
            autopct=autopct_format,
            startangle=90,
            explode=explode,
        )

        for autotext in autotexts:
            autotext.set_fontsize(10)
            autotext.set_bbox(dict(facecolor='white', edgecolor='none', alpha=0.6))

        # Let adjustText reposition the percentage labels.
        adjust_text(autotexts)

        labels = [f"{key} ({value/total_samples:.1%})"
                  for key, value in category_counts.items()]
        plt.legend(wedges, labels, title="Categories", loc="best", fontsize=12,
                   bbox_to_anchor=(0.5, 0., 0.5, 0.5), frameon=False)

        plt.title('Distribution of Sample Categories')
        plt.axis('equal')  # Ensures the pie chart is circular
        plt.savefig(output_path)
    finally:
        # Close the figure even when drawing or saving fails.
        plt.close(fig)


def _save_bar_chart(labels, counts, colors, *, figsize, title, xlabel, ylabel,
                    output_path):
    """Save a bar chart with the integer count written above each bar."""
    fig = plt.figure(figsize=figsize)
    try:
        bars = plt.bar(labels, counts, color=colors)
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)

        # Add count labels on top of bars
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width() / 2., height,
                     f'{int(height)}', ha='center', va='bottom')

        plt.savefig(output_path)
    finally:
        # Close the figure even when drawing or saving fails.
        plt.close(fig)


def visualize_categories(input_csv):
    """Summarize a sample-category CSV and save three charts as PNG files.

    The CSV must have a ``Category`` column and the integer columns
    ``Phishing_Count``, ``Trojan_Count``, ``Spam_Count``, ``Other_Count`` and
    ``None_Count``. The function prints per-category totals and percentages,
    then writes to the current working directory:

    - ``category_pie_chart.png``: pie chart of the category distribution
    - ``count_comparison.png``: bar chart of the summed detection counts
    - ``category_totals_bar.png``: bar chart of samples per category

    Failures are reported on stdout rather than raised, with separate
    messages for a missing file, an unreadable or malformed CSV, and a
    plotting failure. A CSV with no data rows produces a notice and no charts.

    Args:
        input_csv: Path of the CSV file to analyze.

    Returns:
        None.
    """
    try:
        category_counts, detection_totals = _read_category_data(input_csv)
    except FileNotFoundError:
        print(f"Error: Input CSV file not found: {input_csv}")
        return
    except (OSError, csv.Error, ValueError) as e:
        # ValueError also covers UnicodeDecodeError and the validation errors
        # raised by _read_category_data.
        print(f"Error processing CSV file: {e}")
        return

    total_samples = sum(category_counts.values())
    if total_samples == 0:
        # Nothing to divide by and nothing to draw.
        print(f"No samples found in {input_csv}; nothing to visualize.")
        return

    # Print totals and percentages
    print("\nCategory Totals and Percentages:")
    print("-" * 30)
    for category, count in category_counts.items():
        percentage = (count / total_samples) * 100
        print(f"{category}: {count} ({percentage:.2f}%)")
    print(f"Total samples: {total_samples}")

    try:
        # 1. Pie chart for category distribution with labels
        _plot_category_pie(category_counts, total_samples)

        # 2. Bar chart comparing total detection counts
        _save_bar_chart(
            list(_DETECTION_COLUMNS.values()),
            list(detection_totals.values()),
            ['blue', 'red', 'orange', 'green', 'gray'],
            figsize=(12, 8),
            title='Total Detection Counts by Type',
            xlabel='Detection Type',
            ylabel='Total Count',
            output_path='count_comparison.png',
        )

        # 3. Bar chart for category totals
        categories = list(category_counts.keys())
        palette = ['blue', 'red', 'orange', 'purple', 'green', 'gray']
        _save_bar_chart(
            categories,
            list(category_counts.values()),
            palette[:len(categories)],
            figsize=(10, 6),
            title='Total Samples by Category',
            xlabel='Category',
            ylabel='Number of Samples',
            output_path='category_totals_bar.png',
        )
    except Exception as e:
        # Best-effort boundary: report the plotting failure instead of
        # crashing, and do not blame the CSV for it.
        print(f"Error generating visualizations: {e}")
        return

    print("\nGenerated visualizations:")
    print("- category_pie_chart.png (Pie chart of category distribution with labels)")
    print("- count_comparison.png (Bar chart comparing total detection counts)")
    print("- category_totals_bar.png (Bar chart of category totals)")

def main():
    """Run the category visualization on the configured CSV file.

    Announces the input path, delegates to ``visualize_categories``, and
    prints a completion message afterwards.
    """
    # Configuration - adjust this path as needed.
    # The CSV from the previous script.
    csv_path = './data_by_file/sample_categories.csv'

    print(f"Analyzing data from: {csv_path}")
    visualize_categories(csv_path)
    print("Visualization generation complete")

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Analyzing data from: ./data_by_file/sample_categories.csv

Category Totals and Percentages:
------------------------------
Phishing: 3575 (95.97%)
Multiple: 85 (2.28%)
Trojan: 64 (1.72%)
Spam: 1 (0.03%)
Total samples: 3725

Generated visualizations:
- category_pie_chart.png (Pie chart of category distribution with labels)
- count_comparison.png (Bar chart comparing total detection counts)
- category_totals_bar.png (Bar chart of category totals)
Visualization generation complete
