# Data Collection Analysis

This notebook analyzes the Python functions collected from GitHub repositories.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
# Only the CSV read is guarded. When the file is missing, `df` is left
# undefined so the later cells' `'df' in locals()` checks skip their work.
try:
    df = pd.read_csv('../data/raw/github_functions.csv')
except FileNotFoundError:
    print("Data file not found. Run the collector script first.")
else:
    # Headline counts first, then a rich preview of the first rows.
    print(f"Total functions collected: {len(df)}")
    print(f"Total repositories: {df['repo_name'].nunique()}")
    display(df.head())

## Function Length Distribution

In [None]:
if 'df' in locals():
    # Histogram of the `num_lines` column (50 bins) with a KDE curve overlaid,
    # drawn on an explicitly created Axes rather than the implicit current one.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(df['num_lines'], bins=50, kde=True, ax=ax)
    ax.set_title('Distribution of Function Lengths')
    ax.set_xlabel('Number of Lines')
    ax.set_ylabel('Count')
    plt.show()

## Most Common Function Names

In [None]:
if 'df' in locals():
    # Tally how often each function name occurs and keep the 20 most frequent
    # (value_counts sorts in descending order by default).
    top_names = df['function_name'].value_counts().head(20)
    plt.figure(figsize=(12, 6))
    # Horizontal bars: counts along x, one function name per row on y.
    sns.barplot(x=top_names.values, y=top_names.index)
    plt.title('Top 20 Most Common Function Names')
    plt.xlabel('Count')
    # Set the y label explicitly: otherwise it is whatever gets inferred from
    # the index name, which differs between pandas versions.
    plt.ylabel('Function Name')
    plt.show()

## Random Samples

In [None]:
if 'df' in locals():
    # Print the source of up to five randomly chosen functions for a quick
    # eyeball check. The sample size is capped at the number of rows because
    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (it samples without replacement by default).
    sample = df.sample(min(5, len(df)))
    for _, row in sample.iterrows():
        print(f"--- {row['function_name']} (from {row['repo_name']}) ---")
        print(row['code'])
        # Visual separator between consecutive samples.
        print("\n" + "="*50 + "\n")