In [57]:
import os
import re
from collections import defaultdict, Counter
from datetime import datetime
import yaml
import math

# Paths

In [24]:
# Vault locations (absolute, Windows-specific paths).
# The original value ended in "Thougts"; every other path in this notebook
# (daily notes, scan directory, output files) uses "Thoughts", so the missing
# "h" is treated as a typo.
main_vault_path = r"C:\Users\nonak\Documents\Thoughts"
second_vault_path = r"C:\Users\nonak\Documents\MyObsidianSetup"
# Backward-compatible alias: the variable was originally spelled "valt";
# keep that name bound in case another cell still reads it.
second_valt_path = second_vault_path

daily_notes_path = r"C:\Users\nonak\Documents\Thoughts\Calendar\DAILY"



### PropertyAnalysis

In [7]:
## extract data functions
def get_file_metadata(file_path, directory_to_scan=None):
    """Collect timestamps, size and a relative path for one file.

    Args:
        file_path: Path of the file to inspect.
        directory_to_scan: Directory that ``rel_path`` is computed against.
            When omitted, the module-level ``directory_to_scan`` variable is
            used if it is defined and non-empty (this is what the original
            one-argument version relied on); otherwise the current directory.

    Returns:
        dict with ``created`` and ``modified`` (``datetime``), ``size_kb``
        (file size in bytes divided by 1024) and ``rel_path`` (``str``).

    Raises:
        OSError: if ``file_path`` cannot be stat'ed.
        ValueError: from ``os.path.relpath`` on Windows when the file and the
            base directory are on different drives.
    """
    if directory_to_scan is None:
        # Fall back to the notebook-level setting instead of failing with a
        # NameError when the configuration cell has not been run yet.
        directory_to_scan = globals().get('directory_to_scan') or os.curdir

    stat = os.stat(file_path)
    # st_ctime is the creation time only on Windows; on Unix it is the time of
    # the last metadata change. Prefer st_birthtime where it is exposed.
    created_ts = getattr(stat, 'st_birthtime', stat.st_ctime)
    return {
        'created': datetime.fromtimestamp(created_ts),
        'modified': datetime.fromtimestamp(stat.st_mtime),
        'size_kb': stat.st_size / 1024,
        'rel_path': os.path.relpath(file_path, start=directory_to_scan)
    }

def analyze_frontmatter(directory):
    """Scan ``directory`` for ``.md`` files and aggregate their YAML frontmatter.

    Every ``.md`` file found by ``os.walk`` is counted in ``total_files``. A
    file contributes date, size and property data only when it begins with
    ``---`` and a later line starting with ``---`` closes the header.

    Args:
        directory: Root folder, walked recursively.

    Returns:
        dict with the keys:
        ``property_stats``   property -> Counter of stringified values (list
                             values are counted item by item);
        ``property_presence`` Counter, property -> number of files using it;
        ``date_stats``       oldest/newest/timespan/median/total_files/
                             files_per_day, or ``{}`` when no header was found;
        ``size_stats``       total_mb/avg_kb/largest_kb/smallest_kb, or ``{}``;
        ``total_files``      number of ``.md`` files seen;
        ``unparsed_files``   paths whose header was invalid YAML or did not
                             parse to a mapping (they are skipped, not fatal).
    """
    property_stats = defaultdict(Counter)
    property_presence = Counter()
    file_count = 0
    dates = []
    sizes = []
    unparsed_files = []

    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith(".md"):
                continue
            file_path = os.path.join(root, file)
            file_count += 1

            # 'utf-8-sig' drops a leading BOM so startswith("---") still
            # matches; errors='replace' keeps one badly encoded note from
            # aborting the whole scan.
            with open(file_path, 'r', encoding='utf-8-sig', errors='replace') as f:
                content = f.read()

            if not content.startswith("---"):
                continue
            # The closing delimiter has to start a line; a bare "---" inside
            # a value (e.g. "title: a---b") must not end the header early.
            end_of_header = content.find("\n---", 3)
            if end_of_header == -1:
                continue
            header = content[3:end_of_header].strip()

            metadata = get_file_metadata(file_path)
            dates.append(metadata['created'])
            sizes.append(metadata['size_kb'])

            try:
                data = yaml.safe_load(header) or {}
            except yaml.YAMLError:
                # Best effort: remember the file and keep scanning.
                unparsed_files.append(file_path)
                continue
            if not isinstance(data, dict):
                # e.g. a note that opens with a horizontal rule parses to a
                # plain string, which has no .items().
                unparsed_files.append(file_path)
                continue

            for prop, value in data.items():
                property_presence[prop] += 1
                if isinstance(value, list):
                    for item in value:
                        property_stats[prop][str(item)] += 1
                else:
                    property_stats[prop][str(value)] += 1

    # Calculate date statistics
    date_stats = {}
    if dates:
        sorted_dates = sorted(dates)
        oldest, newest = sorted_dates[0], sorted_dates[-1]
        timespan = newest - oldest
        # timespan.days is 0 when everything was created within 24 hours;
        # report 0 instead of dividing by zero.
        span_days = timespan.days
        date_stats = {
            'oldest': oldest,
            'newest': newest,
            'timespan': timespan,
            'median': sorted_dates[len(sorted_dates)//2],
            'total_files': file_count,
            'files_per_day': file_count / span_days if span_days > 0 else 0
        }

    # Calculate size statistics
    size_stats = {}
    if sizes:
        size_stats = {
            'total_mb': sum(sizes) / 1024,
            'avg_kb': sum(sizes) / len(sizes),
            'largest_kb': max(sizes),
            'smallest_kb': min(sizes)
        }

    return {
        'property_stats': dict(property_stats),
        'property_presence': property_presence,
        'date_stats': date_stats,
        'size_stats': size_stats,
        'total_files': file_count,
        'unparsed_files': unparsed_files
    }


In [8]:
# Generate report and .md
def generate_statistical_report(data, output_file="markdown_stats_report.md"):
    """Render the statistics in ``data`` as a Markdown report.

    ``data`` is read through the keys ``total_files``, ``date_stats``,
    ``property_presence`` (needs ``most_common``) and ``property_stats``
    (property -> Counter of values). The file at ``output_file`` is created
    or overwritten; the function returns nothing.
    """
    with open(output_file, 'w', encoding='utf-8') as md:
        emit = md.write

        # Title and run summary
        emit("# Markdown Statistics Report\n\n")
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        emit(f"> **Analysis performed:** {timestamp}\n")
        emit(f"> **Total files analyzed:** {data['total_files']:,}\n\n")

        date_info = data['date_stats']
        if not date_info:
            emit("*No date information available*\n")
        else:
            first_day = date_info['oldest'].strftime('%Y-%m-%d')
            last_day = date_info['newest'].strftime('%Y-%m-%d')
            emit(f"> **Date range:** {first_day} to {last_day}\n")
            emit(f"> **Timespan:** {date_info['timespan'].days} days\n")
            emit(f"> **Average files per day:** {date_info['files_per_day']:.2f}\n\n")
            # Wikilink to the index note; written only when dates exist.
            emit("[[_index_notas|Acessar Index todas as notas]]")

        # Size section, currently disabled:
        # # Size Analysis
        # md.write("\n## 📦 Size Analysis\n")
        # if data['size_stats']:
        #     stats = data['size_stats']
        #     md.write(f"> **Total content size:** {stats['total_mb']:.2f} MB\n")
        #     md.write(f"> **Average file size:** {stats['avg_kb']:.1f} KB\n")
        #     md.write(f"> **Largest file:** {stats['largest_kb']:.1f} KB\n")
        #     md.write(f"> **Smallest file:** {stats['smallest_kb']:.1f} KB\n")
        #     md.write(f"> **Size range:** {stats['largest_kb']/stats['smallest_kb']:.1f}x variation\n")
        # else:
        #     md.write("*No size information available*\n")

        # Coverage table: the 15 most widely used properties
        emit("\n\n\n## Property Counts\n")
        emit("| Property | Files | Coverage |\n")
        emit("|----------|-------|----------|\n")
        for name, n_files in data['property_presence'].most_common(15):
            pct = (n_files / data['total_files']) * 100
            emit(f"| `{name}` | {n_files} | {pct:.1f}% |\n")

        # One subsection per property with its value distribution
        emit("\n## By Property\n")
        for name, values in data['property_stats'].items():
            occurrences = sum(values.values())
            distinct = len(values)
            n_files = data['property_presence'][name]
            share = n_files / data['total_files']
            emit(f"\n##### `{name}` \n> **Present in:** {n_files} files ({share:.1%})\n")

            if distinct > 15:
                # Too many values to list: show the top five and lump the rest.
                head = values.most_common(5)
                remainder = occurrences - sum(n for _, n in head)
                emit("\n> **Top Values:**\n")
                md.writelines(
                    f"- `{val}`: {n} ({n/occurrences:.1%})\n" for val, n in head
                )
                emit(f"\n> *Others ({distinct-5} values)*: {remainder} ({remainder/occurrences:.1%})\n")
            else:
                emit("\n**Value Distribution:**\n")
                md.writelines(
                    f"- `{val}`: {n} ({n/occurrences:.1%})\n"
                    for val, n in values.most_common()
                )
            
            # Calculate Gini coefficient for inequality
            # if unique > 1:
            #     sorted_counts = sorted(counter.values())
            #     n = len(sorted_counts)
            #     gini = sum(abs(x-y) for x in sorted_counts for y in sorted_counts) / (2*n*sum(sorted_counts))
            #     md.write(f"> **Value inequality (Gini):** {gini:.3f} (0=equal, 1=unequal)\n")
        
        # Recommendations Section
        # md.write("\n## 🚀 Recommendations\n")
        # md.write("### Based on your metadata patterns:\n")
        
        # # Property standardization opportunities
        # common_props = [p for p, c in data['property_presence'].most_common(5) if c/data['total_files'] > 0.7]
        # if common_props:
        #     md.write("- These properties are nearly universal and could be enforced:\n")
        #     for prop in common_props:
        #         md.write(f"  - `{prop}` (in {data['property_presence'][prop]/data['total_files']:.0%} of files)\n")
        
        # # Underused properties
        # rare_props = [p for p, c in data['property_presence'].items() if 0 < c/data['total_files'] < 0.1]
        # if rare_props:
        #     md.write("\n- These properties are rarely used and might need review:\n")
        #     for prop in rare_props[:5]:
        #         md.write(f"  - `{prop}` (only {data['property_presence'][prop]} files)\n")
        
        # # High-value-diversity properties
        # diverse_props = []
        # for prop, counter in data['property_stats'].items():
        #     unique = len(counter)
        #     total = sum(counter.values())
        #     if unique > 10 and total/unique < 3:
        #         diverse_props.append((prop, unique))
        
        # if diverse_props:
        #     md.write("\n- These properties have many unique values with low repetition:\n")
        #     for prop, unique in sorted(diverse_props, key=lambda x: x[1], reverse=True)[:3]:
        #         md.write(f"  - `{prop}` ({unique} unique values)\n")
        #         values_sample = ", ".join(f"`{v}`" for v, _ in data['property_stats'][prop].most_common(3))
        #         md.write(f"    *Sample values:* {values_sample}\n")

# Configuration
# `directory_to_scan` is also read as a module-level name by
# get_file_metadata (for its `rel_path`), so it must be assigned before the
# analysis below runs.
directory_to_scan = r"C:\Users\nonak\Documents\Thoughts"
# The report is written inside the scanned directory, so it is itself a .md
# file that later scans will pick up.
output_file = r"C:\Users\nonak\Documents\Thoughts\PropertyAnalysis.md"

# Run analysis and generate report
# `data` is the result dict of analyze_frontmatter; the report writer creates
# or overwrites `output_file`.
data = analyze_frontmatter(directory_to_scan)
generate_statistical_report(data, output_file)
print(f"Statistical report generated: {output_file}")

Statistical report generated: C:\Users\nonak\Documents\Thoughts\PropertyAnalysis.md


### Create an index note with all notes by folder

#### Getting data

In [147]:
import os
from collections import defaultdict, Counter
from datetime import datetime
import yaml

def contar_palavras(texto):
    """Return the number of whitespace-separated tokens in ``texto``."""
    tokens = texto.split()
    return len(tokens)

def get_file_metadata(file_path, directory_to_scan):
    """Collect timestamps, size and a relative path for one file.

    Args:
        file_path: Path of the file to inspect.
        directory_to_scan: Directory that ``rel_path`` is computed against.

    Returns:
        dict with ``created`` and ``modified`` (``datetime``), ``size_kb``
        (file size in bytes divided by 1024) and ``rel_path`` (``str``).

    Raises:
        OSError: if ``file_path`` cannot be stat'ed.
        ValueError: from ``os.path.relpath`` on Windows when the file and
            ``directory_to_scan`` are on different drives.
    """
    stat = os.stat(file_path)
    # st_ctime is the creation time only on Windows; on Unix it is the time of
    # the last metadata change. Prefer st_birthtime where it is exposed.
    created_ts = getattr(stat, 'st_birthtime', stat.st_ctime)
    return {
        'created': datetime.fromtimestamp(created_ts),
        'modified': datetime.fromtimestamp(stat.st_mtime),
        'size_kb': stat.st_size / 1024,
        'rel_path': os.path.relpath(file_path, start=directory_to_scan)
    }

def analyze_frontmatter(directory):
    """Scan ``directory`` for ``.md`` files and aggregate their YAML frontmatter.

    Every ``.md`` file found by ``os.walk`` is counted in ``total_files``. A
    file contributes date, size and property data only when it begins with
    ``---`` and a later line starting with ``---`` closes the header.

    Args:
        directory: Root folder, walked recursively; also the base for the
            ``rel_path`` computed by ``get_file_metadata``.

    Returns:
        dict with the keys:
        ``property_stats``   property -> Counter of stringified values (list
                             values are counted item by item);
        ``property_presence`` Counter, property -> number of files using it;
        ``date_stats``       oldest/newest/timespan/median/total_files/
                             files_per_day, or ``{}`` when no header was found;
        ``size_stats``       total_mb/avg_kb/largest_kb/smallest_kb, or ``{}``;
        ``total_files``      number of ``.md`` files seen;
        ``unparsed_files``   paths whose header was invalid YAML or did not
                             parse to a mapping (they are skipped, not fatal).
    """
    property_stats = defaultdict(Counter)
    property_presence = Counter()
    file_count = 0
    dates = []
    sizes = []
    unparsed_files = []

    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith(".md"):
                continue
            file_path = os.path.join(root, file)
            file_count += 1

            # 'utf-8-sig' drops a leading BOM so startswith("---") still
            # matches; errors='replace' keeps one badly encoded note from
            # aborting the whole scan.
            with open(file_path, 'r', encoding='utf-8-sig', errors='replace') as f:
                content = f.read()

            if not content.startswith("---"):
                continue
            # The closing delimiter has to start a line; a bare "---" inside
            # a value (e.g. "title: a---b") must not end the header early.
            end_of_header = content.find("\n---", 3)
            if end_of_header == -1:
                continue
            header = content[3:end_of_header].strip()

            metadata = get_file_metadata(file_path, directory)
            dates.append(metadata['created'])
            sizes.append(metadata['size_kb'])

            try:
                data = yaml.safe_load(header) or {}
            except yaml.YAMLError:
                # Best effort: remember the file and keep scanning.
                unparsed_files.append(file_path)
                continue
            if not isinstance(data, dict):
                # e.g. a note that opens with a horizontal rule parses to a
                # plain string, which has no .items().
                unparsed_files.append(file_path)
                continue

            for prop, value in data.items():
                property_presence[prop] += 1
                if isinstance(value, list):
                    for item in value:
                        property_stats[prop][str(item)] += 1
                else:
                    property_stats[prop][str(value)] += 1

    # Calculate date statistics
    date_stats = {}
    if dates:
        sorted_dates = sorted(dates)
        oldest, newest = sorted_dates[0], sorted_dates[-1]
        timespan = newest - oldest
        # timespan.days is 0 when everything was created within 24 hours;
        # report 0 instead of dividing by zero.
        span_days = timespan.days
        date_stats = {
            'oldest': oldest,
            'newest': newest,
            'timespan': timespan,
            'median': sorted_dates[len(sorted_dates)//2],
            'total_files': file_count,
            'files_per_day': file_count / span_days if span_days > 0 else 0
        }

    # Calculate size statistics
    size_stats = {}
    if sizes:
        size_stats = {
            'total_mb': sum(sizes) / 1024,
            'avg_kb': sum(sizes) / len(sizes),
            'largest_kb': max(sizes),
            'smallest_kb': min(sizes)
        }

    return {
        'property_stats': dict(property_stats),
        'property_presence': property_presence,
        'date_stats': date_stats,
        'size_stats': size_stats,
        'total_files': file_count,
        'unparsed_files': unparsed_files
    }

def listar_notas_markdown_organizadas(pasta_raiz, arquivos_ignorados=("_index_notas.md", "index_notas.md")):
    """List the Markdown notes below ``pasta_raiz`` grouped by sub-folder.

    Notes that sit directly in ``pasta_raiz`` are skipped on purpose; only
    sub-folders are indexed.

    Args:
        pasta_raiz: Root folder, walked recursively.
        arquivos_ignorados: File names left out of the listing. The default
            covers both spellings of the generated index note: the original
            code skipped only ``_index_notas.md`` although the index is saved
            as ``index_notas.md``, so the index counted itself on re-runs.

    Returns:
        Tuple of
        ``notas_por_pasta`` (defaultdict: relative folder -> list of
        ``(note name without extension, word count)``),
        ``contagem_palavras_por_pasta`` (defaultdict: relative folder -> words),
        total word count, total note count, and a list with the modification
        ``datetime`` of every listed note.
    """
    notas_por_pasta = defaultdict(list)
    contagem_palavras_por_pasta = defaultdict(int)
    total_palavras_geral = 0
    total_notas_geral = 0
    datas_modificacao = []
    ignorados = set(arquivos_ignorados)

    for raiz, _, arquivos in os.walk(pasta_raiz):
        caminho_relativo = os.path.relpath(raiz, pasta_raiz)
        if caminho_relativo == '.':
            continue  # Skip the root folder itself

        for arquivo in arquivos:
            if not arquivo.endswith(".md") or arquivo in ignorados:
                continue
            caminho_completo = os.path.join(raiz, arquivo)
            nome_nota = os.path.splitext(arquivo)[0]

            # errors='replace' keeps one badly encoded note from aborting the
            # whole listing; replacement characters do not add words.
            with open(caminho_completo, 'r', encoding='utf-8', errors='replace') as f:
                palavras = contar_palavras(f.read())

            total_palavras_geral += palavras
            total_notas_geral += 1
            contagem_palavras_por_pasta[caminho_relativo] += palavras

            mod_time = os.path.getmtime(caminho_completo)
            datas_modificacao.append(datetime.fromtimestamp(mod_time))

            notas_por_pasta[caminho_relativo].append((nome_nota, palavras))

    return notas_por_pasta, contagem_palavras_por_pasta, total_palavras_geral, total_notas_geral, datas_modificacao

def formatar_numero(num, decimal_places=1):
    """Format ``num`` with '.' as thousands separator and ',' as decimal mark.

    Integers are rendered without decimals; anything else is rendered as a
    fixed-point number with ``decimal_places`` digits after the comma.
    """
    if not isinstance(num, int):
        texto = f"{num:,.{decimal_places}f}"
        # Swap the two separators in a single pass.
        return texto.translate(str.maketrans(",.", ".,"))
    return f"{num:,}".replace(",", ".")

def salvar_em_markdown(notas_organizadas, contagem_por_pasta, total_geral, total_notas, datas_modificacao, frontmatter_data, caminho_saida):
    """Write the index note: global totals, property table and per-folder lists.

    Args:
        notas_organizadas: Mapping folder -> list of ``(note name, word count)``.
        contagem_por_pasta: Mapping folder -> total words in that folder.
        total_geral: Total number of words over all notes.
        total_notas: Total number of notes.
        datas_modificacao: Accepted for interface compatibility; not read here.
        frontmatter_data: Mapping read through ``property_presence`` (needs
            ``most_common``) and ``total_files``.
        caminho_saida: File to create or overwrite (UTF-8).
    """
    # Heading icon per nesting depth; depths beyond 6 are clamped to 6 below.
    icones_pasta = {
        1: "📁",
        2: "📂",
        3: "📘",
        4: "📙",
        5: "📗",
        6: "📄",
    }

    with open(caminho_saida, 'w', encoding='utf-8') as f:
        escrever = f.write

        # Main header
        agora = datetime.now().strftime('%Y/%m/%d %H:%M')
        escrever(f"*Atualizado em {agora}*\n\n")

        # Statistical summary
        # f.write("# Informações Gerais\n\n")
        media_palavras = total_geral / total_notas if total_notas > 0 else 0
        escrever(f"> **Total de Palavras**: {formatar_numero(total_geral)} palavras\n")
        escrever(f"> **Total de Notas**: {formatar_numero(total_notas)} notas\n")
        escrever(f"> **Média de palavras por nota**: {formatar_numero(media_palavras)} palavras\n")
        escrever(f"> **Pastas organizadas**: {formatar_numero(len(notas_organizadas))}\n")

        if contagem_por_pasta:
            # Folder with the highest word total (first one wins on ties).
            mais_densa = max(contagem_por_pasta, key=contagem_por_pasta.get)
            escrever(f"> **Pasta mais densa**: `{mais_densa}` ({formatar_numero(contagem_por_pasta.get(mais_densa))} palavras)\n")

        if notas_organizadas:
            # Folder holding the most notes (first one wins on ties).
            mais_cheia = max(notas_organizadas, key=lambda pasta: len(notas_organizadas[pasta]))
            qtd_mais_cheia = len(notas_organizadas[mais_cheia])
            escrever(f"> **Pasta com mais notas**: `{mais_cheia}` ({formatar_numero(qtd_mais_cheia)} notas)\n")

        # Frontmatter statistics section
        escrever("\n---\n\n# 🟦 Resumo propriedades \n\n")

        # if frontmatter_data['date_stats']:
        #     stats = frontmatter_data['date_stats']
        #     f.write(f"> **Date range:** {stats['oldest'].strftime('%Y-%m-%d')} to {stats['newest'].strftime('%Y-%m-%d')}\n")
        #     f.write(f"> **Timespan:** {stats['timespan'].days} days\n")
        #     f.write(f"> **Average files per day:** {stats['files_per_day']:.2f}\n\n")

        # Property prevalence: the 15 most widely used properties
        # f.write("\n# Resumo propriedades\n")
        escrever("| Property | Files | Coverage |\n")
        escrever("|----------|-------|----------|\n")
        for propriedade, qtd_arquivos in frontmatter_data['property_presence'].most_common(15):
            cobertura = (qtd_arquivos / frontmatter_data['total_files']) * 100
            escrever(f"| `{propriedade}` | {qtd_arquivos} | {cobertura:.1f}% |\n")

        # Property Value Analysis
        # f.write("\n## Resumos por Propriedade\n")
        # for prop, counter in frontmatter_data['property_stats'].items():
        #     total = sum(counter.values())
        #     unique = len(counter)
        #     presence = frontmatter_data['property_presence'][prop]
        #     percentage = presence / frontmatter_data['total_files']

        #     f.write(f"\n#### `{prop}`\n")
        #     f.write(f"- 📁 Present in: {presence} files ({percentage:.1%})\n")
        #     f.write(f"- 🧮 Unique values: {unique}\n")

        #     if unique <= 15:
        #         f.write("  - 🔢 Values:\n")
        #         for value, count in counter.most_common():
        #             f.write(f"    - `{value}`: {count} ({count/total:.1%})\n")
        #     else:
        #         f.write("  - 🔝 Top values:\n")
        #         top_values = counter.most_common(5)
        #         for value, count in top_values:
        #             f.write(f"    - `{value}`: {count} ({count/total:.1%})\n")
        #         others = total - sum(count for _, count in top_values)
        #         f.write(f"    - `Others` ({unique - 5} values): {others} ({others/total:.1%})\n")

        escrever("\n---\n\n")
        escrever("# 🗃️ **Pastas**\n\n")

        # Hierarchical listing of the notes
        pastas_escritas = set()
        for caminho_completo, notas in sorted(notas_organizadas.items()):
            partes = caminho_completo.split(os.sep)

            # Emit one heading per ancestor folder not written so far.
            for profundidade in range(1, len(partes) + 1):
                chave = os.sep.join(partes[:profundidade])
                if chave in pastas_escritas:
                    continue
                nivel = min(profundidade, 6)
                icone = icones_pasta.get(nivel, "📦")
                escrever(f"{'#' * nivel} {icone} {partes[profundidade - 1]}\n\n")
                pastas_escritas.add(chave)

            # Longest notes first.
            # NOTE(review): the name is emitted in single brackets, whereas the
            # statistics report in this notebook uses [[...]] wikilink syntax
            # — confirm that plain text (not a link) is intended here.
            por_tamanho = sorted(notas, key=lambda nota: nota[1], reverse=True)
            for nome_nota, palavras in por_tamanho:
                escrever(f"- 📄 [{nome_nota}] — {formatar_numero(palavras)} palavras\n")

            total_pasta = contagem_por_pasta[caminho_completo]
            qtd_notas = len(notas)
            media_pasta = total_pasta / qtd_notas if qtd_notas > 0 else 0
            escrever("\n**📊 Estatísticas da pasta**:\n")
            escrever(f"- Total: {formatar_numero(total_pasta)} palavras\n")
            escrever(f"- Média por nota: {formatar_numero(media_pasta)} palavras\n")
            escrever(f"- Número de notas: {formatar_numero(qtd_notas)}\n\n")

        # Grand totals
        media_geral = total_geral/total_notas if total_notas > 0 else 0
        escrever("---\n")
        escrever(f"## 📊 Total geral: {formatar_numero(total_geral)} palavras em {formatar_numero(total_notas)} notas\n")
        escrever(f"## 📈 Média geral: {formatar_numero(media_geral)} palavras por nota\n")

# Paths
# The index is written to <vault>\System\index_notas.md. The embedded
# backslash makes the relative part Windows-specific.
# NOTE(review): generate_statistical_report links to `_index_notas` (leading
# underscore) whereas this writes `index_notas.md` — confirm which name is
# intended.
caminho_da_pasta = r"C:\Users\nonak\Documents\Thoughts"
caminho_arquivo_saida = os.path.join(caminho_da_pasta, r"System\index_notas.md")

# Run: list the notes per folder, collect frontmatter statistics, then write
# the index note.
notas_organizadas, contagem_por_pasta, total_palavras, total_notas, datas_modificacao = listar_notas_markdown_organizadas(caminho_da_pasta)
frontmatter_data = analyze_frontmatter(caminho_da_pasta)
salvar_em_markdown(notas_organizadas, contagem_por_pasta, total_palavras, total_notas, datas_modificacao, frontmatter_data, caminho_arquivo_saida)

print(f"Arquivo salvo em: {caminho_arquivo_saida}")

Arquivo salvo em: C:\Users\nonak\Documents\Thoughts\System\index_notas.md
