In [1]:
import os
import csv
from matplotlib import pyplot as plt

In [2]:
# IPython shell escape: recursively force-remove everything matched by the
# glob ../pre/zipf/*. The folder itself is not targeted by the glob; the next
# cell calls os.makedirs(..., exist_ok=True) on it in any case.
!rm -rf ../pre/zipf/*

In [3]:
# Folder that receives the figures saved further down in this notebook.
directory = "../pre/zipf"

# Create the folder together with any missing parents; an already existing
# folder is accepted silently.
os.makedirs(name=directory, exist_ok=True)

In [4]:
# Location of the counts table. Each data row is read as [label, count];
# the label is presumably a word -- confirm against the producer of the file.
filepath = "../pre/global/counts.csv"

# newline="" is what the csv module asks for, so that quoted fields
# containing line breaks are parsed correctly on every platform.
# NOTE(review): the encoding is left at the platform default, as before;
# consider pinning it once the encoding of the file is confirmed.
with open(filepath, newline="") as f:
    reader = csv.reader(f)

    # Skip the header line; the default keeps an empty file from raising
    # StopIteration.
    next(reader, None)

    # First column is kept as text, second column is parsed as an integer.
    # csv.reader yields [] for blank lines, so those are filtered out
    # instead of failing on row[0].
    data = [[row[0], int(row[1])] for row in reader if row]

In [None]:
# Parameters of the reference curve 1 / (x + beta) ** alpha drawn below.
alpha = 1.4
beta = 11.3

# Sum of all counts; also read by the linear-scale plot in the next cell.
word_count = sum(x[1] for x in data)

# Relative frequency of every entry, in the order the rows were loaded
# (presumably sorted by descending count -- confirm), on log-log axes.
plt.loglog([x[1] / word_count for x in data])
# Reference curve evaluated at the same 0-based positions.
plt.plot(range(len(data)), [1 / pow(x + beta, alpha) for x in range(len(data))])
# Save BEFORE show(): with the inline backend show() closes the figure, so a
# later savefig() would write an empty image.
plt.savefig(f"{directory}/zipf_log.png")
plt.show()

In [None]:
# Same relative frequencies as the previous cell, this time on linear axes.
# Relies on word_count computed in that cell.
plt.plot([entry[1] / word_count for entry in data])
plt.show()

In [7]:
def luhn_cut(data, lower, upper):
    """Return the slice of ``data`` lying between two fractional positions.

    Args:
        data: Sequence to cut.
        lower: Fraction of ``len(data)`` giving the start index (inclusive).
        upper: Fraction of ``len(data)`` giving the stop index (exclusive).

    Returns:
        ``data[start:stop]`` where both indices are the fractional positions
        truncated with ``int``.
    """
    size = len(data)
    start = int(size * lower)
    stop = int(size * upper)

    return data[start:stop]

def luhn_cut_explore(data, lower, upper, save=True):
    """Cut ``data`` with ``luhn_cut``, print coverage figures and plot the cut.

    Prints the share of entries kept by the cut and the share of the summed
    counts they account for, then shows the relative frequencies inside the
    cut on log-log axes and on linear axes.

    Args:
        data: Sequence of rows whose element ``[1]`` is a numeric count.
        lower: Lower fractional position passed to ``luhn_cut``.
        upper: Upper fractional position passed to ``luhn_cut``.
        save: When true, each figure is also written as a PNG into the
            module-level ``directory``.

    Returns:
        None.
    """
    data_cut = luhn_cut(data, lower, upper)

    word_sum = sum(row[1] for row in data_cut)
    total_sum = sum(row[1] for row in data)

    # Guard both ratios so that empty data or an empty cut reports 0%
    # instead of raising ZeroDivisionError.
    frequency = word_sum / total_sum if total_sum else 0.0
    word_percentage = len(data_cut) / len(data) if len(data) else 0.0

    print(f"Word percentage: {(word_percentage * 100):.2f}%")
    print(f"Frequency: {(frequency * 100):.2f}%")

    if not word_sum:
        # Nothing can be normalised by a zero sum, so there is no curve to draw.
        print("Nothing to plot: the cut contains no counts.")
        return

    relative = [row[1] / word_sum for row in data_cut]

    # Each plot gets its own figure and is saved BEFORE show(): with the
    # inline backend show() closes the figure, so a later savefig() would
    # write an empty image; with a non-interactive backend the second curve
    # would otherwise be drawn on top of the first one.
    fig_log, ax_log = plt.subplots()
    ax_log.loglog(relative)
    if save:
        fig_log.savefig(f"{directory}/zipf_log_{lower}_{upper}.png")
    plt.show()
    plt.close(fig_log)

    fig_lin, ax_lin = plt.subplots()
    ax_lin.plot(relative)
    if save:
        fig_lin.savefig(f"{directory}/zipf_{lower}_{upper}.png")
    plt.show()
    plt.close(fig_lin)


## Balanced cuts

### 1%

In [None]:
# Drop the first 1% and the last 1% of the entries.
luhn_cut_explore(data, lower=0.01, upper=0.99)

### 5%

In [None]:
# Drop the first 5% and the last 5% of the entries.
luhn_cut_explore(data, lower=0.05, upper=0.95)

### 10%

In [None]:
# Drop the first 10% and the last 10% of the entries.
luhn_cut_explore(data, lower=0.1, upper=0.9)

### 15%

In [None]:
# Drop the first 15% and the last 15% of the entries.
luhn_cut_explore(data, lower=0.15, upper=0.85)

### 20%

In [None]:
# Drop the first 20% and the last 20% of the entries.
luhn_cut_explore(data, lower=0.2, upper=0.8)

### 25%

In [None]:
# Drop the first 25% and the last 25% of the entries.
luhn_cut_explore(data, lower=0.25, upper=0.75)

## Unbalanced cuts

### 1%-0%

In [None]:
# Drop only the first 1% of the entries; the end of the list is kept whole.
luhn_cut_explore(data, lower=0.01, upper=1.0)

### 0%-1%

In [None]:
# Drop only the last 1% of the entries; the start of the list is kept whole.
luhn_cut_explore(data, lower=0.0, upper=0.99)

### 1%-5%

In [None]:
# Drop the first 1% and the last 5% of the entries.
luhn_cut_explore(data, lower=0.01, upper=0.95)

### 5%-1%

In [None]:
# Drop the first 5% and the last 1% of the entries.
luhn_cut_explore(data, lower=0.05, upper=0.99)

### 5%-10%

In [None]:
# Drop the first 5% and the last 10% of the entries.
luhn_cut_explore(data, lower=0.05, upper=0.9)

### 10%-5%

In [None]:
# Drop the first 10% and the last 5% of the entries.
luhn_cut_explore(data, lower=0.1, upper=0.95)

### 10%-20%

In [None]:
# Drop the first 10% and the last 20% of the entries.
luhn_cut_explore(data, lower=0.1, upper=0.8)

## Extreme cuts

### 1%-30%

In [None]:
# Drop the first 1% and the last 30% of the entries.
luhn_cut_explore(data, lower=0.01, upper=0.7)

### 5%-30%

In [None]:
# Drop the first 5% and the last 30% of the entries.
luhn_cut_explore(data, lower=0.05, upper=0.7)

### 10%-30%

In [None]:
# Drop the first 10% and the last 30% of the entries.
luhn_cut_explore(data, lower=0.1, upper=0.7)

### 1%-40%

In [None]:
# Drop the first 1% and the last 40% of the entries.
luhn_cut_explore(data, lower=0.01, upper=0.6)

### 5%-40%

In [None]:
# Drop the first 5% and the last 40% of the entries.
luhn_cut_explore(data, lower=0.05, upper=0.6)

### 10%-40%

In [None]:
# Drop the first 10% and the last 40% of the entries.
luhn_cut_explore(data, lower=0.1, upper=0.6)

### 1%-50%

In [None]:
# Drop the first 1% and the last 50% of the entries.
luhn_cut_explore(data, lower=0.01, upper=0.5)

### 5%-50%

In [None]:
# Drop the first 5% and the last 50% of the entries.
luhn_cut_explore(data, lower=0.05, upper=0.5)

### 10%-50%

In [None]:
# Drop the first 10% and the last 50% of the entries.
luhn_cut_explore(data, lower=0.1, upper=0.5)