In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from matplotlib import transforms
import scipy
import scipy.stats as st
from SequenceGenerator import MultiSequenceGenerator

In [None]:
# Scratch check: draw an 11x3 array from the power distribution (exponent 1.75)
# and display its shape. The result is not stored or used by later cells.
np.random.power(1.75, size=(11, 3)).shape

# zipf distribution

###### parameters

In [None]:
# a    -> distribution parameter handed to np.random.zipf in the next cell
# size -> number of values to draw
a, size = 2, 100

In [None]:
# Draw `size` Zipf-distributed values with parameter `a` (uses NumPy's global RNG,
# so the sample differs on every run).
zipf_seq = np.random.zipf(a=a, size=size)

### mean, std and var

In [None]:
# Print the sample standard deviation, mean and variance, in that order.
# NOTE: the Zipf mean is infinite for a <= 2 and the variance is infinite for
# a <= 3, so with a = 2 these sample values do not settle and vary a lot
# between runs.
for label, stat_fn in (('std', np.std), ('mean', np.mean), ('var', np.var)):
    print(f'{label} is: {stat_fn(zipf_seq)}')

In [None]:
# Two histograms of the same sample, normalised to probabilities:
# first without, then with a kernel density estimate overlaid.
for with_kde in (False, True):
    sns.displot(zipf_seq, kde=with_kde, stat='probability')
    plt.show()

# Comparing Zipf's law for different values of the parameter a

In [None]:
# Number of values per sample.
size = 2000
# Zipf parameters to compare against each other.
a_list = [1.3, 1.4, 1.5]

In [None]:
# Build the multi-parameter generator around np.random.zipf, one entry of
# a_list per parameter value, labelled with the column name 'a'.
# NOTE(review): `size` from the previous cell is not passed here -- confirm
# how MultiSequenceGenerator decides the sample length.
zipf_gen = MultiSequenceGenerator(
    generating_function=np.random.zipf,
    parameter_list=a_list,
    column_name='a',
)

In [None]:
# Last expression of the cell, so the notebook renders whatever
# get_dataframe() returns (presumably a pandas DataFrame of the generated
# samples -- confirm against SequenceGenerator).
zipf_gen.get_dataframe()

### mean, std and var

In [None]:
# Show the generator's summary statistics as the cell output
# (presumably mean/std/var per parameter value, per the heading -- confirm).
zipf_gen.get_stats()

### plotting parameter effect

In [None]:
# Three views of how the parameter changes the distribution, drawn in this order.
# The plot contents are defined in SequenceGenerator; descriptions below are
# inferred from the method names -- TODO confirm.
zipf_gen.displot_kde()  # presumably histogram(s) with a KDE overlay
zipf_gen.kde_plot()  # presumably KDE curves only
zipf_gen.column_by_column_distplot()  # presumably one distribution plot per column

### plotting parameter effect on stats

In [None]:
# Plot the summary statistics against the parameter values
# (exact figures are defined in SequenceGenerator -- TODO confirm).
zipf_gen.draw_stat_plots()

# PDF

###### parameter

In [None]:
# Sample length for the PDF/CDF cells below.
size = 20_000
# Zipf parameter, given as a float; the cells below evaluate k**-a on the
# support values.
a = 4.0

In [None]:
# Empirical vs. theoretical Zipf probabilities on linear axes.
s = np.random.zipf(a, size)  # `size` draws; values are integers >= 1

count = np.bincount(s)  # count[v] = number of draws equal to v
k = np.arange(1, s.max() + 1)  # observed support 1..max(s)

# Relative frequency of each value; bin 0 is skipped so the array lines up with k.
sample_prob = count[1:] / size

# Theoretical probability p(k) = k**-a / zeta(a), computed directly instead of
# scaling to expected counts and dividing by `size` again in a Python loop.
# The exponent is forced to float because NumPy raises ValueError for an
# integer array raised to a negative integer power (e.g. a = 2).
expected_prob = k ** (-float(a)) / scipy.special.zeta(a)

# Both series are probabilities, so the legend says so rather than "count".
plt.bar(k, sample_prob, alpha=0.5, label='sample frequency')
plt.plot(k, expected_prob, 'k.-', alpha=0.5, label='expected probability')
plt.grid(alpha=0.4)
plt.legend()
plt.title(f'Zipf sample, a={a}, size={size}')
plt.xlabel('x')
plt.ylabel('probability density function')
plt.show()

### semilog plot (y scale is log)

In [None]:
# Empirical vs. theoretical Zipf probabilities with a logarithmic y axis.
s = np.random.zipf(a, size)  # `size` draws; values are integers >= 1

count = np.bincount(s)  # count[v] = number of draws equal to v
k = np.arange(1, s.max() + 1)  # observed support 1..max(s)

# Relative frequency of each value; bin 0 is skipped so the array lines up with k.
sample_prob = count[1:] / size

# Theoretical probability p(k) = k**-a / zeta(a), computed directly instead of
# scaling to expected counts and dividing by `size` again in a Python loop.
# The exponent is forced to float because NumPy raises ValueError for an
# integer array raised to a negative integer power (e.g. a = 2).
expected_prob = k ** (-float(a)) / scipy.special.zeta(a)

# Both series are probabilities, so the legend says so rather than "count".
plt.bar(k, sample_prob, alpha=0.5, label='sample frequency')
plt.plot(k, expected_prob, 'k.-', alpha=0.5, label='expected probability')
plt.grid(alpha=0.4)
plt.semilogy()  # log-scale the y axis only
plt.legend()
plt.title(f'Zipf sample, a={a}, size={size}')
plt.xlabel('x')
plt.ylabel('probability density function')
plt.show()

### log-log plot

In [None]:
# Empirical vs. theoretical Zipf probabilities on log-log axes.
s = np.random.zipf(a, size)  # `size` draws; values are integers >= 1

count = np.bincount(s)  # count[v] = number of draws equal to v
k = np.arange(1, s.max() + 1)  # observed support 1..max(s)

# Relative frequency of each value; bin 0 is skipped so the array lines up with k.
sample_prob = count[1:] / size

# Theoretical probability p(k) = k**-a / zeta(a), computed directly instead of
# scaling to expected counts and dividing by `size` again in a Python loop.
# The exponent is forced to float because NumPy raises ValueError for an
# integer array raised to a negative integer power (e.g. a = 2).
expected_prob = k ** (-float(a)) / scipy.special.zeta(a)

# Both series are probabilities, so the legend says so rather than "count".
plt.bar(k, sample_prob, alpha=0.5, label='sample frequency')
plt.plot(k, expected_prob, 'k.-', alpha=0.5, label='expected probability')
plt.grid(alpha=0.4)
# Log-scale both axes.
plt.xscale('log')
plt.yscale('log')
plt.legend()
plt.title(f'Zipf sample, a={a}, size={size}')
plt.xlabel('x')
plt.ylabel('probability density function')
plt.show()

# CDF

In [None]:
# Theoretical Zipf cumulative distribution on linear axes.
# The sample is drawn only to fix the x range (1..max(s)); the curve itself is
# the running sum of the theoretical probabilities, not an empirical CDF.
s = np.random.zipf(a, size)
k = np.arange(1, s.max() + 1)

# Theoretical probability p(k) = k**-a / zeta(a), computed directly instead of
# scaling by `size` and dividing it out again. The exponent is forced to float
# because NumPy raises ValueError for an integer array raised to a negative
# integer power (e.g. a = 2). The unused bincount of the sample was dropped.
expected_prob = k ** (-float(a)) / scipy.special.zeta(a)
cdf_prob = np.cumsum(expected_prob)

plt.plot(k, cdf_prob, 'k.-', alpha=0.5, label='cumulative')
plt.grid(alpha=0.4)
plt.legend()
plt.title(f'Zipf sample, a={a}, size={size}')
plt.xlabel('x')
plt.ylabel('cumulative distribution function')
plt.show()

### semilog plot (y scale is log)

In [None]:
# Theoretical Zipf cumulative distribution with a logarithmic y axis.
# The sample is drawn only to fix the x range (1..max(s)); the curve itself is
# the running sum of the theoretical probabilities, not an empirical CDF.
s = np.random.zipf(a, size)
k = np.arange(1, s.max() + 1)

# Theoretical probability p(k) = k**-a / zeta(a), computed directly instead of
# scaling by `size` and dividing it out again. The exponent is forced to float
# because NumPy raises ValueError for an integer array raised to a negative
# integer power (e.g. a = 2). The unused bincount of the sample was dropped.
expected_prob = k ** (-float(a)) / scipy.special.zeta(a)
cdf_prob = np.cumsum(expected_prob)

plt.plot(k, cdf_prob, 'k.-', alpha=0.5, label='cumulative')
plt.grid(alpha=0.4)
plt.legend()
plt.semilogy()  # log-scale the y axis only
plt.title(f'Zipf sample, a={a}, size={size}')
plt.xlabel('x')
plt.ylabel('cumulative distribution function')
plt.show()

### log-log plot

In [None]:
# Theoretical Zipf cumulative distribution on log-log axes.
# The sample is drawn only to fix the x range (1..max(s)); the curve itself is
# the running sum of the theoretical probabilities, not an empirical CDF.
s = np.random.zipf(a, size)
k = np.arange(1, s.max() + 1)

# Theoretical probability p(k) = k**-a / zeta(a), computed directly instead of
# scaling by `size` and dividing it out again. The exponent is forced to float
# because NumPy raises ValueError for an integer array raised to a negative
# integer power (e.g. a = 2). The unused bincount of the sample was dropped.
expected_prob = k ** (-float(a)) / scipy.special.zeta(a)
cdf_prob = np.cumsum(expected_prob)

plt.plot(k, cdf_prob, 'k.-', alpha=0.5, label='cumulative')
plt.grid(alpha=0.4)
# Log-scale both axes.
plt.xscale('log')
plt.yscale('log')
plt.legend()
plt.title(f'Zipf sample, a={a}, size={size}')
plt.xlabel('x')
plt.ylabel('cumulative distribution function')
plt.show()