In [95]:
from scipy import stats
import numpy as np

## Summary

* read sample or population data from a feature column of X (later being used in ML)
* build the probability distribution and the RV
* make it persistent (pickle)
* use the persisted RV to test or experiment with different ML algorithms, preprocessing, feature reduction ...

* below solutions need QA

## Discrete Probability Distribution

### Scipy - rv_discrete

In [116]:
def rv_discrete_dist(sample):
    counts, indices = np.histogram(sample, bins=range(sample.min(), sample.max()+1))
    counts = counts.astype(np.float64)
    weights = counts / np.sum(counts)

    non_zero_prob = weights > 0.0
    return (indices[non_zero_prob], weights[non_zero_prob]) 

In [117]:
sample = np.random.randint(0, 48, 1000)

distrib = stats.rv_discrete(values=rv_discrete_dist(sample))

In [118]:
distrib.rvs(size=10)

array([ 2, 22, 34, 31, 36, 29, 13,  1,  5, 31])

### Numpy - Random Choice

In [107]:
def rdf_discrete_from_hist(sample):
    values, indices = np.histogram(sample, bins=range(len(sample)))
    values = values.astype(np.float64)
    weights = values / np.sum(values)

    def rvs(sample_size):
        return np.random.choice(indices[1:], sample_size, p=weights)
    
    return rvs

In [109]:
data = np.random.randint(0, 48, 1000)

rv = rdf_discrete_from_hist(data)
rv(10)

array([ 1, 29, 32, 33, 25, 19, 29, 37, 47, 15])

#### fixed bins

In [101]:
def rdf_bins_from_hist(sample):
    values, indices = np.histogram(sample, bins=20)
    values = values.astype(np.float64)
    weights = values / np.sum(values)

    def rvs(sample_size):
        return np.random.choice(indices[1:], sample_size, p=weights)
    
    return rvs

In [102]:
data = np.random.rand(1000)

rv = rdf_bins_from_hist(data)
print rv(10)

[ 0.50095806  0.75028735  0.30149463  0.15189705  0.99961664  0.25162877
  0.1020312   0.94975078  0.40122634  0.85001907]
