In [161]:
import pandas as pd
import os
import numpy as np
DATA_DIR = 'data/'
pd.set_option('display.max_colwidth', None) #Print full text
pd.set_option('display.max_rows', 1000) #Print full text

In [None]:
df = pd.read_csv(DATA_DIR+'census_reduced.csv')

In [43]:
vigne_term = ['vin','vigne','tonnelier','encaveur','viti']
vine_related_data = df[df['chef_vocation'].str.contains('|'.join(vigne_term))].copy()

In [44]:
clean_data = {
   "md de vin" : "marchand de vin",
    "vendeur de vin" : "marchand de vin",
    "m de vin" : "marchand de vin",
    "m de vins" : "marchand de vin",
    "md de vins" : "marchand de vin",
    "marchand de vine": "marchand de vin",
    "marchand de vins": "marchand de vin",
    "marchand vin": "marchand de vin",
    "marchand|de vin": "marchand de vin",
    "me de vins": "marchand de vin",
    "md de vine": "marchand de vin",
    "marchand|de vins" : "marchand de vin",
    "vend de vin" : "marchand de vin",
    
    "vigneron|vigneron" : "vigneron",
    "m tonnelier" : "tonnelier"
}

clean_data_fun = lambda voc: clean_data.get(voc, voc)

In [137]:
vine_related_data["chef_vocation"] = vine_related_data.chef_vocation.apply(clean_data_fun)

In [138]:
most_common_vine_related_vocation = set(vine_related_data.groupby("chef_vocation").size().sort_values(ascending=False).head(5).index)

In [139]:
most_common_vine_related_vocation

{'marchand de vin', 'tonnelier', 'vigneron', 'vigneronne', 'vintier'}

In [140]:
data = df[df['chef_vocation'].isin(most_common_vine_related_vocation)].groupby(['chef_vocation'])['annee'].apply(list)

In [141]:
data.index

Index(['marchand de vin', 'tonnelier', 'vigneron', 'vigneronne', 'vintier'], dtype='object', name='chef_vocation')

In [142]:
from sklearn.feature_extraction.text import CountVectorizer


In [143]:
years = [str(y) for y in sorted(df["annee"].unique())]


In [153]:
def dummy(doc):
    return doc
vectorizer = CountVectorizer(tokenizer=dummy,preprocessor=dummy, vocabulary=sorted(df["annee"].unique())) 

In [162]:
X = np.array(vectorizer.fit_transform(data).todense())

In [163]:
X

array([[  3,   2,   0,   0,   0,   1,   1,   1,   2,   1,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          1,   0,   0,   1,   1,   0,   0,   0,   1,   0,   0,   0,   0,
          1,   0,   0,   0,   0,   0,   1,   0,   0,   1,   0,   0,   0,
          0,   0,   0,   0,   1,   0,   1,   0,   0,   0,   0,   0,   0,
          0],
       [  8,  14,   9,  11,  13,  11,  13,  12,  10,  13,  11,   9,  11,
          2,  15,   8,   0,   0,   0,   6,   0,   0,   0,   0,   0,   0,
          9,   9,  12,  11,  15,  10,  12,  15,  19,  15,  16,  11,  13,
         15,  10,  13,   1,  14,  11,  13,   6,   3,   4,  16,  14,  15,
         17,  15,  14,  12,  16,  15,  17,  18,  13,   9,  17,  16,  12,
         14],
       [163, 109,  92, 121, 143, 115, 116, 114, 110, 108,  89, 101,  83,
         41,  79,  86,   0,   0,  11,  25,   0,   0,   3,   0,   0,   1,
         71,  77,  80,  73,  78,  78,  70,  62,  72,  72,  61,  61,  68,
         57,  53,  50, 

In [164]:
vocations = list(data.index)
source = {'années' : years}
for i, voc in enumerate(vocations):
    source[voc] = X[i]

In [165]:
source

{'années': ['1810',
  '1832',
  '1835',
  '1836',
  '1837',
  '1838',
  '1839',
  '1840',
  '1841',
  '1842',
  '1843',
  '1844',
  '1845',
  '1846',
  '1847',
  '1848',
  '1849',
  '1850',
  '1851',
  '1852',
  '1853',
  '1854',
  '1855',
  '1856',
  '1857',
  '1858',
  '1859',
  '1860',
  '1861',
  '1862',
  '1863',
  '1864',
  '1865',
  '1866',
  '1867',
  '1868',
  '1869',
  '1870',
  '1871',
  '1872',
  '1873',
  '1874',
  '1875',
  '1876',
  '1877',
  '1878',
  '1879',
  '1880',
  '1881',
  '1882',
  '1883',
  '1884',
  '1885',
  '1886',
  '1887',
  '1888',
  '1889',
  '1890',
  '1891',
  '1892',
  '1893',
  '1894',
  '1895',
  '1896',
  '1897',
  '1898'],
 'marchand de vin': array([3, 2, 0, 0, 0, 1, 1, 1, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0]),
 'tonnelier': array([ 8, 14,  9, 11, 13, 11, 13, 12, 10, 13, 11,  9, 11,  2, 15,  8,  

In [168]:
from bokeh.io import output_file, show
from bokeh.plotting import figure

output_file("stacked.html")

colors = ["#c9d9d3", "#718dbf", "#e84d60", "#c9ddd3", "#e8ff60"]

p = figure(x_range=years, height=500, title="Vocation par année",
           toolbar_location=None, tools="")

p.vbar_stack(vocations, x='années', width=0.9, color=colors, source=source,
             legend_label=vocations)

p.y_range.start = 0
p.x_range.range_padding = 0.1
p.xgrid.grid_line_color = None
p.axis.minor_tick_line_color = None
p.outline_line_color = None
p.legend.location = "top_left"
p.legend.orientation = "horizontal"

show(p)

In [169]:
from bokeh.io import output_file, show
from bokeh.plotting import figure

output_file("stacked.html")

colors = ["#c9d9d3", "#718dbf", "#e84d60", "#c9ddd3", "#e8ff60"]

p = figure(x_axis_type = "datetime", title = "Glucose Range", plot_height = 350, plot_width = 800)


for i, voc in enumerate(vocations):
    p.line(source[voc], X[i])



p.y_range.start = 0
p.x_range.range_padding = 0.1
p.xgrid.grid_line_color = None
p.axis.minor_tick_line_color = None
p.outline_line_color = None
p.legend.location = "top_left"
p.legend.orientation = "horizontal"

show(p)

You are attempting to set `plot.legend.location` on a plot that has zero legends added, this will have no effect.

Before legend properties can be set, you must add a Legend explicitly, or call a glyph method with a legend parameter set.

You are attempting to set `plot.legend.orientation` on a plot that has zero legends added, this will have no effect.

Before legend properties can be set, you must add a Legend explicitly, or call a glyph method with a legend parameter set.

