In [1]:
import datetime
import operator
import os

import mysql.connector
import numpy as np
import pandas as pd
from scipy import special
import seaborn as sb

import matplotlib as mlp
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

import plotly as py
import plotly.graph_objs as go
import plotly.express as px

import ipywidgets as widgets

In [2]:
# Apply seaborn's default styling to the plots drawn in this notebook.
sb.set()

In [3]:
# Initialise plotly's offline notebook mode so the figures built below can be shown inline.
py.offline.init_notebook_mode(connected = True)

### Time Series Plot of 4-aminopyridine (Metabolite_id = 1175)


In [4]:
# Connection settings are read from the environment so that credentials do not
# have to live in the notebook; the fallbacks reproduce the original local setup.
cnx = mysql.connector.connect(user=os.environ.get('BIOECO_DB_USER', 'root'),
                              password=os.environ.get('BIOECO_DB_PASSWORD', 'root'),
                              host=os.environ.get('BIOECO_DB_HOST', '127.0.0.1'),
                              database=os.environ.get('BIOECO_DB_NAME', 'bioeco'))

# Monthly max / mean / min price per gram (and number of price rows) of
# metabolite 1175, per provider. Only prices whose unit converts to grams or
# litres are kept by the inner query.
try:
    sql_query = pd.read_sql_query('''select metabolite_id, 
                                        metab_name,
                                        provider_name, 
                                        date, 
                                        max(price_per_g) as maximo, 
                                        avg(price_per_g) as media, 
                                        min(price_per_g) as minimo, 
                                        count(price) as num_price
                                        from (
                                                select bioanalysis_metabolite_price.metabolite_id,
                                                       bioanalysis_metabolite.common_name                    as metab_name,
                                                       bioanalysis_metabolite_price.date, 
                                                       bioanalysis_metabolite_price.price                    as price, 
                                                       bioanalysis_metabolite_price.amount                   as amount, 
                                                       bioanalysis_metabolite_price.unity                    as unity, 
                                                       if(provider.name is null, "None", provider.name)      as provider_name, 
                                                       price/amount                                          as price_per_unity,
                                                       price/amount*convert_to_g(unity)                      as price_per_g,
                                                       price/amount*convert_to_l(unity)                      as price_per_l
                                                from bioanalysis_metabolite_price
                                                JOIN provider 
                                                    on provider.id = bioanalysis_metabolite_price.provider_id
                                                JOIN bioanalysis_metabolite
                                                    on bioanalysis_metabolite_price.metabolite_id = bioanalysis_metabolite.id
                                                where (convert_to_g(unity) is not null 
                                                or convert_to_l(unity) is not null )       #da apenas os preços por grama, litro e derivados
                                        ) as metab_prices
                                        where metabolite_id= 1175
                                        group by metabolite_id, provider_name,  year(date), month(date); ''',
                                  cnx,
                                  index_col = 'date',
                                  # parse_dates=True is silently treated as "no columns" by
                                  # pandas; name the column so the index is guaranteed to be
                                  # datetime (the later to_period('M') call requires that).
                                  parse_dates = ['date'])
finally:
    # Release the database connection even if the query fails.
    cnx.close()


# Frame with provider_name, max, mean, min and num_price, all grouped by
# metabolite, provider, year and month (the 'date' column is the index).
df = pd.DataFrame(sql_query, columns=['metabolite_id', 'metab_name', 'provider_name', 'maximo', 'media', 'minimo', 'num_price' ])

In [5]:
# Show the aggregated price table as loaded from the database.
df

Unnamed: 0_level_0,metabolite_id,metab_name,provider_name,maximo,media,minimo,num_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-10-10 10:27:08,1175,4-aminopyridine,ENAMINE Ltd.,0.055000,0.009826,0.002350,14
2017-10-10 10:27:08,1175,4-aminopyridine,"Vitas-M Laboratory, Ltd.",0.038000,0.011451,0.002570,8
2017-10-10 10:27:08,1175,4-aminopyridine,"Maybridge, Ltd.",0.066880,0.014199,0.002414,14
2017-10-10 10:27:08,1175,4-aminopyridine,HTS Biochemie Innovationen,400.000000,40.007650,0.000560,10
2017-10-10 10:27:08,1175,4-aminopyridine,Tocris Bioscience,0.001000,0.001000,0.001000,1
2017-10-10 10:27:08,1175,4-aminopyridine,"Otava, Ltd.",0.024900,0.008923,0.002920,9
2017-10-10 10:27:08,1175,4-aminopyridine,BIONET/Key Organics Ltd.,0.038850,0.008941,0.000800,14
2017-10-10 10:27:08,1175,4-aminopyridine,"Alinda Chemical, Ltd.",0.015000,0.003689,0.001000,9
2017-10-10 10:27:08,1175,4-aminopyridine,TargetMol,0.002000,0.001753,0.001560,3
2017-10-10 10:27:08,1175,4-aminopyridine,Cayman Europe,4.000000,2.218750,1.225000,4


In [6]:
df = df.to_period('M')  # round the dates down to the month, dropping the day and the exact time

In [7]:
# Show the table again, now indexed by monthly period.
df

Unnamed: 0_level_0,metabolite_id,metab_name,provider_name,maximo,media,minimo,num_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-10,1175,4-aminopyridine,ENAMINE Ltd.,0.055000,0.009826,0.002350,14
2017-10,1175,4-aminopyridine,"Vitas-M Laboratory, Ltd.",0.038000,0.011451,0.002570,8
2017-10,1175,4-aminopyridine,"Maybridge, Ltd.",0.066880,0.014199,0.002414,14
2017-10,1175,4-aminopyridine,HTS Biochemie Innovationen,400.000000,40.007650,0.000560,10
2017-10,1175,4-aminopyridine,Tocris Bioscience,0.001000,0.001000,0.001000,1
2017-10,1175,4-aminopyridine,"Otava, Ltd.",0.024900,0.008923,0.002920,9
2017-10,1175,4-aminopyridine,BIONET/Key Organics Ltd.,0.038850,0.008941,0.000800,14
2017-10,1175,4-aminopyridine,"Alinda Chemical, Ltd.",0.015000,0.003689,0.001000,9
2017-10,1175,4-aminopyridine,TargetMol,0.002000,0.001753,0.001560,3
2017-10,1175,4-aminopyridine,Cayman Europe,4.000000,2.218750,1.225000,4


In [8]:
df.index = df.index.to_timestamp() # convert the periods back to datetime64 so that the plotting code works
df.index

DatetimeIndex(['2017-10-01', '2017-10-01', '2017-10-01', '2017-10-01',
               '2017-10-01', '2017-10-01', '2017-10-01', '2017-10-01',
               '2017-10-01', '2017-10-01',
               ...
               '2019-07-01', '2019-07-01', '2019-07-01', '2019-07-01',
               '2019-07-01', '2019-07-01', '2019-07-01', '2019-07-01',
               '2019-07-01', '2019-07-01'],
              dtype='datetime64[ns]', name='date', length=873, freq=None)

In [9]:
# Group all rows by index value, i.e. by date (now of the form y-m-01 00:00:00).
grouped_df = df.groupby(df.index)

# Iterating a groupby already yields each group's sub-frame, so print it
# directly instead of looking it up a second time with get_group(key).
for key, item in grouped_df:
    print('key:', key)
    print(item, "\n\n")

key: 2017-10-01 00:00:00
            metabolite_id       metab_name  \
date                                         
2017-10-01           1175  4-aminopyridine   
2017-10-01           1175  4-aminopyridine   
2017-10-01           1175  4-aminopyridine   
2017-10-01           1175  4-aminopyridine   
2017-10-01           1175  4-aminopyridine   
2017-10-01           1175  4-aminopyridine   
2017-10-01           1175  4-aminopyridine   
2017-10-01           1175  4-aminopyridine   
2017-10-01           1175  4-aminopyridine   
2017-10-01           1175  4-aminopyridine   
2017-10-01           1175  4-aminopyridine   
2017-10-01           1175  4-aminopyridine   
2017-10-01           1175  4-aminopyridine   
2017-10-01           1175  4-aminopyridine   
2017-10-01           1175  4-aminopyridine   
2017-10-01           1175  4-aminopyridine   
2017-10-01           1175  4-aminopyridine   
2017-10-01           1175  4-aminopyridine   
2017-10-01           1175  4-aminopyridine   
2017-10-0

In [10]:
# Sort by date so the groups below (and the widget options) are in chronological order.
df = df.sort_index()

layout = go.Layout(
    title = 'Time Series Plot of 4-aminopyridine (Price/g)',
    yaxis = dict(
        title = 'price (USD)'
    ),
    xaxis = dict(
        title = 'date'
    )
)

opt_list = []   # dates offered by the selection widget
d_x = {}        # date -> index values (dates) of the rows in that group
d_y = {}        # date -> 'media' (mean price) values of the rows in that group

# groupby yields every key exactly once, so each date is assigned directly.
# The previous "key already present" branch could never run, and would have
# failed if it had: numpy arrays have no .append and Index.append is not in-place.
for k, g in df.groupby(df.index)['media']:
    opt_list.append(k)
    d_x[k] = pd.to_datetime(g.index)
    d_y[k] = g.values
    
    
def update_plot(signals):
    """Draw one box plot per selected date.

    signals: iterable of keys of the module-level d_x / d_y dictionaries
             (the dates chosen in the selection widget).

    Builds a go.Box trace for every key, showing only the outlier points,
    and renders the resulting figure with the module-level `layout`.
    """
    box_colour = 'rgb(8,81,156)'
    outlier_colour = 'rgba(219, 64, 82, 0.6)'

    data = [
        go.Box(
            x = d_x[s],
            y = d_y[s],
            name = 'time {}'.format(s),
            boxpoints = 'outliers',
            marker = dict(
                color = box_colour,
                outliercolor = outlier_colour,
                line = dict(
                    outliercolor = outlier_colour,
                    outlierwidth = 2,
                ),
            ),
            line_color = box_colour,
        )
        for s in signals
    ]

    py.offline.iplot(go.Figure(data = data, layout = layout))

    
# Multi-select widget listing the available dates; widgets.interactive passes
# the current selection to update_plot as its `signals` argument.
signals = widgets.SelectMultiple(options = opt_list, description = 'Time')
#freq = widgets.FloatSlider(min=1, max=20, value=1, description = 'Freq')
widgets.interactive(update_plot, signals = signals)#, freq = freq)

interactive(children=(SelectMultiple(description='Time', options=(Timestamp('2017-10-01 00:00:00'), Timestamp(…

### Removing Outliers - Metab 1175

#### Os outliers mantêm-se, mas com os valores substituídos pela mediana

In [11]:
# #Outlier Treatment  -- os outliers ficam com o valor da mediana

# def outlier_detect(df):
#     for i in df.describe().columns:
#         Q1=df.describe().at['25%',i]
#         Q3=df.describe().at['75%',i]
#         IQR=Q3 - Q1
#         LTV=Q1 - 1.5 * IQR
#         UTV=Q3 + 1.5 * IQR
#         x=np.array(df[i])
#         p=[]
#         for j in x:
#             if j < LTV or j>UTV:
#                 p.append(df[i].median())  #atribuir o valor da mediana nos outliers
#             else:
#                 p.append(j)
#         df[i]=p
#     return df

In [12]:
# df_median = outlier_detect(df)

In [17]:
# df_median

In [14]:
# # Create Boxplot


# df_median = df_median.sort_index()



# layout = go.Layout(
#     title = 'Time Series Plot of 4-aminopyridine (Price/g)',
#     yaxis = dict(
#         title = 'price (USD)'
#     ),
#     xaxis = dict(
#         title = 'date'
#     )
# )



# opt_list = []
# d_x = {}
# d_y = {}
# # dates =[]   #lista com as datas exatas que existem na dataframe
# for k,g in df_median.groupby(df_median.index)['media']:
#     opt_list.append(k)
#     x = pd.to_datetime(g.index)
# #     dates.append(x)
#     y = g.values
#     if k not in d_x and k not in d_y:
#         d_x[k] = x
#         d_y[k] = y
#     elif k in d_x and k in d_y:
#         d_x[k].append(x)
#         d_y[k].append(y)
#     else: print('There is a provider lacking on the x or y dict.')
    
    
# def update_plot(signals):#, freq):
#    data = []
#    for s in signals:
#        trace1 = go.Box(
#            x = d_x[s],
#            y = d_y[s],
#            name = 'time {}'.format(s),
#            boxpoints='outliers',
#            marker=dict(
#                color='rgb(8,81,156)',
#                outliercolor='rgba(219, 64, 82, 0.6)',
#                line=dict(
#                    outliercolor='rgba(219, 64, 82, 0.6)',
#                    outlierwidth=2)),
#            line_color='rgb(8,81,156)'
           
#        )
#        data.append(trace1)
    
#    fig = go.Figure(data = data, layout= layout)
#    py.offline.iplot(fig)

    
# signals = widgets.SelectMultiple(options = opt_list, description = 'Time')
# #freq = widgets.FloatSlider(min=1, max=20, value=1, description = 'Freq')
# widgets.interactive(update_plot, signals = signals)#, freq = freq)

interactive(children=(SelectMultiple(description='Time', options=(Timestamp('2017-10-01 00:00:00'), Timestamp(…

In [15]:
# grouped_df_median = df_median.groupby('provider_name')

# for key, item in grouped_df_median:
#     print('key:', key)
#     print(grouped_df_median.get_group(key), "\n\n")

key: AA BLOCKS
            metabolite_id       metab_name provider_name  maximo     media  \
date                                                                         
2019-06-01           1175  4-aminopyridine     AA BLOCKS    17.0  8.893333   
2019-07-01           1175  4-aminopyridine     AA BLOCKS    17.0  8.893333   

            minimo  num_price  
date                           
2019-06-01    3.08        3.0  
2019-07-01    3.08        3.0   


key: ACC Corporation
            metabolite_id       metab_name    provider_name   maximo   media  \
date                                                                           
2017-10-01           1175  4-aminopyridine  ACC Corporation  22.4158  3.3325   
2018-01-01           1175  4-aminopyridine  ACC Corporation   4.6000  3.3325   
2018-02-01           1175  4-aminopyridine  ACC Corporation   4.6000  3.3325   
2018-03-01           1175  4-aminopyridine  ACC Corporation   4.6000  3.3325   
2018-04-01           1175  4-aminopyridi

2019-07-01  1.973333   0.830        6.0   


key: Bide Pharmatech Ltd.
            metabolite_id       metab_name         provider_name  maximo  \
date                                                                       
2018-01-01           1175  4-aminopyridine  Bide Pharmatech Ltd.     3.0   
2018-02-01           1175  4-aminopyridine  Bide Pharmatech Ltd.     3.0   
2018-03-01           1175  4-aminopyridine  Bide Pharmatech Ltd.     3.0   
2018-04-01           1175  4-aminopyridine  Bide Pharmatech Ltd.     3.0   
2018-05-01           1175  4-aminopyridine  Bide Pharmatech Ltd.     3.0   
2018-06-01           1175  4-aminopyridine  Bide Pharmatech Ltd.     3.0   
2018-07-01           1175  4-aminopyridine  Bide Pharmatech Ltd.     3.0   

            media  minimo  num_price  
date                                  
2018-01-01   2.25     1.5        4.0  
2018-02-01   2.25     1.5        4.0  
2018-03-01   2.25     1.5        4.0  
2018-04-01   2.25     1.5        4.0  
2018-05-01

2019-07-01  0.00003        4.0   


key: Princeton Bio (2 weeks)
            metabolite_id       metab_name            provider_name  maximo  \
date                                                                          
2017-10-01           1175  4-aminopyridine  Princeton Bio (2 weeks)    20.0   
2018-01-01           1175  4-aminopyridine  Princeton Bio (2 weeks)    20.0   
2018-02-01           1175  4-aminopyridine  Princeton Bio (2 weeks)    20.0   

                media  minimo  num_price  
date                                      
2017-10-01  14.666667    0.68        3.0  
2018-01-01  14.666667    0.68        3.0  
2018-02-01  14.666667    0.68        3.0   


key: Pure Chemistry Scientific Inc.
            metabolite_id       metab_name                   provider_name  \
date                                                                         
2017-10-01           1175  4-aminopyridine  Pure Chemistry Scientific Inc.   
2018-01-01           1175  4-aminopyridine  Pure Ch

In [16]:
# # Create time series

# df_median = df_median.sort_index()


# layout = go.Layout(
#     title = 'Time Series Plot of 4-aminopyridine (Price/g)',
#     yaxis = dict(
#         title = 'price(USD)'
#     ),
#     xaxis = dict(
#         title = 'date'
#     )
# )

# prov_list = []
# for key, item in grouped_df_median:
#     prov_list.append(key)
    

# d_x = {}
# d_y = {}
# for k,g in df_median.groupby(['provider_name'])['media']:
#     x = pd.to_datetime(g.index)
#     y = g.values
#     if k not in d_x and k not in d_y:
#         d_x[k] = x
#         d_y[k] = y
#     elif k in d_x and k in d_y:
#         d_x[k].append(x)
#         d_y[k].append(y)
#     else: print('There is a provider lacking on the x or y dict.')
    


# def update_plot(signals):#, freq):
#    data = []
#    for s in signals:
#        trace1 = go.Scatter(
#            x = d_x[s],
#            y = d_y[s],
#            mode = 'lines+markers',
#            name = 'provider {}'.format(s),
#            line = dict(
#                shape = 'spline' #smooth line
#            )
#        )
#        data.append(trace1)
    
#    fig = go.Figure(data = data, layout= layout)
#    py.offline.iplot(fig)

    
# signals = widgets.SelectMultiple(options = prov_list, description = 'Provider')
# #freq = widgets.FloatSlider(min=1, max=20, value=1, description = 'Freq')
# widgets.interactive(update_plot, signals = signals)#, freq = freq)

interactive(children=(SelectMultiple(description='Provider', options=('AA BLOCKS', 'ACC Corporation', 'AD Chem…

---------

#### Tratar dos outliers, retirando-os completamente

In [11]:
# Split the frame by dtype: the numeric columns are the ones screened for
# outliers, the remaining columns are carried along unchanged.
numeric_kinds = ["number"]
num_df = df.select_dtypes(include=numeric_kinds)
cat_df = df.select_dtypes(exclude=numeric_kinds)

In [12]:
# Drop every row that has an extreme value in at least one numeric column.
# NOTE: despite their names, Q1 and Q3 are the 2% and 98% quantiles here, so
# "IQR" is the 2%-98% spread and the fences are much wider than a classic
# 1.5 * IQR rule.
Q1, Q3 = num_df.quantile(0.02), num_df.quantile(0.98)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = (num_df < lower_fence) | (num_df > upper_fence)

# idx is True for the rows to keep (no numeric column outside the fences).
idx = ~is_outlier.any(axis=1)

kept_numeric = num_df.loc[idx]
kept_other = cat_df.loc[idx]
df_cleaned = pd.concat([kept_numeric, kept_other], axis=1)

In [13]:
# Show the table after the outlier rows have been removed.
df_cleaned

Unnamed: 0_level_0,metabolite_id,maximo,media,minimo,num_price,metab_name,provider_name
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-10-01,1175,0.055000,0.009826,0.002350,14,4-aminopyridine,ENAMINE Ltd.
2017-10-01,1175,0.038000,0.011451,0.002570,8,4-aminopyridine,"Vitas-M Laboratory, Ltd."
2017-10-01,1175,0.066880,0.014199,0.002414,14,4-aminopyridine,"Maybridge, Ltd."
2017-10-01,1175,400.000000,40.007650,0.000560,10,4-aminopyridine,HTS Biochemie Innovationen
2017-10-01,1175,0.001000,0.001000,0.001000,1,4-aminopyridine,Tocris Bioscience
2017-10-01,1175,0.024900,0.008923,0.002920,9,4-aminopyridine,"Otava, Ltd."
2017-10-01,1175,0.038850,0.008941,0.000800,14,4-aminopyridine,BIONET/Key Organics Ltd.
2017-10-01,1175,0.015000,0.003689,0.001000,9,4-aminopyridine,"Alinda Chemical, Ltd."
2017-10-01,1175,0.002000,0.001753,0.001560,3,4-aminopyridine,TargetMol
2017-10-01,1175,4.000000,2.218750,1.225000,4,4-aminopyridine,Cayman Europe


In [14]:
# Group the cleaned rows by provider.
grouped_df = df_cleaned.groupby('provider_name')

# Iterating a groupby already yields each group's sub-frame, so print it
# directly instead of looking it up a second time with get_group(key).
for key, item in grouped_df:
    print('key:', key)
    print(item, "\n\n")

key: AA BLOCKS
            metabolite_id  maximo     media  minimo  num_price  \
date                                                             
2019-06-01           1175    17.0  8.893333    3.08          3   
2019-07-01           1175    17.0  8.893333    3.08          3   

                 metab_name provider_name  
date                                       
2019-06-01  4-aminopyridine     AA BLOCKS  
2019-07-01  4-aminopyridine     AA BLOCKS   


key: ACC Corporation
            metabolite_id    maximo     media   minimo  num_price  \
date                                                                
2017-10-01           1175   22.4158  22.41580  22.4158          1   
2018-01-01           1175  170.8700  94.04785  22.4158          4   
2018-02-01           1175  170.8700  94.04785  22.4158          4   
2018-03-01           1175  170.8700  94.04785  22.4158          4   
2018-04-01           1175  170.8700  94.04785  22.4158          4   
2018-05-01           1175  170.8700  

2019-07-01  4-aminopyridine  BLD Pharmatech Ltd.   


key: Bide Pharmatech Ltd.
            metabolite_id  maximo  media  minimo  num_price       metab_name  \
date                                                                           
2018-01-01           1175     3.0   2.25     1.5          4  4-aminopyridine   
2018-02-01           1175     3.0   2.25     1.5          4  4-aminopyridine   
2018-03-01           1175     3.0   2.25     1.5          4  4-aminopyridine   
2018-04-01           1175     3.0   2.25     1.5          4  4-aminopyridine   
2018-05-01           1175     3.0   1.98     0.9          5  4-aminopyridine   
2018-06-01           1175     3.0   1.98     0.9          5  4-aminopyridine   
2018-07-01           1175     3.0   1.98     0.9          5  4-aminopyridine   

                   provider_name  
date                              
2018-01-01  Bide Pharmatech Ltd.  
2018-02-01  Bide Pharmatech Ltd.  
2018-03-01  Bide Pharmatech Ltd.  
2018-04-01  Bide Pharmat

2018-05-01  Oxchem Corporation   


key: Pharmeks, Ltd.
            metabolite_id  maximo      media   minimo  num_price  \
date                                                               
2017-10-01           1175    25.0  12.000007  0.00003          4   
2018-01-01           1175    25.0  12.000007  0.00003          4   
2018-02-01           1175    25.0  12.000007  0.00003          4   
2018-03-01           1175    25.0  12.000007  0.00003          4   
2018-04-01           1175    25.0  12.000007  0.00003          4   
2018-05-01           1175    25.0  12.000007  0.00003          4   
2018-06-01           1175    25.0  12.000007  0.00003          4   
2018-07-01           1175    25.0  12.000007  0.00003          4   
2018-08-01           1175    25.0  12.000007  0.00003          4   
2018-09-01           1175    25.0  12.000007  0.00003          4   
2018-10-01           1175    25.0  12.000007  0.00003          4   
2018-11-01           1175    25.0  12.000007  0.00003       

In [15]:
# Create time series

# Sort by date so each provider's points are in chronological order.
df_cleaned = df_cleaned.sort_index()

layout = go.Layout(
    title = 'Time Series Plot of 4-aminopyridine (Price/g)',
    yaxis = dict(
        title = 'price(USD)'
    ),
    xaxis = dict(
        title = 'date'
    )
)

d_x = {}   # provider -> dates of that provider's rows
d_y = {}   # provider -> 'media' (mean price) values of that provider's rows

# Group by the column *name*, not a one-element list: with a list, pandas >= 2.0
# yields 1-tuple keys such as ('AA BLOCKS',), which would not match the plain
# provider names offered by the widget and make d_x[s] raise KeyError.
# groupby yields every key exactly once, so each provider is assigned directly.
for k, g in df_cleaned.groupby('provider_name')['media']:
    d_x[k] = pd.to_datetime(g.index)
    d_y[k] = g.values

# Options for the selection widget: the providers, in the same (sorted) order
# as the groupby keys. Derived here so this cell does not depend on the
# `grouped_df` variable left over from another cell.
prov_list = list(d_x)
    

     

    

def update_plot(signals):
    """Draw one smoothed line per selected provider.

    signals: iterable of keys of the module-level d_x / d_y dictionaries
             (the providers chosen in the selection widget).

    Builds a go.Scatter trace (lines + markers, spline-shaped) for every key
    and renders the resulting figure with the module-level `layout`.
    """
    data = [
        go.Scatter(
            x = d_x[s],
            y = d_y[s],
            mode = 'lines+markers',
            name = 'provider {}'.format(s),
            line = dict(shape = 'spline'),  # smooth line
        )
        for s in signals
    ]

    py.offline.iplot(go.Figure(data = data, layout = layout))

    
# Multi-select widget listing the providers; widgets.interactive passes the
# current selection to update_plot as its `signals` argument.
signals = widgets.SelectMultiple(options = prov_list, description = 'Provider')
#freq = widgets.FloatSlider(min=1, max=20, value=1, description = 'Freq')
widgets.interactive(update_plot, signals = signals)#, freq = freq)

interactive(children=(SelectMultiple(description='Provider', options=('AA BLOCKS', 'ACC Corporation', 'AD Chem…

In [16]:
# Create Boxplot

# Sort by date so the groups below (and the widget options) are in chronological order.
df_cleaned = df_cleaned.sort_index()

layout = go.Layout(
    title = 'Time Series Plot of 4-aminopyridine (Price/g)',
    yaxis = dict(
        title = 'price (USD)'
    ),
    xaxis = dict(
        title = 'date'
    )
)

opt_list = []   # dates offered by the selection widget
d_x = {}        # date -> index values (dates) of the rows in that group
d_y = {}        # date -> 'media' (mean price) values of the rows in that group

# groupby yields every key exactly once, so each date is assigned directly.
# The previous "key already present" branch could never run, and would have
# failed if it had: numpy arrays have no .append and Index.append is not in-place.
for k, g in df_cleaned.groupby(df_cleaned.index)['media']:
    opt_list.append(k)
    d_x[k] = pd.to_datetime(g.index)
    d_y[k] = g.values
    
    
def update_plot(signals):
    """Draw one box plot per selected date.

    signals: iterable of keys of the module-level d_x / d_y dictionaries
             (the dates chosen in the selection widget).

    Builds a go.Box trace for every key, showing only the outlier points,
    and renders the resulting figure with the module-level `layout`.
    """
    box_colour = 'rgb(8,81,156)'
    outlier_colour = 'rgba(219, 64, 82, 0.6)'

    data = [
        go.Box(
            x = d_x[s],
            y = d_y[s],
            name = 'time {}'.format(s),
            boxpoints = 'outliers',
            marker = dict(
                color = box_colour,
                outliercolor = outlier_colour,
                line = dict(
                    outliercolor = outlier_colour,
                    outlierwidth = 2,
                ),
            ),
            line_color = box_colour,
        )
        for s in signals
    ]

    py.offline.iplot(go.Figure(data = data, layout = layout))

    
# Multi-select widget listing the available dates; widgets.interactive passes
# the current selection to update_plot as its `signals` argument.
signals = widgets.SelectMultiple(options = opt_list, description = 'Time')
#freq = widgets.FloatSlider(min=1, max=20, value=1, description = 'Freq')
widgets.interactive(update_plot, signals = signals)#, freq = freq)

interactive(children=(SelectMultiple(description='Time', options=(Timestamp('2017-10-01 00:00:00'), Timestamp(…

Há providers que têm uma média de preços muito baixa (< 0.5).
Verificar quantos providers são.

In [27]:
# Providers with at least one monthly mean price ('media') below 0.5.
low_mean_rows = df.media < 0.5
df.provider_name[low_mean_rows].unique()

array(['ENAMINE Ltd.', 'Vitas-M Laboratory, Ltd.', 'Maybridge, Ltd.',
       'Tocris Bioscience', 'Otava, Ltd.', 'BIONET/Key Organics Ltd.',
       'Alinda Chemical, Ltd.', 'TargetMol', 'BIOTREND Chemicals, AG',
       'Manchester Organics', 'Apollo Scientific', 'Astatech Inc',
       'Manchester Organics Limited', 'Life Chemicals Inc.',
       'BIONET - Key Organics Ltd.', 'Selleck Chemicals LLC',
       'Alinda Chemical Trading Company', 'HTS Biochemie Innovationen'],
      dtype=object)

In [26]:
# Number of distinct providers with at least one monthly mean price below 0.5.
low_mean_providers = df.provider_name[df.media < 0.5].unique()
len(low_mean_providers)

18

Ainda são 18 os providers que têm uma média de preços muito baixa. Devemos eliminá-los?