* Name: Osvaldo Luiz dos Santos Pereira. 
* email: osvald23@gmail.com / osvaldo.pereira@sciencedata.ai / olsp@if.ufrj.br
* LinkedIn: https://www.linkedin.com/in/osvaldo-pereira
* Currículo Lattes: http://lattes.cnpq.br/6730251976463283
* Personal web page: https://www.sciencedata.ai/osvaldolspereira
* [Curriculum Vitae in PDF Format](https://www.sciencedata.ai/osvaldolspereira/wp-content/uploads/2020/09/CV_English_osvaldo_pereira.pdf)

# Introdução

Nesta aula iremos aprender sobre variáveis globais, e como fazer o "assign" de colunas e variáveis dinâmicamente usando a função global do Python.


# Requerimentos

In [97]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import random as rnd

pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 500)

# Um pouco sobre listas

## Printando listas com for loop

In [2]:
# criando uma lista
lista = [1, 3, 5, 7, 9]

#
length = len(lista)

#
for i in range(length):
    print(lista[i])

1
3
5
7
9


## Printando listas com while loop

In [3]:
i = 0
while i < length:
    print(lista[i])
    i += 1

1
3
5
7
9


## Printando listas com list comprehension

In [4]:
[print(i) for i in lista]

1
3
5
7
9


[None, None, None, None, None]

## Criando tupla iterável

In [5]:
for i, value in enumerate(lista):
    print (i, ',' ,value)

0 , 1
1 , 3
2 , 5
3 , 7
4 , 9


# Dicionários e variáveis dinâmicas

## Criando lista com nomes e sufixos indexados

In [6]:
# inicializando uma lista
v_name = []

# inicializando uma lista
v_value = []

#
k = 4

#
for i in range(k):
    # cria lista com nomes das colunas
    v_name.append(str('col_' + str(i+1)))
    # cria lista de valores com base em uma função
    v_value.append(i^2 + 2*i + 1)

v_value = [int(i) for i in v_value]

array = np.column_stack((v_name,v_value))

array

array([['col_1', '3'],
       ['col_2', '4'],
       ['col_3', '5'],
       ['col_4', '10']], dtype='<U11')

## Usando dicionários

In [7]:
# inicializando dicionário
dicio = {}

for i in range(4):
    chave = str('x'+str(i))
    dicio[chave] = i**2
    
for key,value in dicio.items():
    exec(f'{key} = {value}')
    
dicio

{'x0': 0, 'x1': 1, 'x2': 4, 'x3': 9}

In [10]:
class vec():
    pass


k = 4
v = vec()


#
chaves = 'col1 col2 col3 col4'.split()

#
valores = 'val1 val2 val3 val4'.split()

for key,values in zip(chaves,valores):
    setattr(v,key,values)
    
v.__dict__

{'col1': 'val1', 'col2': 'val2', 'col3': 'val3', 'col4': 'val4'}

## setattr e getattr

In [61]:
class classe:
    atributo1 = 'val1'
    atributo2 = 2
    atributo3 = 0.5

#
x = getattr(classe,'atributo2')

#
setattr(classe, 'atributo2', 40)

#
y = getattr(classe, 'atributo2')

#
print('x = ', x, '\ny =', y)

x =  2 
y = 40


# Programação funcional e map

In [14]:
# cria uma lista e valores baseado numa lógica
iteravel = [i for i in range(5)]

# cria uma função baseado numa lógica 
def funcional(x):
    return x**2

# aplica a função map na função e na lista iterável
resultado1 = map(funcional, iteravel)

# gera a lista de valores aplicadas na função
list(resultado1)

[0, 1, 4, 9, 16]

In [16]:
resultado2 = [i**2 for i in range(5)]

resultado2

[0, 1, 4, 9, 16]

In [18]:
resultado3 = [funcional(x) for x in iteravel]

resultado3

[0, 1, 4, 9, 16]

# Compreensão de listas e dicionários

## Compreensão de listas (exemplo)

In [19]:
lista = ['col'+str(i) for i in range(4)]

lista

['col0', 'col1', 'col2', 'col3']

## Compreensão de dicionários (exemplo)

In [26]:
quadrados = {'x'+str(i): i * i for i in range(5)}

quadrados

{'x0': 0, 'x1': 1, 'x2': 4, 'x3': 9, 'x4': 16}

## Compreensão de arrays (exemplo)

In [35]:
matriz = [[i for i in range(5)] for j in range(6)]

matriz

[[0, 1, 2, 3, 4],
 [0, 1, 2, 3, 4],
 [0, 1, 2, 3, 4],
 [0, 1, 2, 3, 4],
 [0, 1, 2, 3, 4],
 [0, 1, 2, 3, 4]]

In [41]:
matriz_unit = [[0 if i!=j else 1 for i in range(5)] for j in range(5)]

matriz_unit

[[1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0],
 [0, 0, 1, 0, 0],
 [0, 0, 0, 1, 0],
 [0, 0, 0, 0, 1]]

In [46]:
vandermonde = [[i**j for i in range(5)] for j in (1,2,3,4,5)]

vandermonde

[[0, 1, 2, 3, 4],
 [0, 1, 4, 9, 16],
 [0, 1, 8, 27, 64],
 [0, 1, 16, 81, 256],
 [0, 1, 32, 243, 1024]]

# Criando pandas dataframe

In [74]:
import string

abc = string.ascii_lowercase

print(abc)
    
col1 = [i for i in abc]

col1

abcdefghijklmnopqrstuvwxyz


['a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [98]:
#
n = 10

#
colunas = [i for i in 'abcde']

qtd_cols = len(colunas)

#
valores = [[rnd.random() for i in range(qtd_cols)] for j in range(n)]

#
df = pd.DataFrame(data = valores, columns = colunas)

#
df

Unnamed: 0,a,b,c,d,e
0,0.916151,0.025753,0.070676,0.072989,0.578618
1,0.924575,0.720495,0.126412,0.431161,0.960857
2,0.951136,0.716051,0.820812,0.319373,0.960693
3,0.156595,0.792206,0.098032,0.572935,0.081642
4,0.739901,0.167968,0.225014,0.671978,0.65822
5,0.685227,0.603106,0.328643,0.809331,0.602765
6,0.516057,0.500931,0.898296,0.587881,0.181034
7,0.929151,0.887283,0.224433,0.40551,0.901865
8,0.959845,0.018404,0.552788,0.352084,0.354032
9,0.92985,0.799185,0.366537,0.823348,0.162138


In [99]:
#
valores = [[rnd.randrange(10) for i in range(qtd_cols)] for j in range(n)]

#
df = pd.DataFrame(data = valores, columns = colunas)

#
df

Unnamed: 0,a,b,c,d,e
0,9,5,2,7,7
1,5,6,9,8,7
2,3,4,1,3,1
3,6,1,4,5,4
4,5,1,2,3,3
5,5,7,3,4,1
6,2,0,9,2,0
7,0,9,6,2,5
8,5,2,1,0,7
9,9,0,7,1,1


In [100]:
#
valores = [[rnd.randrange(0,100,2) for i in range(qtd_cols)] for j in range(n)]

#
df = pd.DataFrame(data = valores, columns = colunas)

#
df

Unnamed: 0,a,b,c,d,e
0,70,28,40,60,74
1,88,2,96,74,18
2,82,84,46,42,16
3,20,90,18,64,70
4,14,36,0,98,10
5,74,64,90,62,62
6,98,14,20,52,52
7,94,26,38,14,78
8,94,4,44,66,34
9,78,90,74,28,54


In [103]:
#
valores = [[rnd.randrange(1,100,2) for i in range(qtd_cols)] for j in range(n)]

#
df = pd.DataFrame(data = valores, columns = colunas)

#
df

Unnamed: 0,a,b,c,d,e
0,59,65,91,85,69
1,53,27,65,43,89
2,53,7,57,31,33
3,93,43,17,23,51
4,75,21,85,71,49
5,19,1,95,93,53
6,85,79,79,37,79
7,53,83,81,55,87
8,35,83,27,99,93
9,23,3,89,3,17


In [105]:
lista = ['M','F','?']

#
valores = [[rnd.choice(lista) for i in range(qtd_cols)] for j in range(n)]

#
df = pd.DataFrame(data = valores, columns = colunas)

#
df

Unnamed: 0,a,b,c,d,e
0,?,M,?,F,M
1,M,?,?,?,F
2,M,F,?,M,M
3,?,F,M,M,?
4,M,F,F,M,F
5,?,?,?,?,M
6,?,M,F,M,F
7,M,F,M,M,?
8,F,M,M,M,F
9,F,M,?,?,M


In [106]:
#
valores = [[rnd.uniform(2.5, 10.0) for i in range(qtd_cols)] for j in range(n)]

#
df = pd.DataFrame(data = valores, columns = colunas)

#
df

Unnamed: 0,a,b,c,d,e
0,2.660181,9.436722,9.061429,7.296805,3.955107
1,6.370762,8.654086,4.707545,4.984431,5.453521
2,8.87663,5.205876,4.936471,3.761063,3.447359
3,9.059885,3.767109,3.395472,9.451692,9.761096
4,3.941083,5.50537,4.984412,5.800436,6.570672
5,7.514137,7.48761,3.911789,7.573013,5.675249
6,3.939897,9.797134,9.076284,6.628986,7.405282
7,8.021292,3.177935,6.520465,5.781174,8.433707
8,4.005727,4.942205,4.854677,8.696187,8.305125
9,8.754704,6.790636,2.941301,6.907376,6.053579


In [193]:
#
n = 100

#
colunas = [i for i in 'abc']

qtd_cols = len(colunas)

lista_nome = []

#
for i in range(qtd_cols):
    globals()[f'val{i}'] = np.array([rnd.random() for j in range(n)])
    lista_nome.append(f'val{i}')

#
df = pd.DataFrame(data = list(zip(val1,val2,val3)), columns = colunas)

#
df

Unnamed: 0,a,b,c
0,0.608052,0.803802,0.811528
1,0.335656,0.619446,0.879622
2,0.021672,0.294737,0.905177
3,0.553473,0.052339,0.937720
4,0.613235,0.263342,0.046770
...,...,...,...
95,0.298478,0.331581,0.214239
96,0.565765,0.125825,0.636423
97,0.325405,0.289795,0.332985
98,0.482400,0.728728,0.438815


# Desafio

## Resposta esperada

In [114]:
def cria_df(k,n):
    
    # inicializa um dataframe
    df = pd.DataFrame()
       
    # inicializa uma lista com os nomes das colunas do dataframe
    x = []
    
    # inicializa uma lista com os valores guardados das médias das colunas do dataframe
    mu_x = []
    
    # inicializa uma lista com os valores de desvios padrões das colunas do dataframe
    sd_x = []
    
    # inicializa uma lista com os nomes das variáveis que guardam os valores das médias
    nome_mu = []
    
    # inicializa uma lista com os nomes das variáveis que guardam os valores dos desvios padrões
    nome_sd = []
    
    # script que cria todas as listas inicializadas
    for i in range(3):
        
        # cria a lista de colunas do dataframe
        x.append(str('x_' + str(i+1)))
        
        # pede o input dos valores das médias para a distribuição de cada coluna do dataframe
        globals()[f'mu_x{i+1}'] = int(input(f'valor da media {i+1}: '))
        
        # pede o input dos valores dos desvios para a distribuição de cada coluna do dataframe
        globals()[f'sd_x{i+1}'] = int(input(f'valor do desvio {i+1}: '))
        
        # cria a lista com valores das médias para cada coluna do dataframe
        nome_mu.append(globals()[f'mu_x{i+1}']) 
        
        # cria a lista com valores dos desvios para cada coluna do dataframe
        nome_sd.append(globals()[f'sd_x{i+1}'])
        
        # cria a lista com variáveis dos valores médios
        mu_x.append(globals()[f'mu_x{i+1}'])
        
        # cria a lista com variáveis dos valores de desvios
        sd_x.append(globals()[f'sd_x{i+1}'])
        
        # gera as colunas do dataframe com distribuições normais dadas pelas médias e desvios inputadas
        globals()[f'v{i}'] = np.round(np.random.normal(mu_x[i], sd_x[i], n), decimals = 0).astype(int)
        
        # cria o dataframe
        df[str('x_' + str(i+1))] = globals()[f'v{i}']
    
    # return das principais variáveis/objetos do script
    return df, x, mu_x, sd_x, nome_mu, nome_sd

## Execução da função e criação dos objetos

In [116]:
df, x, mu_x, sd_x, nome_mu, nome_sd = cria_df(k = 3, n = 100)

valor da media 1: 100
valor do desvio 1: 10
valor da media 2: 200
valor do desvio 2: 20
valor da media 3: 300
valor do desvio 3: 30


## Cara do dataframe

In [119]:
df.head(3)

Unnamed: 0,x_1,x_2,x_3
0,111,186,356
1,110,176,330
2,116,202,323


## Lista com valores guardados dos nomes das colunas

In [118]:
x

['x_1', 'x_2', 'x_3']

## Lista que guarda os valores das médias de cada coluna do dataframe

In [120]:
mu_x

[100, 200, 300]

## Lista que guarda os valores dos desvios de cada coluna do dataframe

In [121]:
sd_x

[10, 20, 30]

## Lista que guarda as variáveis de média de cada coluna do dataframe

In [122]:
nome_mu

[100, 200, 300]

## Lista que guarda as variáveis de desvio de cada coluna do dataframe

In [123]:
nome_sd

[10, 20, 30]

# Resposta Vencedora #1

* **Autor**: Eduardo Salis
* **Github**: https://github.com/rafaelktakahashi
* **Referência**: [Stackoverflow - Create an array with a pre determined mean and standard deviation](https://stackoverflow.com/questions/50177594/create-an-array-with-a-pre-determined-mean-and-standard-deviation)

In [202]:
import numpy as np 
import pandas as pd

def colunas_distrib(k,n_linhas):
    df = pd.DataFrame()

    for i in range(k+1):
        i_str = str(i)
        nome_coluna = f'coluna_{i_str}'
        df[nome_coluna] = np.random.normal(loc=i, scale=i, size=n_linhas)

    return df

df = colunas_distrib(3,10)

df

Unnamed: 0,coluna_0,coluna_1,coluna_2,coluna_3
0,0.0,0.850501,0.55876,-0.580568
1,0.0,1.71705,0.351573,4.899832
2,0.0,1.53514,4.052563,2.00866
3,0.0,-1.095105,1.63169,2.145503
4,0.0,0.875091,1.477907,2.842483
5,0.0,1.84589,-1.53174,3.953141
6,0.0,2.268889,4.683234,0.651736
7,0.0,-0.395129,1.561913,2.408287
8,0.0,1.09837,3.234296,5.142125
9,0.0,1.359703,4.731387,7.044202


# Resposta vencedora #2

* **Autor**: Rafael Takahashi
* **Github**: https://github.com/rafaelktakahashi

In [199]:
import pandas as pd
import numpy as np


def create_series(n, k, *stats):
    "pass number of rows n, columns k and stats in order as a list: [mean, std]"
    print (n, 'linhas e', k, 'colunas')
    
    #c_names, series, stts = [], [], []
    df = pd.DataFrame()
        
    for i, j in zip(range(1, k+1), stats):
        df["coluna_"+str(i)] = np.random.normal(loc=j[0], scale=j[1], size=n)
    
    global mean_std
    mean_std = []
    for column in df:
        c_mean = np.mean(df[column])
        c_std = np.std(df[column])
        mean_std.append([column, c_mean, c_std])
        print ('mean and std of', column, '-', c_mean, c_std)
    
    return df, mean_std

# exemplo:
df, mean_std = create_series(100, 3, [0, 1], [1, 2], [2, 3])

#
df

100 linhas e 3 colunas
mean and std of coluna_1 - 0.14192381767084916 1.0596799260236922
mean and std of coluna_2 - 0.7718687456532989 1.920909052769867
mean and std of coluna_3 - 1.8023610977454658 3.2674399833626686


Unnamed: 0,coluna_1,coluna_2,coluna_3
0,0.264197,0.078869,4.011957
1,1.576306,0.225306,0.300522
2,1.508513,5.413983,4.275097
3,0.273428,3.860047,3.795771
4,0.181396,-0.337082,-2.522320
...,...,...,...
95,-0.608583,1.315083,5.648939
96,0.069744,0.295827,4.277353
97,-0.382502,-0.673026,5.583894
98,0.532248,3.966587,4.412593


# Referências

* [Create a Pandas DataFrame from Lists](https://www.geeksforgeeks.org/create-a-pandas-dataframe-from-lists/)
* [How to Dynamically Declare Variables Inside a Loop in Python](https://python.plainenglish.io/how-to-dynamically-declare-variables-inside-a-loop-in-python-21e6880aaf8a)
* [When to Use a List Comprehension in Python](https://realpython.com/list-comprehension-python/)
* [how to print the for loop index with the input statement?](https://stackoverflow.com/questions/61960949/how-to-print-the-for-loop-index-with-the-input-statement)
* [Python Dynamic Variable Name](https://www.delftstack.com/howto/python/python-dynamic-variable-name/)
* [Get Column Names as List in Pandas DataFrame](https://datascienceparichay.com/article/get-column-names-as-list-in-pandas-dataframe/)
* [List comprehension for first 10 alphabets](https://stackoverflow.com/questions/34101161/list-comprehension-for-first-10-alphabets)
* [Python library random](https://docs.python.org/3/library/random.html)