# Exercício 1
## Demanda do Especialista do Ministério da Agricultura
- Avaliação do Impacto de Fatores Climáticos na Produção Agrícola


---

Importando bibliotecas necessárias

In [127]:
import pandas as pd
from hashlib import sha1

### Lendo dados sobre área plantada

In [167]:
# Load the wide-format planted-area table; the first CSV column is used as the
# row index.
df_planted_area = pd.read_csv(
    'data/raw/grains/planted_area.csv',
    index_col=0,
    sep=',',
)

Transformando tabela de tipo largo para longo

In [168]:
# Reshape wide -> long: 'Grain' and 'Year' stay as identifiers, and every other
# column is unpivoted into a ('city', 'planted_area') pair, one per row.
df_planted_area = df_planted_area.melt(
    id_vars=['Grain', 'Year'],
    var_name='city',
    value_name='planted_area',
)

Visualizando dados

In [169]:
# Preview the last five rows of the long-format planted-area table.
df_planted_area.tail(n=5)

Unnamed: 0,Grain,Year,city,planted_area
2558975,Pea,2017,Brasília (DF),32.0
2558976,Barley,2018,Brasília (DF),100.0
2558977,Pea,2018,Brasília (DF),40.0
2558978,Barley,2019,Brasília (DF),100.0
2558979,Pea,2019,Brasília (DF),40.0


### Lendo dados sobre área colhida

In [170]:
# Load the wide-format harvested-area table; the first CSV column is used as
# the row index.
df_harvested_area = pd.read_csv(
    'data/raw/grains/harvested_area.csv',
    index_col=0,
    sep=',',
)

Transformando tabela de tipo largo para longo

In [171]:
# Reshape wide -> long: 'Grain' and 'Year' stay as identifiers, and every other
# column is unpivoted into a ('city', 'harvested_area') pair, one per row.
df_harvested_area = df_harvested_area.melt(
    id_vars=['Grain', 'Year'],
    var_name='city',
    value_name='harvested_area',
)

Visualizando dados

In [172]:
# Preview the last five rows of the long-format harvested-area table.
df_harvested_area.tail(n=5)

Unnamed: 0,Grain,Year,city,harvested_area
2558975,Pea,2017,Brasília (DF),32.0
2558976,Barley,2018,Brasília (DF),100.0
2558977,Pea,2018,Brasília (DF),40.0
2558978,Barley,2019,Brasília (DF),70.0
2558979,Pea,2019,Brasília (DF),40.0


### Lendo dados sobre produtividade

In [173]:
# Load the wide-format yield table; the first CSV column is used as the row
# index.
df_yield = pd.read_csv(
    'data/raw/grains/yield.csv',
    index_col=0,
    sep=',',
)

Transformando tabela de tipo largo para longo

In [174]:
# Reshape wide -> long: 'Grain' and 'Year' stay as identifiers, and every other
# column is unpivoted into a ('city', 'yield') pair, one per row.
df_yield = df_yield.melt(
    id_vars=['Grain', 'Year'],
    var_name='city',
    value_name='yield',
)

Visualizando dados

In [175]:
# Preview the last five rows of the long-format yield table.
df_yield.tail(n=5)

Unnamed: 0,Grain,Year,city,yield
2558975,Pea,2017,Brasília (DF),7.0
2558976,Barley,2018,Brasília (DF),4.5
2558977,Pea,2018,Brasília (DF),7.0
2558978,Barley,2019,Brasília (DF),4.5
2558979,Pea,2019,Brasília (DF),7.0


### Lendo dados sobre produção

In [198]:
# Load the wide-format production table; the first CSV column is used as the
# row index.
df_production = pd.read_csv(
    'data/raw/grains/production.csv',
    index_col=0,
    sep=',',
)

Transformando tabela de tipo largo para longo

In [199]:
# Reshape wide -> long: 'Grain' and 'Year' stay as identifiers, and every other
# column is unpivoted into a ('city', 'production') pair, one per row.
df_production = df_production.melt(
    id_vars=['Grain', 'Year'],
    var_name='city',
    value_name='production',
)

Visualizando dados

In [210]:
# Preview the last five rows of the long-format production table.
df_production.tail(n=5)

Unnamed: 0,Grain,Year,city,production
2558975,Pea,2017,Brasília (DF),7.0
2558976,Barley,2018,Brasília (DF),4.5
2558977,Pea,2018,Brasília (DF),7.0
2558978,Barley,2019,Brasília (DF),4.5
2558979,Pea,2019,Brasília (DF),7.0


### Realizando cruzamento dos dados

Comparando se dados sobre produção e produtividade são diferentes

In [179]:
# True only when the melted 'production' column and the melted 'yield' column
# hold the same elements in the same positions.
df_production.loc[:, 'production'].equals(df_yield.loc[:, 'yield'])

True

Neste dataset não são diferentes: as duas colunas são idênticas, logo podemos manter apenas uma delas

Realizando mescla entre dados de plantio, colheita e produção

In [190]:
# Combine the three long tables on the (Grain, Year, city) key:
#   1. inner join keeps only the keys present in both the harvested-area and
#      planted-area tables;
#   2. left join then attaches production without dropping any of those rows.
df_plantation = (
    df_harvested_area
    .merge(df_planted_area, how='inner', on=['Grain', 'Year', 'city'])
    .merge(df_production, how='left', on=['Grain', 'Year', 'city'])
)

Extraindo unidade federativa do nome da cidade

In [191]:
# Capture the two-character code written in parentheses in the city label,
# e.g. "Brasília (DF)" -> "DF". Labels without such a suffix yield a missing
# value. expand=False returns the single capture group as a Series.
df_plantation['state'] = df_plantation['city'].str.extract(
    pat=r'\((\w{2})\)',
    expand=False,
)

Removendo informação de unidade federativa da coluna de cidade

In [192]:
# Drop the "(UF)" suffix from the city label, then trim the whitespace that is
# left around the name, e.g. "Brasília (DF)" -> "Brasília".
df_plantation['city'] = (
    df_plantation['city']
    .str.replace(r'\(\w{2}\)', '', regex=True)
    .str.strip()
)

Criando coluna com as regiões do Brasil

In [202]:
# Region name -> state codes that belong to it.
brazil_regions = {
    'North': ['RO', 'AC', 'AM', 'RR', 'PA', 'AP', 'TO'],
    'Northeast': ['MA', 'PI', 'CE', 'RN', 'PB', 'PE', 'AL', 'SE', 'BA'],
    'Midwest': ['MT', 'MS', 'GO', 'DF'],
    'Southeast': ['MG', 'ES', 'RJ', 'SP'],
    'South': ['PR', 'SC', 'RS'],
}

# Invert the mapping into a flat state code -> region lookup.
state_to_region = dict(
    (state, region)
    for region, states in brazil_regions.items()
    for state in states
)

# States that are not in the lookup are mapped to a missing value.
df_plantation['region'] = df_plantation['state'].map(state_to_region)

Criando chave artificial

In [203]:
def _sha1_hexdigest(text):
    """Return the hexadecimal SHA-1 digest of ``text`` encoded as UTF-8."""
    return sha1(text.encode('utf-8')).hexdigest()


# The key material is state, city, grain and year concatenated in that order
# with no separator; the resulting string is hashed row by row.
df_plantation['artificial_key'] = (
    df_plantation['state']
    + df_plantation['city']
    + df_plantation['Grain']
    + df_plantation['Year'].astype(str)
).map(_sha1_hexdigest)

Alterando nome de todas as colunas para _lower case_

In [204]:
# Normalise every column label to lower case (e.g. 'Grain' -> 'grain').
df_plantation.columns = df_plantation.columns.str.lower()

Alterando tipos das colunas

In [206]:
# Cast every column to its final dtype.
#
# Text columns use pandas' nullable string dtype instead of the builtin `str`:
# `astype(str)` turns a missing value into the literal text 'nan', which would
# hide rows whose state could not be extracted or whose state has no region.
# With StringDtype those entries stay missing (<NA>) and remain visible to
# isna(), consistent with the nullable numeric dtypes used below.
df_plantation = df_plantation.astype({
    'artificial_key': pd.StringDtype(),
    'region': pd.StringDtype(),
    'state': pd.StringDtype(),
    'city': pd.StringDtype(),
    'grain': pd.StringDtype(),
    'year': pd.UInt16Dtype(),
    'planted_area': pd.UInt64Dtype(),
    'harvested_area': pd.UInt64Dtype(),
    'production': pd.Float64Dtype(),
})

Alterando ordem das colunas

In [211]:
# Keep only the listed columns, in this order: key first, then the location
# and grain descriptors, then the measurements.
df_plantation = df_plantation[[
    'artificial_key',
    'region',
    'state',
    'city',
    'grain',
    'year',
    'planted_area',
    'harvested_area',
    'production',
]]

Configurando novo índice para tabela

In [213]:
# Promote the hashed key from a regular column to the row index.
# Running this cell a second time on the already-indexed frame raises a
# KeyError, because 'artificial_key' is then no longer a column.
df_plantation = df_plantation.set_index('artificial_key')

Visualizando dados

In [221]:
# Preview the last five rows of the final table.
df_plantation.tail(n=5)

Unnamed: 0_level_0,region,state,city,grain,year,planted_area,harvested_area,production
artificial_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
f45ad7729c422c093bfbf0fb1f99f8c6ce6dbbf6,Midwest,DF,Brasília,Pea,2017,32,32,7.0
9a0f47a14c2e92a914b7b6e0d264ed576667edff,Midwest,DF,Brasília,Barley,2018,100,100,4.5
806f31b6182bbc90bb5e1d6d5196dc670359d217,Midwest,DF,Brasília,Pea,2018,40,40,7.0
baf47bd13e9014dd74b727e08ca13223252f4ce0,Midwest,DF,Brasília,Barley,2019,100,70,4.5
2ec357c46a3bb717daea073782baa6b42278d772,Midwest,DF,Brasília,Pea,2019,40,40,7.0


Salvando dados em formato .parquet particionado por ano

In [232]:
# Write the table as a Snappy-compressed Parquet dataset partitioned by 'year',
# keeping the artificial_key index in the output.
#
# pyarrow gives the files of each write a fresh GUID-based name, so re-running
# this cell against an existing dataset directory would add a second copy of
# every row instead of replacing the previous one. The extra keyword is
# forwarded by pandas to pyarrow's write_to_dataset; 'delete_matching' empties
# each partition directory the first time it is written to in this call.
# NOTE(review): requires a pyarrow version that supports
# `existing_data_behavior` — confirm against the project environment.
df_plantation.to_parquet(
    path='data/trusted/grains/plantation',
    engine='pyarrow',
    index=True,
    compression='snappy',
    partition_cols=['year'],
    existing_data_behavior='delete_matching',
)