In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import csv
import warnings
import glob

In [2]:
warnings.filterwarnings("ignore")

In [3]:
def plot_columns(df):
    colors = plt.cm.tab20(range(df.shape[1]))  

    for i, column in enumerate(df.columns):
        color = colors[i % 24]

        plt.figure()
        plt.plot(df.index, df[column], color=color)
        plt.xlabel('Ano')
        plt.ylabel('Valor')
        plt.title(column)

    plt.show()

In [4]:
def centered_moving_average(column, window_action, window_size):
    start_index, end_index = window_action
    data = column.to_numpy()

    data_length = len(data)

    for i in range(start_index, end_index + 1):
        if not np.isnan(data[i]):
            init = data[i]
            start = max(0, i - window_size)
            end = min(data_length, i + window_size + 1)
            window = data[start:end]
            valid_values = [value for value in window if not pd.isna(value)]
            if len(valid_values) > 0:
                data[i] = np.mean(valid_values)

    modified_column = pd.Series(data, name=column.name)

    return modified_column

In [5]:
# Linear Interpolation 
def ends_filler(column):
    if  column.dtype.kind in ['i', 'f']:
            column = first_filler(column)
            column = last_filler(column)
    return column

def first_filler(column):
    if pd.isnull(column.at[0]) or np.isnan(column.at[0]):
        size = len(column) -1
        first_valid_index = 0
        while first_valid_index <= size and (pd.isnull(column.at[first_valid_index]) or np.isnan(column.at[first_valid_index])):
            first_valid_index += 1
        if first_valid_index > size:
            column.at[0] = 0.0
        else:
            column.at[0] = column.at[first_valid_index]
    return column
    
def last_filler(column):
    if pd.isnull(column.at[len(column) -1]) or np.isnan(column.at[len(column) -1]):  
        last_valid_index = len(column) -1
        while last_valid_index >= 0 and (pd.isnull(column.at[last_valid_index]) or np.isnan(column.at[last_valid_index])):
            last_valid_index -= 1
        if last_valid_index < 0:
            column.at[len(column) -1] = 0.0
        else:
            column.at[len(column) -1] = column.at[last_valid_index]
    return column
    
def prev_valid_index(column, pos):
    prev_index = pos
    size = len(column) -1
    if pos > 0 and pos < size:
        while prev_index >= 0 and (pd.isnull(column.at[prev_index]) or np.isnan(column.at[prev_index])):
            prev_index -= 1
    return prev_index  

def next_valid_index(column, pos):
    next_index = pos
    size = len(column) -1
    if pos > 0 and pos < size:
        while next_index <= size and (pd.isnull(column.at[next_index]) or np.isnan(column.at[next_index])):
            next_index += 1
    return next_index

def linear_filler(column):
    if column.isnull().any():
        for i, value in enumerate(column):
            if i > 0 and (pd.isnull(value) or np.isnan(value) or (value == '')):
                start = prev_valid_index(column, i)
                end = next_valid_index(column, i)
                gap = list(range(start + 1, end))
                linear_a = (column.at[end] - column.at[start])/(end - start)
                linear_b = column.at[start] - linear_a*start
                for x in gap:
                    column.at[x] = linear_a*x + linear_b
    return column 

In [6]:
def read_csv_replace_missing_by_linear(file_path, skiped_rows):
    with open(file_path, 'r') as f:
        
        df = pd.read_csv(file_path, skiprows=skiped_rows)
        
        i = 0
        for col in df.columns: # Elimina missing values por médias e transforma em float
            if i > 0:
                df[col] = df[col].replace('-9999', np.nan)
                df[col] = df[col].replace('-', np.nan)
                df[col] = df[col].replace('', np.nan).astype(float)
                df[col] = ends_filler(df[col])
                df[col] = linear_filler(df[col])
                df[col] = centered_moving_average(df[col], (0, len(df[col]) - 1), 24) #2 anos
            i += 1
        return df

In [7]:
df = pd.read_csv("producao_cimento_mensal.csv")
df

Unnamed: 0,Ano,Rondônia - Produção de Cimento (t),Acre - Produção de Cimento (t),Amazonas - Produção de Cimento (t),Roraima - Produção de Cimento (t),Pará - Produção de Cimento (t),Amapá - Produção de Cimento (t),Tocantins - Produção de Cimento (t),anhã-03o - Produção de Cimento (t),Piauí - Produção de Cimento (t),...,Espírito Santo - Produção de Cimento (t),Rio De eiro-01 - Produção de Cimento (t),São Paulo - Produção de Cimento (t),Paraná - Produção de Cimento (t),Santa Catarina - Produção de Cimento (t),Rio Grande Do Sul - Produção de Cimento (t),Mato Grosso - Produção de Cimento (t),Mato Grosso Do Sul - Produção de Cimento (t),Goiás - Produção de Cimento (t),Distrito Federal - Produção de Cimento (t)
0,2003-01,,,43.382,,48.623,,,28.018,24.654,...,92.683,187.848,477.117,303.601,27.924,145.214,42.565,47.337,42.953,126.390
1,2003-02,,,42.997,,46.078,,,14.332,18.925,...,105.642,195.615,453.300,311.711,22.293,137.995,40.029,51.606,44.601,123.096
2,2003-03,,,51.061,,49.168,,,14.467,20.817,...,101.219,190.508,473.540,342.078,24.535,145.249,41.516,53.027,49.219,131.225
3,2003-04,,,47.683,,43.070,,,14.994,27.865,...,94.729,194.587,392.761,323.295,22.897,148.079,39.776,45.130,47.291,110.056
4,2003-05,,,48.669,,46.143,,,19.965,26.858,...,103.394,224.502,443.012,349.634,27.046,144.405,48.527,47.487,46.127,136.397
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,2022-08,24.244,,44.693,,91.763,,72.450,34.701,,...,69.833,243.725,530.887,680.375,175.057,137.165,164.358,85.640,208.188,297.418
236,2022-09,24.253,,44.998,,73.627,,85.290,37.580,,...,67.098,243.967,515.169,647.041,142.040,134.735,143.322,64.120,210.928,259.645
237,2022-10,24.361,,39.935,,74.401,,92.635,33.576,,...,65.730,251.496,507.849,600.689,159.737,145.818,150.847,66.870,217.502,271.658
238,2022-11,20.812,,36.284,,83.422,,68.654,38.146,,...,58.759,225.893,504.378,635.646,164.222,141.853,110.200,72.857,124.710,239.262


In [8]:
df = read_csv_replace_missing_by_linear("producao_cimento_mensal.csv",0)
df

Unnamed: 0,Ano,Rondônia - Produção de Cimento (t),Acre - Produção de Cimento (t),Amazonas - Produção de Cimento (t),Roraima - Produção de Cimento (t),Pará - Produção de Cimento (t),Amapá - Produção de Cimento (t),Tocantins - Produção de Cimento (t),anhã-03o - Produção de Cimento (t),Piauí - Produção de Cimento (t),...,Espírito Santo - Produção de Cimento (t),Rio De eiro-01 - Produção de Cimento (t),São Paulo - Produção de Cimento (t),Paraná - Produção de Cimento (t),Santa Catarina - Produção de Cimento (t),Rio Grande Do Sul - Produção de Cimento (t),Mato Grosso - Produção de Cimento (t),Mato Grosso Do Sul - Produção de Cimento (t),Goiás - Produção de Cimento (t),Distrito Federal - Produção de Cimento (t)
0,2003-01,4.987000,0.0,51.313080,0.0,49.618560,0.0,12.733000,23.368400,24.669760,...,98.171240,199.081280,437.972720,332.705560,20.216800,135.543840,50.917240,53.166560,56.459960,156.625720
1,2003-02,4.987000,0.0,51.554195,0.0,49.658098,0.0,12.733000,23.167977,25.167145,...,97.897317,198.561780,434.363720,332.945945,19.631031,134.468148,50.851586,53.245714,56.789152,157.678528
2,2003-03,4.987000,0.0,51.523306,0.0,49.906876,0.0,12.733000,23.563273,25.520404,...,98.070539,198.773336,433.350868,335.222130,19.400920,134.242963,50.970549,52.994640,57.140263,159.190268
3,2003-04,4.987000,0.0,51.639628,0.0,49.864305,0.0,12.733000,24.008345,25.519725,...,98.460789,198.753800,431.825332,334.785737,19.153741,133.169391,51.049978,53.019283,57.649192,160.688376
4,2003-05,4.987000,0.0,51.838456,0.0,50.179753,0.0,12.733000,24.366896,25.929312,...,99.084134,198.721455,433.287298,335.050185,18.912155,132.046874,51.567529,53.539593,58.213295,162.847410
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,2022-08,20.931530,0.0,39.352061,0.0,75.899018,0.0,67.097178,31.724066,12.957215,...,66.919726,232.917513,515.306495,616.073841,155.321494,133.034199,119.544326,72.097125,175.674487,250.077918
236,2022-09,20.812259,0.0,39.160630,0.0,75.416567,0.0,67.077411,31.679231,12.965575,...,66.756019,232.658000,514.812591,614.423079,154.790929,133.001080,118.223448,71.657488,174.614843,248.545664
237,2022-10,20.687985,0.0,38.937187,0.0,75.559167,0.0,66.563807,31.524870,12.977734,...,66.684567,232.339116,514.856452,613.772150,155.428377,133.052788,117.524152,71.966263,173.307223,248.265413
238,2022-11,20.551024,0.0,38.889598,0.0,75.696005,0.0,65.682099,31.512212,12.994123,...,66.664258,231.667333,515.215930,614.761140,155.448086,132.665535,116.430559,72.205635,171.556538,247.486640


In [9]:
df_year = df.copy()
df_year['Ano'] = df_year['Ano'].str[:4]
df_year = df_year.groupby('Ano').sum()
df_year.index = df_year.index.astype(int)
df_year

Unnamed: 0_level_0,Rondônia - Produção de Cimento (t),Acre - Produção de Cimento (t),Amazonas - Produção de Cimento (t),Roraima - Produção de Cimento (t),Pará - Produção de Cimento (t),Amapá - Produção de Cimento (t),Tocantins - Produção de Cimento (t),anhã-03o - Produção de Cimento (t),Piauí - Produção de Cimento (t),Ceará - Produção de Cimento (t),...,Espírito Santo - Produção de Cimento (t),Rio De eiro-01 - Produção de Cimento (t),São Paulo - Produção de Cimento (t),Paraná - Produção de Cimento (t),Santa Catarina - Produção de Cimento (t),Rio Grande Do Sul - Produção de Cimento (t),Mato Grosso - Produção de Cimento (t),Mato Grosso Do Sul - Produção de Cimento (t),Goiás - Produção de Cimento (t),Distrito Federal - Produção de Cimento (t)
Ano,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2003,59.844,0.0,623.227503,0.0,618.075773,0.0,152.796,293.705283,317.286319,1338.225887,...,1197.520265,2377.549731,5232.682934,4018.882255,226.842795,1576.618236,620.268108,640.534672,707.997208,2006.510747
2004,59.844,0.0,631.482259,0.0,678.161979,0.0,152.796,308.130986,351.667485,1388.230832,...,1241.084984,2400.424606,5521.391778,4010.031842,221.696363,1530.612911,624.909873,650.203551,747.779683,2207.532012
2005,59.844,0.0,637.838656,0.0,747.048967,0.0,152.796,308.093212,357.819089,1438.751582,...,1360.910937,2511.833086,6102.302057,4028.76863,223.704012,1526.253731,653.029381,663.50115,773.277269,2364.080883
2006,59.844,0.0,664.588528,0.0,874.128576,0.0,152.796,322.991255,386.232189,1543.865085,...,1619.896813,2676.431443,6788.458005,4330.599545,265.456919,1625.631583,727.816146,712.049517,813.650774,2552.261933
2007,64.041487,0.0,691.804806,0.0,1030.601051,0.0,156.499264,339.260912,445.515113,1620.180069,...,1900.181823,2858.554649,7337.194102,4749.071195,322.873579,1786.663086,812.735644,783.793548,924.314058,2607.657436
2008,150.448962,0.0,694.929186,0.0,1172.693755,0.0,199.994423,352.164879,510.644488,1697.104012,...,2065.641254,3010.012911,7770.92933,5058.37726,354.241699,1892.120992,869.834453,831.774473,1049.218132,2685.923746
2009,336.177608,0.0,715.226516,0.0,1310.052663,0.0,305.829017,363.707506,564.654167,1747.798488,...,2200.12774,3108.577595,8316.504885,5346.288081,431.347926,1987.964633,949.083048,863.877907,1167.8385,2935.35756
2010,502.201514,0.0,753.12562,0.0,1443.581473,0.0,421.327526,424.05286,595.360558,1745.258962,...,2376.087362,3181.826392,8835.509117,5527.794738,792.441715,2093.049582,1038.662223,916.677483,1306.390438,3227.460076
2011,583.009861,0.0,768.056058,0.0,1536.079376,0.0,483.313605,558.996171,612.06359,1751.304575,...,2497.788723,3209.638424,9192.769768,5623.811148,1268.083588,2163.054486,1134.526356,958.130317,1430.447037,3532.866309
2012,560.443805,0.0,750.56471,0.0,1569.763418,0.0,510.546944,688.169638,616.463496,1769.225958,...,2521.227215,3199.16978,9435.198835,5779.04424,1549.260439,2145.073736,1291.161206,956.016292,1500.429389,3799.093892


In [10]:
df_2014 = df_year[df_year.index >= 2014]
df_2014

Unnamed: 0_level_0,Rondônia - Produção de Cimento (t),Acre - Produção de Cimento (t),Amazonas - Produção de Cimento (t),Roraima - Produção de Cimento (t),Pará - Produção de Cimento (t),Amapá - Produção de Cimento (t),Tocantins - Produção de Cimento (t),anhã-03o - Produção de Cimento (t),Piauí - Produção de Cimento (t),Ceará - Produção de Cimento (t),...,Espírito Santo - Produção de Cimento (t),Rio De eiro-01 - Produção de Cimento (t),São Paulo - Produção de Cimento (t),Paraná - Produção de Cimento (t),Santa Catarina - Produção de Cimento (t),Rio Grande Do Sul - Produção de Cimento (t),Mato Grosso - Produção de Cimento (t),Mato Grosso Do Sul - Produção de Cimento (t),Goiás - Produção de Cimento (t),Distrito Federal - Produção de Cimento (t)
Ano,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014,357.677944,0.0,629.906817,0.0,1427.236076,0.0,534.422193,687.203882,534.569599,1936.472431,...,2184.851676,2795.256292,8264.309556,5788.881494,1598.981415,1855.561669,1268.814714,886.944606,1413.139667,3551.813908
2015,276.938163,0.0,585.627424,0.0,1386.312966,0.0,515.383793,630.565434,485.544522,2117.547255,...,1939.379336,2440.539373,7069.711323,5749.529291,1488.521029,1702.806499,1123.345627,822.691374,1402.398186,3186.773594
2016,239.494881,0.0,527.123225,0.0,1292.085356,0.0,535.186108,538.170446,383.195013,2241.119301,...,1632.640239,2231.595625,6155.235871,5786.537903,1430.320858,1556.399041,1075.219903,737.430411,1460.206906,2900.209377
2017,226.902776,0.0,447.66544,0.0,1128.42489,0.0,580.856957,440.490154,274.263754,2243.972991,...,1275.568674,2351.836251,5693.780428,5902.212019,1433.051517,1456.19422,1111.839984,688.444085,1589.529694,2730.619877
2018,216.336406,0.0,427.76222,0.0,977.128293,0.0,599.811915,379.224267,214.602396,2244.384216,...,995.354714,2556.857799,5550.699605,6085.585319,1463.153616,1416.372254,1139.872508,702.799997,1731.93456,2633.351748
2019,233.109363,0.0,464.893117,0.0,897.035649,0.0,654.754815,358.628076,182.065812,2418.807067,...,869.405656,2665.298375,5770.714254,6533.257168,1592.460639,1462.841074,1211.033241,762.792596,1895.220271,2741.33135
2020,246.643166,0.0,474.980247,0.0,879.19261,0.0,735.747847,359.150605,164.951348,2641.33129,...,826.251278,2738.848482,6097.048581,7078.134333,1768.446223,1543.713344,1314.665925,834.473939,2042.47452,2906.704912
2021,244.246981,0.0,468.326664,0.0,897.36164,0.0,778.679012,369.064602,157.369748,2791.618748,...,810.222414,2784.919503,6203.315351,7344.760727,1855.413058,1580.761098,1390.580529,865.320803,2103.272798,2968.281625
2022,249.207038,0.0,470.48383,0.0,908.295113,0.0,796.102588,376.891747,155.633436,2845.448417,...,803.434154,2789.128743,6190.942191,7392.021558,1866.444211,1589.078587,1424.317956,869.49175,2106.312452,2994.398412


In [13]:
for column in df_2014.columns:
    file_name = f"2014_01_cons_prod_{column}.csv"
    column_df = df_2014[[column]]
    column_df.to_csv(file_name, index=True)

In [11]:
df_2008 = df_year[df_year.index >= 2008]
df_2008

Unnamed: 0_level_0,Rondônia - Produção de Cimento (t),Acre - Produção de Cimento (t),Amazonas - Produção de Cimento (t),Roraima - Produção de Cimento (t),Pará - Produção de Cimento (t),Amapá - Produção de Cimento (t),Tocantins - Produção de Cimento (t),anhã-03o - Produção de Cimento (t),Piauí - Produção de Cimento (t),Ceará - Produção de Cimento (t),...,Espírito Santo - Produção de Cimento (t),Rio De eiro-01 - Produção de Cimento (t),São Paulo - Produção de Cimento (t),Paraná - Produção de Cimento (t),Santa Catarina - Produção de Cimento (t),Rio Grande Do Sul - Produção de Cimento (t),Mato Grosso - Produção de Cimento (t),Mato Grosso Do Sul - Produção de Cimento (t),Goiás - Produção de Cimento (t),Distrito Federal - Produção de Cimento (t)
Ano,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008,150.448962,0.0,694.929186,0.0,1172.693755,0.0,199.994423,352.164879,510.644488,1697.104012,...,2065.641254,3010.012911,7770.92933,5058.37726,354.241699,1892.120992,869.834453,831.774473,1049.218132,2685.923746
2009,336.177608,0.0,715.226516,0.0,1310.052663,0.0,305.829017,363.707506,564.654167,1747.798488,...,2200.12774,3108.577595,8316.504885,5346.288081,431.347926,1987.964633,949.083048,863.877907,1167.8385,2935.35756
2010,502.201514,0.0,753.12562,0.0,1443.581473,0.0,421.327526,424.05286,595.360558,1745.258962,...,2376.087362,3181.826392,8835.509117,5527.794738,792.441715,2093.049582,1038.662223,916.677483,1306.390438,3227.460076
2011,583.009861,0.0,768.056058,0.0,1536.079376,0.0,483.313605,558.996171,612.06359,1751.304575,...,2497.788723,3209.638424,9192.769768,5623.811148,1268.083588,2163.054486,1134.526356,958.130317,1430.447037,3532.866309
2012,560.443805,0.0,750.56471,0.0,1569.763418,0.0,510.546944,688.169638,616.463496,1769.225958,...,2521.227215,3199.16978,9435.198835,5779.04424,1549.260439,2145.073736,1291.161206,956.016292,1500.429389,3799.093892
2013,460.001047,0.0,694.726016,0.0,1509.35799,0.0,530.756991,719.208496,585.346961,1802.564432,...,2409.53271,3078.657877,9197.980881,5855.68523,1645.417258,2014.296886,1371.926397,928.918952,1468.42945,3818.436361
2014,357.677944,0.0,629.906817,0.0,1427.236076,0.0,534.422193,687.203882,534.569599,1936.472431,...,2184.851676,2795.256292,8264.309556,5788.881494,1598.981415,1855.561669,1268.814714,886.944606,1413.139667,3551.813908
2015,276.938163,0.0,585.627424,0.0,1386.312966,0.0,515.383793,630.565434,485.544522,2117.547255,...,1939.379336,2440.539373,7069.711323,5749.529291,1488.521029,1702.806499,1123.345627,822.691374,1402.398186,3186.773594
2016,239.494881,0.0,527.123225,0.0,1292.085356,0.0,535.186108,538.170446,383.195013,2241.119301,...,1632.640239,2231.595625,6155.235871,5786.537903,1430.320858,1556.399041,1075.219903,737.430411,1460.206906,2900.209377
2017,226.902776,0.0,447.66544,0.0,1128.42489,0.0,580.856957,440.490154,274.263754,2243.972991,...,1275.568674,2351.836251,5693.780428,5902.212019,1433.051517,1456.19422,1111.839984,688.444085,1589.529694,2730.619877


In [12]:
for column in df_2008.columns:
    file_name = f"2008_01_cons_prod_{column}.csv"
    column_df = df_2008[[column]]
    column_df.to_csv(file_name, index=True)