In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import csv
import warnings

In [2]:
# Notebook-wide configuration: silence every warning category and let pandas
# print all rows of a frame instead of truncating the display.
warnings.simplefilter("ignore")
pd.options.display.max_rows = None

In [3]:
def plot_columns(df):
    """Draw one line chart per column of ``df``, each in its own figure.

    The frame's index is used for the x axis and the column values for the
    y axis; the column label becomes the figure title. Colours are taken from
    matplotlib's ``tab20`` palette and wrap around once the palette is
    exhausted, so a frame with more columns than palette entries still gets
    cycling colours instead of the last colour repeated.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted.
    """
    # Wrap on the palette's own size: integer lookups beyond the last entry
    # are clamped by the colormap, which would paint all extra columns alike.
    palette_size = plt.cm.tab20.N
    colors = plt.cm.tab20(np.arange(df.shape[1]) % palette_size)

    for color, column in zip(colors, df.columns):
        plt.figure()
        plt.plot(df.index, df[column], color=color)
        plt.xlabel('Ano')
        plt.ylabel('Valor')
        plt.title(column)
        plt.xticks(rotation=45)

    # A single show() renders every figure created in the loop.
    plt.show()
    
    
def plot_linear_regression(column, linear_a, linear_b, linear_a_half, linear_b_half):
    """Plot a series against two straight lines evaluated on its index.

    ``linear_a``/``linear_b`` are the slope and intercept of the first line
    (drawn in red); ``linear_a_half``/``linear_b_half`` are the slope and
    intercept of the second line (drawn in green). The values of ``column``
    are drawn as a scatter plot over the same x positions.
    """
    positions = column.index
    full_line = linear_a * positions + linear_b
    half_line = linear_a_half * positions + linear_b_half

    plt.figure(figsize=(10, 6))
    plt.scatter(positions, column, label='Original Values')
    plt.plot(positions, full_line, color='red', label='Linear Regression')
    plt.plot(positions, half_line, color='green', label='Recent Linear Regression')
    plt.title('Original Values vs. Linear Regression vs. Recent Linear Regression')
    plt.xlabel('Index')
    plt.ylabel('Value')
    plt.legend()
    plt.grid(True)
    plt.show()

In [4]:
def centered_moving_average(column, window_action, window_size):
    """Smooth part of a series with a centered moving average.

    Every non-missing value whose position lies in ``window_action`` is
    replaced by the mean of the non-missing values found at most
    ``window_size`` positions before or after it. Missing values are left
    untouched.

    Each average is computed from the *input* values, never from values that
    were already smoothed earlier in the same pass, and the input series is
    not modified.

    Parameters
    ----------
    column : pandas.Series
        Numeric series to smooth.
    window_action : tuple[int, int]
        Inclusive ``(start, end)`` positions to smooth. Positions outside the
        series are ignored.
    window_size : int
        Half-width of the window; the full window spans up to
        ``2 * window_size + 1`` positions and is truncated at the series ends.

    Returns
    -------
    pandas.Series
        A new series with the same index and name as ``column``.
    """
    start_index, end_index = window_action

    # Work on private float copies so the caller's data is never written to.
    source = column.to_numpy(dtype=float, copy=True)
    smoothed = source.copy()
    data_length = len(source)

    first = max(start_index, 0)
    last = min(end_index, data_length - 1)

    for i in range(first, last + 1):
        if np.isnan(source[i]):
            continue
        window_start = max(0, i - window_size)
        window_end = min(data_length, i + window_size + 1)
        # source[i] itself is valid, so the window always has a finite value.
        smoothed[i] = np.nanmean(source[window_start:window_end])

    return pd.Series(smoothed, index=column.index, name=column.name)

In [5]:
# Polynomial interpolation data imputation method
def four_ends_filler(column):
    """Fill missing values at the four outermost positions of each end.

    Non-numeric columns are returned untouched. For numeric columns, offsets
    0 to 3 are visited in order; at each offset ``first_filler`` is asked to
    fill position ``offset`` and ``last_filler`` to fill the mirrored position
    ``len(column) - 1 - offset``.
    """
    if not pd.api.types.is_numeric_dtype(column):
        return column

    for offset in range(4):
        mirrored = len(column) - 1 - offset
        column = first_filler(column, offset, mirrored)
        column = last_filler(column, offset, mirrored)
    return column

def four_ends_linear_filler(column):
    """Fill missing values near both ends of a numeric column with straight lines.

    Position 0 and the last position are filled first through ``first_filler``
    and ``last_filler``. Positions 1-3 and the three positions just before the
    last one are then filled, when missing, by evaluating the straight line
    that joins the surrounding valid values. Non-numeric columns are returned
    unchanged. The column is modified in place and also returned.

    NOTE(review): positions are addressed with ``column.at[<int>]``, so this
    presumably expects a 0..n-1 integer index - confirm against callers.
    """
    if  pd.api.types.is_numeric_dtype(column):
        size = len(column) - 1  # label of the last position, not the length
        # Anchor both extremes so every inner gap has a valid value on each side.
        column = first_filler(column, 0, size)
        column = last_filler(column, 0, size)
        for i in range(1, 4):
            # Head side: position i.
            value = column.at[i]
            if (pd.isnull(value) or np.isnan(value) or (value == '')):
                # Position i - 1 was handled on the previous pass (or is the anchor),
                # so it serves as the left end of the line.
                start_first = i - 1
                end_first = next_valid_index(column, i)
                linear_a = (column.at[end_first] - column.at[i - 1])/(end_first - start_first)
                linear_b = column.at[start_first] - linear_a*start_first
                column.at[i] = linear_a*i + linear_b
            # Tail side: x takes the values size - 3, size - 2, size - 1.
            x = size - 4 + i
            value = column.at[x]
            if (pd.isnull(value) or np.isnan(value) or (value == '')):
                start_last = prev_valid_index(column, x)
                end_last = next_valid_index(column, x)
                gap = list(range(start_last + 1, end_last))  # computed but not used below
                linear_a = (column.at[end_last] - column.at[start_last])/(end_last - start_last)
                linear_b = column.at[start_last] - linear_a*start_last
                column.at[x] = linear_a*x + linear_b
    return column

def first_filler(column, init, end):
    """Fill position ``init`` when it is missing.

    The replacement is the first non-missing value found scanning positions
    ``0`` through ``end`` (inclusive); when that range holds no valid value
    the position is set to ``0.0``. The column is modified in place and
    returned.
    """
    def _is_missing(label):
        cell = column.at[label]
        return pd.isnull(cell) or np.isnan(cell)

    if not _is_missing(init):
        return column

    donor = next((label for label in range(end + 1) if not _is_missing(label)), None)
    column.at[init] = 0.0 if donor is None else column.at[donor]
    return column

def extract_valid_values(column):
    """Return the non-missing entries of ``column`` as a new series.

    The result is indexed by the position (0-based enumeration order) each
    kept value had in ``column``.
    """
    kept = [
        (position, value)
        for position, value in enumerate(column)
        if not (pd.isnull(value) or np.isnan(value))
    ]
    positions = [position for position, _ in kept]
    values = [value for _, value in kept]
    return pd.Series(values, index=positions)

def _fit_line(points):
    """Return ``(slope, intercept)`` of the least-squares line of values over index."""
    slope, intercept = np.polyfit(
        np.asarray(points.index, dtype=float),
        points.to_numpy(dtype=float),
        1,
    )
    return slope, intercept


def _clamp(value, lower, upper):
    """Limit ``value`` to ``lower`` from below and ``upper`` from above."""
    if value < lower:
        return lower
    if value > upper:
        return upper
    return value


def linear_regression_first_four_filler(column):
    """Fill missing values in the first and last four positions by regression.

    Two least-squares lines are fitted to the valid values of ``column``: one
    over all of them and one over their first half. A missing position is
    filled with the average of both lines evaluated at that position, limited
    to the range ``[mean / 4, mean + mean / 4]`` where ``mean`` is the mean of
    the valid values. The fitted lines are then plotted with
    ``plot_linear_regression``.

    The lines are fitted with ``numpy.polyfit`` (an ordinary least-squares
    fit with intercept), so no statistics package is needed. Tail positions
    are projected at their own position and cover the last four elements.

    NOTE(review): positions are addressed with ``column.at[<int>]``, so this
    presumably expects a 0..n-1 integer index - confirm against callers.

    Parameters
    ----------
    column : pandas.Series
        Numeric series; modified in place.

    Returns
    -------
    pandas.Series
        The same ``column`` object.

    Raises
    ------
    ValueError
        If fewer than two valid values are available to fit a line.
    """
    valid = extract_valid_values(column)
    if len(valid) < 2:
        raise ValueError(
            "linear_regression_first_four_filler needs at least two valid values"
        )

    valid_mean = valid.mean()
    mean_valid_values_quarter = valid_mean / 4
    mean_valid_values_quarter_upper = valid_mean + mean_valid_values_quarter

    linear_a, linear_b = _fit_line(valid)

    first_half = valid.iloc[:len(valid) // 2]
    if len(first_half) >= 2:
        linear_a_half, linear_b_half = _fit_line(first_half)
    else:
        # Too few points for a separate fit; reuse the full-series line.
        linear_a_half, linear_b_half = linear_a, linear_b

    length = len(column)
    head = range(min(4, length))
    tail = range(max(length - 4, 0), length)
    # Overlapping positions in short columns are harmless: once filled, a
    # position is no longer missing and is skipped on the second visit.
    for position in list(head) + list(tail):
        if pd.isnull(column.at[position]):
            projected = (
                (linear_a * position + linear_b)
                + (linear_a_half * position + linear_b_half)
            ) / 2
            column.at[position] = _clamp(
                projected,
                mean_valid_values_quarter,
                mean_valid_values_quarter_upper,
            )

    plot_linear_regression(column, linear_a, linear_b, linear_a_half, linear_b_half)
    return column
    
def last_filler(column, init, end):
    """Fill position ``end`` when it is missing.

    The replacement is the first non-missing value found scanning backwards
    from the last position of the column down to ``init`` (inclusive); when
    that range holds no valid value the position is set to ``0.0``. The
    column is modified in place and returned.
    """
    target = column.at[end]
    if not (pd.isnull(target) or np.isnan(target)):
        return column

    for label in range(len(column) - 1, init - 1, -1):
        candidate = column.at[label]
        if not (pd.isnull(candidate) or np.isnan(candidate)):
            column.at[end] = candidate
            break
    else:
        # Nothing valid between the end of the column and ``init``.
        column.at[end] = 0.0
    return column
  
def neareast_four_indexes(column, start, end):
    """Return the four-position list found before ``start`` followed by the one after ``end``.

    The two halves come from ``prev_four_valid_indexes(column, start)`` and
    ``next_four_valid_indexes(column, end)`` and are concatenated in that order.
    """
    before = prev_four_valid_indexes(column, start)
    after = next_four_valid_indexes(column, end)
    return before + after

def prev_four_valid_indexes(column, pos):
    """Return ``pos`` and the three nearest non-missing positions before it, ascending.

    When ``pos`` is not strictly between the first and last positions of the
    column, a list of four zeros is returned instead. The search walks
    backwards and may step below zero when fewer than three earlier valid
    positions exist.
    """
    last_label = len(column) - 1
    if not (0 < pos < last_label):
        return [0, 0, 0, 0]

    collected = []
    cursor = pos
    while len(collected) < 4:
        collected.append(cursor)
        cursor -= 1
        # Skip over missing values until a valid one (or the start) is reached.
        while cursor >= 0 and (pd.isnull(column.at[cursor]) or np.isnan(column.at[cursor])):
            cursor -= 1
    collected.reverse()
    return collected

def next_four_valid_indexes(column, pos):
    """Return ``pos`` and the three nearest non-missing positions after it, ascending.

    When ``pos`` is not strictly between the first and last positions of the
    column, a list of four zeros is returned instead. The search walks
    forwards and may step past the last position when fewer than three later
    valid positions exist.
    """
    last_label = len(column) - 1
    if pos <= 0 or pos >= last_label:
        return [0] * 4

    picked = []
    cursor = pos
    for _ in range(4):
        picked.append(cursor)
        cursor += 1
        # Skip over missing values until a valid one (or the end) is reached.
        while cursor <= last_label and (pd.isnull(column.at[cursor]) or np.isnan(column.at[cursor])):
            cursor += 1
    return picked

def prev_valid_index(column, pos):
    """Return the nearest position before ``pos`` that holds a non-missing value.

    The search is performed for every ``pos`` inside the column, including the
    last position. ``-1`` is returned when no earlier valid value exists.
    A ``pos`` outside the column is returned unchanged.

    Parameters
    ----------
    column : pandas.Series
        Series addressed by integer labels 0..n-1.
    pos : int
        Position to search backwards from (exclusive).

    Returns
    -------
    int
        Position of the previous valid value, ``-1`` if there is none, or
        ``pos`` itself when ``pos`` is out of range.
    """
    last_label = len(column) - 1
    if not (0 <= pos <= last_label):
        return pos

    candidate = pos - 1
    while candidate >= 0 and pd.isnull(column.at[candidate]):
        candidate -= 1
    return candidate

def next_valid_index(column, pos):
    """Return the nearest position after ``pos`` that holds a non-missing value.

    ``pos`` is returned unchanged when it is negative or is not before the
    last position. When no later valid value exists the result is one past
    the last position.
    """
    last_label = len(column) - 1
    if not (0 <= pos < last_label):
        return pos

    candidate = pos + 1
    while candidate <= last_label:
        cell = column.at[candidate]
        if not (pd.isnull(cell) or np.isnan(cell)):
            break
        candidate += 1
    return candidate

def interpolating_polynomial(nearest_indexes, nearest_values):
    """Fit a degree-3 polynomial to the given points and return it as ``np.poly1d``."""
    return np.poly1d(np.polyfit(nearest_indexes, nearest_values, deg=3))

def polynomial_filler(column):
    """Fill interior gaps of a column with values from a fitted cubic polynomial.

    For every missing value that lies beyond the first four and before the
    last four positions, the surrounding valid positions are collected with
    ``neareast_four_indexes``, a cubic is fitted through their values, every
    position of the gap is set to the polynomial's value, and the fit is
    plotted. The column is modified in place and also returned.

    NOTE(review): positions are addressed with ``column.at[<int>]``, so this
    presumably expects a 0..n-1 integer index with the four positions at each
    end already filled - confirm against callers.
    """
    size = len(column) -1  # label of the last position, not the length
    # isnull and isna are aliases in pandas, so both tests ask the same question.
    if column.isnull().any() and column.isna().any():
        for i, value in enumerate(column):
            # Only positions 4 .. size - 4 are handled here.
            if (i > 3 and i < size - 3) and (pd.isnull(value) or np.isnan(value) or (value == '')):
                nearest_values = []
                # Valid neighbours delimiting the run of missing values around i.
                start = prev_valid_index(column, i)
                end = next_valid_index(column, i)
                gap = list(range(start + 1, end))
                nearest_indexes = neareast_four_indexes(column, start, end)
                for index in nearest_indexes:
                    nearest_values.append(column.at[index])
                polynomial = interpolating_polynomial(nearest_indexes, nearest_values)
                # The whole gap is filled at once, not just position i.
                for x in gap:
                    column.at[x] = polynomial(x)
                plot_polynomial(polynomial, nearest_indexes[0], nearest_indexes[len(nearest_indexes) - 1], 50, gap, nearest_indexes, column.name)
    return column

def plot_polynomial(polynomial, start, end, precision, interpolated_list, nearest_indexes, column_name):
    """Plot a polynomial together with the points it filled and the points it was fitted on.

    The curve is sampled at ``precision`` evenly spaced x values between
    ``start`` and ``end``. The polynomial's values at ``interpolated_list``
    are marked in red and its values at ``nearest_indexes`` in green.
    ``column_name`` is used as the figure title.
    """
    curve_x = np.linspace(start, end, precision)
    curve_y = polynomial(curve_x)

    plt.figure()
    plt.plot(curve_x, curve_y)

    # Filled positions first (red), then the supporting positions (green).
    for marker_x, marker_color in ((interpolated_list, 'red'), (nearest_indexes, 'green')):
        plt.scatter(marker_x, polynomial(marker_x), color=marker_color)

    plt.xlabel('x')
    plt.ylabel('y')
    plt.title(column_name)

    plt.show()
    
def replace_negative_with_float_zeros(x):
    """Return ``0.0`` for a negative ``x``; return ``x`` itself otherwise."""
    if x < 0:
        return 0.0
    return x

In [6]:
def read_csv_replace_missing_by_polynomial(file_path, skiped_columns = 0, skiped_rows = 0):
    """Read a CSV file and impute the missing values of its data columns.

    Columns at positions ``0`` through ``skiped_columns`` (inclusive) are left
    as read. Every later column has its missing-value markers (``-9999``,
    ``'-'`` and the empty string) turned into NaN, is converted to float and
    is then passed through ``four_ends_filler``, ``centered_moving_average``
    and ``polynomial_filler``, in that order.

    The ``-9999`` marker is recognised both as text and as a number, so it is
    also removed from columns that pandas already parsed as numeric.

    Parameters
    ----------
    file_path : str or path-like
        CSV file to read.
    skiped_columns : int, default 0
        Position of the last column to leave untouched.
    skiped_rows : int, default 0
        Forwarded to ``pandas.read_csv`` as ``skiprows``.

    Returns
    -------
    pandas.DataFrame
        The frame with imputed data columns.
    """
    # pandas opens and closes the file itself; no separate handle is needed.
    df = pd.read_csv(file_path, skiprows=skiped_rows)

    for position, col in enumerate(df.columns):
        if position <= skiped_columns:
            continue

        # Replace the textual missing markers and convert the column to float.
        values = df[col].replace(['-9999', '-', ''], np.nan).astype(float)
        # Catch the sentinel when it was parsed as a number rather than text.
        values = values.mask(values == -9999)

        values = four_ends_filler(values)
        # 24 positions on each side; the original note calls this "2 years",
        # which presumably means monthly rows.
        values = centered_moving_average(values, (0, len(values) - 1), 24)
        df[col] = polynomial_filler(values)

    return df

In [7]:
# Load the savings-deposit table, index it by state, drop the code columns and
# transpose it so that rows are the original value columns and columns are states.
# NOTE(review): after the transpose 'Estado' looks like the name of the column
# axis rather than a column label, so the rename appears to have no effect.
df = (
    pd.read_csv('deposito_poupanca.csv')
    .set_index('Estado')
    .drop(['UF', 'Código'], axis=1)
    .T
    .rename(columns={'Estado': 'UF'})
)
df

Estado,Acre,Alagoas,Amazonas,Amapá,Bahia,Ceará,Distrito Federal,Espírito Santo,Goiás,Maranhão,...,Paraná,Rio de Janeiro,Rio Grande do Norte,Rondônia,Roraima,Rio Grande do Sul,Santa Catarina,Sergipe,São Paulo,Tocantins
2003,247158.77515,1594356.0,1284537.0,205197.602881,7782609.0,4636226.0,4896482.0,4069229.0,3935260.0,1609842.0,...,13120290.0,36752880.0,1843384.0,571416.5,137154.814617,20462810.0,8001967.0,1618045.0,86522990.0,381018.1
2004,259422.782134,1638129.0,1370642.0,208807.697157,8316430.0,4888193.0,5348840.0,4370580.0,4260107.0,1723195.0,...,13311000.0,36679420.0,1989660.0,626482.4,147387.904447,21138960.0,8436550.0,1670497.0,87347210.0,405869.4
2005,291267.223989,1644989.0,1398651.0,214497.40659,8416032.0,5100767.0,5398898.0,4452244.0,4147822.0,1959751.0,...,12830670.0,35436370.0,2042178.0,644900.6,152846.938566,20903560.0,8375574.0,1714598.0,85309590.0,437590.3
2006,320070.059633,1739302.0,1537541.0,233930.355555,9032092.0,4925766.0,5611119.0,4863575.0,4531156.0,2237789.0,...,13178500.0,35873590.0,2252854.0,708848.6,221087.202784,21439240.0,8566029.0,1811475.0,87253090.0,572965.4
2007,359397.120658,1985817.0,1819514.0,294780.37283,10667050.0,5735004.0,6904643.0,5676563.0,5752881.0,2648197.0,...,16013910.0,40774190.0,2573341.0,900588.1,275519.839741,25412550.0,10227380.0,2140601.0,101734300.0,655079.2
2008,475998.129219,2108899.0,2087573.0,411236.260883,11704290.0,6172585.0,7218711.0,5445493.0,6236123.0,3044493.0,...,16963560.0,42458690.0,2640785.0,1044762.0,471886.078343,26363000.0,13857520.0,2345188.0,108260900.0,684977.2
2009,574349.795459,2341276.0,2230064.0,389457.256906,13168880.0,6827420.0,7908179.0,5850375.0,6792006.0,3544671.0,...,18640900.0,47793940.0,2776817.0,1205506.0,414799.404824,28609320.0,14281150.0,2522125.0,119919200.0,744138.1
2010,599379.011,2950453.0,2538483.0,419624.175,15051460.0,7756349.0,8741042.0,6496175.0,7783633.0,3633747.0,...,20834450.0,51917140.0,3021951.0,1470233.0,401300.112,30199190.0,14895020.0,2892812.0,129867700.0,974969.4
2011,577513.618102,3093400.0,2618416.0,423914.006445,15640250.0,8106733.0,8741836.0,7045605.0,8235940.0,3797295.0,...,22140970.0,52933840.0,3102611.0,1559391.0,410851.764119,30217390.0,15726510.0,2929514.0,131044400.0,1038873.0
2012,618593.348202,3324983.0,2847033.0,472371.694694,17204930.0,8768239.0,9687777.0,7802161.0,9239086.0,4129860.0,...,24935680.0,57693420.0,3338127.0,1634068.0,444921.920078,33134470.0,17049200.0,3181092.0,142761800.0,1178756.0


In [8]:
def dataframe_expanded_interpolation(df, periods_per_row=12):
    """Expand each row of ``df`` into several rows by linear interpolation.

    Consecutive input rows end up ``periods_per_row`` positions apart in the
    result, and the rows in between are filled by linear interpolation down
    each column. With the default of 12, ``n`` input rows produce
    ``(n - 1) * 12 + 1`` output rows. The input frame is not modified and its
    index is discarded.

    The frame is built in one step with ``reindex`` instead of concatenating
    empty rows one at a time, which keeps the column dtypes numeric and
    avoids repeated copying.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric columns, one row per coarse period.
    periods_per_row : int, default 12
        Spacing, in output rows, between two consecutive input rows.

    Returns
    -------
    pandas.DataFrame
        Expanded frame indexed 0..m-1.

    Raises
    ------
    ValueError
        If ``periods_per_row`` is smaller than 1.
    """
    if periods_per_row < 1:
        raise ValueError("periods_per_row must be at least 1")

    row_count = len(df)
    if row_count == 0:
        return df.reset_index(drop=True)

    # Place every input row at its final position, then open the gaps.
    anchored = df.reset_index(drop=True)
    anchored.index = np.arange(row_count) * periods_per_row
    total_rows = (row_count - 1) * periods_per_row + 1
    expanded_df = anchored.reindex(pd.RangeIndex(total_rows))

    return expanded_df.interpolate(method='linear', axis=0)

def add_ano_column(df, start_year=2003, periods_per_year=12):
    """Return a copy of ``df`` with an ``'Ano'`` column holding each row's year as text.

    Rows are labelled by position: the first ``periods_per_year`` rows get
    ``start_year``, the next ``periods_per_year`` rows the following year,
    and so on. The labels are assigned positionally, so the result has the
    same number of rows as ``df`` whatever its index looks like.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to label; not modified.
    start_year : int, default 2003
        Year given to the first row.
    periods_per_year : int, default 12
        Number of consecutive rows that share one year.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the extra ``'Ano'`` column of strings.
    """
    new_df = df.copy()
    new_df['Ano'] = [
        str(start_year + position // periods_per_year)
        for position in range(len(new_df))
    ]
    return new_df

def add_month(df):
    """Append a 1-12 counter to every index label of ``df``, in place.

    The row at position ``p`` gets the label ``'<old label>-<p % 12 + 1>'``.
    The same frame object is returned.
    """
    df.index = [
        f'{label}-{position % 12 + 1}'
        for position, label in enumerate(df.index)
    ]
    return df

In [9]:
# Expand the yearly table to one row per month and label each row "<year>-<month>".
pd.options.display.max_rows = None
df_mo_2003 = (
    dataframe_expanded_interpolation(df)
    .pipe(add_ano_column)
    .set_index('Ano')
    .pipe(add_month)
)
df_mo_2003

Estado,Acre,Alagoas,Amazonas,Amapá,Bahia,Ceará,Distrito Federal,Espírito Santo,Goiás,Maranhão,...,Paraná,Rio de Janeiro,Rio Grande do Norte,Rondônia,Roraima,Rio Grande do Sul,Santa Catarina,Sergipe,São Paulo,Tocantins
2003-1,247158.77515,1594356.0,1284537.0,205197.602881,7782609.0,4636226.0,4896482.0,4069229.0,3935260.0,1609842.0,...,13120290.0,36752880.0,1843384.0,571416.5,137154.814617,20462810.0,8001967.0,1618045.0,86522990.0,381018.1
2003-2,248180.775732,1598004.0,1291713.0,205498.444071,7827094.0,4657223.0,4934178.0,4094341.0,3962330.0,1619288.0,...,13136190.0,36746750.0,1855573.0,576005.3,138007.572103,20519160.0,8038182.0,1622416.0,86591680.0,383089.0
2003-3,249202.776314,1601652.0,1298888.0,205799.285261,7871579.0,4678221.0,4971875.0,4119454.0,3989401.0,1628734.0,...,13152080.0,36740630.0,1867763.0,580594.1,138860.329589,20575510.0,8074397.0,1626787.0,86660360.0,385160.0
2003-4,250224.776896,1605299.0,1306063.0,206100.12645,7916064.0,4699218.0,5009571.0,4144566.0,4016472.0,1638180.0,...,13167970.0,36734510.0,1879953.0,585183.0,139713.087075,20631850.0,8110613.0,1631158.0,86729050.0,387230.9
2003-5,251246.777478,1608947.0,1313239.0,206400.96764,7960549.0,4720215.0,5047268.0,4169679.0,4043542.0,1647626.0,...,13183860.0,36728390.0,1892142.0,589771.8,140565.844561,20688200.0,8146828.0,1635529.0,86797730.0,389301.9
2003-6,252268.77806,1612595.0,1320414.0,206701.80883,8005034.0,4741212.0,5084964.0,4194792.0,4070613.0,1657072.0,...,13199750.0,36722270.0,1904332.0,594360.6,141418.602046,20744540.0,8183043.0,1639900.0,86866420.0,391372.8
2003-7,253290.778642,1616242.0,1327590.0,207002.650019,8049519.0,4762209.0,5122661.0,4219904.0,4097683.0,1666518.0,...,13215650.0,36716150.0,1916522.0,598949.4,142271.359532,20800890.0,8219259.0,1644271.0,86935100.0,393443.8
2003-8,254312.779224,1619890.0,1334765.0,207303.491209,8094004.0,4783207.0,5160357.0,4245017.0,4124754.0,1675964.0,...,13231540.0,36710030.0,1928711.0,603538.3,143124.117018,20857230.0,8255474.0,1648642.0,87003790.0,395514.7
2003-9,255334.779806,1623538.0,1341940.0,207604.332399,8138489.0,4804204.0,5198054.0,4270130.0,4151824.0,1685410.0,...,13247430.0,36703910.0,1940901.0,608127.1,143976.874504,20913580.0,8291689.0,1653013.0,87072470.0,397585.6
2003-10,256356.780388,1627186.0,1349116.0,207905.173588,8182974.0,4825201.0,5235750.0,4295242.0,4178895.0,1694856.0,...,13263320.0,36697780.0,1953091.0,612715.9,144829.63199,20969920.0,8327904.0,1657384.0,87141160.0,399656.6


In [10]:
def add_suffix_to_columns(df, suffix):
    """Append ``suffix`` to every column label of ``df``, in place. Returns nothing."""
    renamed = []
    for label in df.columns:
        renamed.append(label + suffix)
    df.columns = renamed

# Tag every state column with the measure it holds. The labels are changed in
# place, so running this cell again appends the suffix a second time.
savings_suffix = " - Depósito Poupança (R$)"
add_suffix_to_columns(df_mo_2003, savings_suffix)
df_mo_2003

Unnamed: 0,Acre - Depósito Poupança (R$),Alagoas - Depósito Poupança (R$),Amazonas - Depósito Poupança (R$),Amapá - Depósito Poupança (R$),Bahia - Depósito Poupança (R$),Ceará - Depósito Poupança (R$),Distrito Federal - Depósito Poupança (R$),Espírito Santo - Depósito Poupança (R$),Goiás - Depósito Poupança (R$),Maranhão - Depósito Poupança (R$),...,Paraná - Depósito Poupança (R$),Rio de Janeiro - Depósito Poupança (R$),Rio Grande do Norte - Depósito Poupança (R$),Rondônia - Depósito Poupança (R$),Roraima - Depósito Poupança (R$),Rio Grande do Sul - Depósito Poupança (R$),Santa Catarina - Depósito Poupança (R$),Sergipe - Depósito Poupança (R$),São Paulo - Depósito Poupança (R$),Tocantins - Depósito Poupança (R$)
2003-1,247158.77515,1594356.0,1284537.0,205197.602881,7782609.0,4636226.0,4896482.0,4069229.0,3935260.0,1609842.0,...,13120290.0,36752880.0,1843384.0,571416.5,137154.814617,20462810.0,8001967.0,1618045.0,86522990.0,381018.1
2003-2,248180.775732,1598004.0,1291713.0,205498.444071,7827094.0,4657223.0,4934178.0,4094341.0,3962330.0,1619288.0,...,13136190.0,36746750.0,1855573.0,576005.3,138007.572103,20519160.0,8038182.0,1622416.0,86591680.0,383089.0
2003-3,249202.776314,1601652.0,1298888.0,205799.285261,7871579.0,4678221.0,4971875.0,4119454.0,3989401.0,1628734.0,...,13152080.0,36740630.0,1867763.0,580594.1,138860.329589,20575510.0,8074397.0,1626787.0,86660360.0,385160.0
2003-4,250224.776896,1605299.0,1306063.0,206100.12645,7916064.0,4699218.0,5009571.0,4144566.0,4016472.0,1638180.0,...,13167970.0,36734510.0,1879953.0,585183.0,139713.087075,20631850.0,8110613.0,1631158.0,86729050.0,387230.9
2003-5,251246.777478,1608947.0,1313239.0,206400.96764,7960549.0,4720215.0,5047268.0,4169679.0,4043542.0,1647626.0,...,13183860.0,36728390.0,1892142.0,589771.8,140565.844561,20688200.0,8146828.0,1635529.0,86797730.0,389301.9
2003-6,252268.77806,1612595.0,1320414.0,206701.80883,8005034.0,4741212.0,5084964.0,4194792.0,4070613.0,1657072.0,...,13199750.0,36722270.0,1904332.0,594360.6,141418.602046,20744540.0,8183043.0,1639900.0,86866420.0,391372.8
2003-7,253290.778642,1616242.0,1327590.0,207002.650019,8049519.0,4762209.0,5122661.0,4219904.0,4097683.0,1666518.0,...,13215650.0,36716150.0,1916522.0,598949.4,142271.359532,20800890.0,8219259.0,1644271.0,86935100.0,393443.8
2003-8,254312.779224,1619890.0,1334765.0,207303.491209,8094004.0,4783207.0,5160357.0,4245017.0,4124754.0,1675964.0,...,13231540.0,36710030.0,1928711.0,603538.3,143124.117018,20857230.0,8255474.0,1648642.0,87003790.0,395514.7
2003-9,255334.779806,1623538.0,1341940.0,207604.332399,8138489.0,4804204.0,5198054.0,4270130.0,4151824.0,1685410.0,...,13247430.0,36703910.0,1940901.0,608127.1,143976.874504,20913580.0,8291689.0,1653013.0,87072470.0,397585.6
2003-10,256356.780388,1627186.0,1349116.0,207905.173588,8182974.0,4825201.0,5235750.0,4295242.0,4178895.0,1694856.0,...,13263320.0,36697780.0,1953091.0,612715.9,144829.63199,20969920.0,8327904.0,1657384.0,87141160.0,399656.6


In [11]:
# Write one CSV file per column, keeping the row labels as the first field.
for state_label in df_mo_2003.columns:
    df_mo_2003[[state_label]].to_csv(f"2003_mo_dep_poup_{state_label}.csv", index=True)