In [2]:
import pandas as pd

def weighted_slope(start_value, end_value, start_index, end_index):

    distance = end_index - start_index
    weight_start = distance / (start_index - 0.5)  # Assign a higher weight to the starting point
    weight_end = distance / (end_index - 0.5)    # Assign a higher weight to the ending point

    # Calculate the slope with weighted contributions
    slope = (end_value - start_value) / (end_index - start_index)
    weighted_slope = slope * ((weight_start + weight_end) / 2)

    #weighted_slope = (end_value - start_value) / (end_index - start_index)
    
    return round(weighted_slope)


def consecutive_extrapolation(buurt_data, start_index, end_index):

    # Make sure right data type, improve 
    buurt_values = buurt_data.astype('Int64')

    smal_known_i = buurt_values.first_valid_index()
    larg_known_i = buurt_values.last_valid_index()

    
    # Compute slope between first and last known point
    w_slope = weighted_slope(buurt_values.iloc[smal_known_i], buurt_values.iloc[larg_known_i], smal_known_i, larg_known_i)

    # When the consecutive missing values occur at the beginning, begin iterating at the end.
    if start_index == 0:
        loop_start = len(buurt_values) - 1
        loop_end = start_index - 1
        direction = -1
    
    # When the consecutive missing values occur at the end or in between, begin iterating at the start.
    else:
        loop_start = 0
        loop_end = len(buurt_values)
        direction = 1
        

    for i in range(loop_start, loop_end, direction):

        valid_indices = buurt_values[buurt_values.notna()].index

        if pd.isna(buurt_values.iloc[i]):  # Only extrapolate missing values
            
            prev_index = valid_indices[valid_indices < i]
            next_index = valid_indices[valid_indices > i]

            if not prev_index.empty and not next_index.empty: 

                prev_known_value = buurt_values.iloc[prev_index.max()] 
                next_known_value = buurt_values.iloc[next_index.min()]

                slope = weighted_slope(prev_known_value, next_known_value, prev_index.max(), next_index.min())

                if start_index == 0:
                    buurt_values.iloc[i] = buurt_values.iloc[i + 1] - slope
                    
                else: 
                    buurt_values.iloc[i] = buurt_values.iloc[i - 1] + slope

                # print(prev_known_value, next_known_value)
                # print(buurt_values.iloc[i], slope)

            
            elif prev_index.empty and not next_index.empty: 
                next_known_value = buurt_values.iloc[next_index.min()]
                

                
                second_known_index = next_index[1]                
                second_known_value = buurt_values.iloc[second_known_index]

                slope = weighted_slope(next_known_value, second_known_value, next_index.min(), second_known_index)
                
                buurt_values.iloc[i] = buurt_values.iloc[i+1] - slope



            elif not prev_index.empty and next_index.empty: 
                prev_known_value = buurt_values.iloc[prev_index.max()]

                
                second_known_index = prev_index[len(prev_index)-2]
                second_known_value = buurt_values.iloc[second_known_index]


                slope = weighted_slope(second_known_value, prev_known_value, second_known_index, prev_index.max())

                buurt_values.iloc[i] = buurt_values.iloc[i-1] + slope 
                

    return buurt_values


# # verspreide huizen veendam [203, 205, 205, 211, 254, 276, 278, 292, 327, 353]
# data1 = pd.Series([203, pd.NA, 205, 211, pd.NA, pd.NA, pd.NA, 292, 327, 353])


# Bornsche maten [282, 297, 291, 307, 340, 356, 382, 423, 496, 529]
#data2 = pd.Series([pd.NA, 297, pd.NA, pd.NA, 340, 356, 382, pd.NA, pd.NA, pd.NA])

data3 = pd.Series([pd.NA, pd.NA, pd.NA, pd.NA, 135, 140, 148, 159, 194, 203])



# consecutive_extrapolation(data1, 4, 6)


# result = consecutive_extrapolation(data2, 7, 9)

result = consecutive_extrapolation(data3, 0, 4)

print(list(result))




[134, 134, 134, 134, 135, 140, 148, 159, 194, 203]


In [1]:
### import matplotlib.pyplot as plt


### Print linearity per colomen
####
from itertools import islice
from scipy.stats import linregress


def linearity(df):

    columns = df.columns

    for col in columns[5:]:
        buurten = df.groupby('gwb_code')[col]

        print(f'Processing column: {col}')

        
        num_rows = 52960
        missing_count = df[col].isnull().sum()        
        if missing_count > 0:
            proc = round((missing_count/num_rows) * 100, 2)

            if proc > 3:
                #print(f'Processing column: {col}')
                buurt_count = 0
                r_ = 0

            else:
                continue

        else:
            continue
        
        #for buurt in islice(buurten, 8000):
        for buurt in buurten:
            
            # buurt is a tuple: (group_name, group_data)
            buurt_code, buurt_data = buurt

            if buurt_data.isnull().any():
                
                # identife x and y, for linregression
                x = buurt_data.dropna().index
                y = buurt_data.dropna()

                if x.empty or y.empty:
                    r_ =+ 0
                    buurt_count =+ 1
                    

                else:   
                    slope, intercept, r_value, p_value, std_err = linregress(x, y)
                    r_sqrt = r_value**2 
                        
                    r_ =+ r_sqrt 
                    buurt_count =+ 1

        
        r_mean = r_ / buurt_count
        print(f'For column: {col} --> r_value in missing columns: {r_mean}')
                


In [None]:
def spline_interpolation(buurt_data, missing):

    missing_index = buurt_data[buurt_data.isna()].index
    print(missing_index)
    buurt_values = buurt_data.astype('Int64')    
    
    buurt_values_P = buurt_values.interpolate(method='polynomial', order=2)
    pred_P = buurt_values_P.iloc[missing_index]

    buurt_values_S = buurt_values.interpolate(method='spline', order=2)
    pred_S = buurt_values_S.iloc[missing_index]
    #print(list(pred_S)[0])

    # acc_P = (1 - np.abs(list(pred_P)[0] - missing) / missing) * 100
    # acc_S = (1 - np.abs(list(pred_S)[0] - missing) / missing) * 100


    mae_P = np.mean(np.abs(pred_P - missing))  # Mean Absolute Error for Polynomial
    mae_S = np.mean(np.abs(pred_S - missing))  # Mean Absolute Error for Spline

    # print(pred_P, missing)
    
    # print(mae_P)
    # print(mae_S)
    # print(f"RealValue {missing} --- accuracy Poly {pred_P.tolist()} with {acc_P},    and Spline {pred_S.tolist()} with {acc_S}")
    
    #return buurt_values_P#, buurt_values_S = buurt_values.interpolate(method='spline', order=2)



#data1 = pd.Series([408, 547, 598, 560, 577, 632, pd.NA, 479, 355, 378])


data2 = pd.Series([373, 355, 375, pd.NA, pd.NA, 295, 294, 329, 393, 404]) # missing = 389, 304
data3 = pd.Series([217, 217, 227, pd.NA, 188, 202, 224, 268, 278]) # 176
data4 = pd.Series([555, 524, 504, 519, pd.NA, 525, 525, 561, 644, 637]) # 506



spline_interpolation(data2, [389, 304])
# spline_interpolation(data3, 176)
# spline_interpolation(data4, 506)




def spline_interpolation2(buurt_data):


    buurt_values = buurt_data.astype('Int64')    
    
    buurt_values_P = buurt_values.interpolate(method='polynomial', order=2)

    buurt_values_S = buurt_values.interpolate(method='spline', order=2)

    return buurt_values_P, buurt_values_S

data2 = pd.Series([373, 355, 375, pd.NA, pd.NA, 295, 294, 329, 393, 404]) # missing = 389, 304
data3 = pd.Series([217, 217, 227, pd.NA, 188, 202, 224, 268, 278]) # 176
data4 = pd.Series([555, 524, 504, 519, pd.NA, 525, 525, 561, 644, 637]) # 506


data5 = pd.Series([pd.NA, 524, 504, 519, pd.NA, 525, 525, pd.NA, pd.NA, pd.NA]) # 506

buurt_values_P, buurt_values_S = spline_interpolation2(data5)
print(buurt_values_P)
print(buurt_values_S)

# spline_interpolation(data3, 176)
# spline_interpolation(data4, 506)





In [None]:
def weighted_slope(start_value, end_value, start_index, end_index):
    # Adjust weights to favor nearer points
    weight_start = 1 / (start_index + 1)  # Higher weight for closer points
    weight_end = 1 / (end_index + 1)     # Higher weight for closer points

    # Calculate weighted slope
    slope = (end_value - start_value) / (end_index - start_index)
    weighted_slope = slope * ((weight_start + weight_end) / 2)
    
    return weighted_slope

def consecutive_extrapolation(buurt_data):
    buurt_values = buurt_data.astype('Float64')  # Ensure float for calculations
    
    # Indices of known and missing values
    known_indices = buurt_values[buurt_values.notna()].index
    missing_indices = buurt_values[buurt_values.isna()].index

    for i in missing_indices:
        # Identify closest known indices before and after
        prev_known_indexes = known_indices[known_indices < i]
        next_known_indexes = known_indices[known_indices > i]

        if not prev_known_indexes.empty and not next_known_indexes.empty:
            # Interpolate between two known values
            prev_known_index = prev_known_indexes.max()
            prev_value = buurt_values.iloc[prev_known_index]
            
            next_known_index = next_known_indexes.min()
            next_value = buurt_values.iloc[next_known_index]

            print(prev_value, next_value)
            slope = weighted_slope(prev_value, next_value, prev_known_index, next_known_index)
            buurt_values.iloc[i] = prev_value + slope * (i - prev_known_index)

        elif not prev_known_indexes.empty and next_known_indexes.empty:
            # Extrapolate based on previous known value

            prev_known_index = prev_known_indexes.max()
            prev_value = buurt_values.iloc[prev_known_index]
            
            second_prev_index = known_indices[known_indices < prev_known_index] if any(known_indices < prev_known_index) else None
            
            if second_prev_index is not None:
                second_prev_value = buurt_values.iloc[second_prev_index]
                slope = weighted_slope(second_prev_value, prev_value, second_prev_index, prev_known_index)
                buurt_values.iloc[i] = prev_value + slope * (i - prev_known_index)

        elif prev_known_indexes.empty and not next_known_indexes.empty:
            # Extrapolate based on next known value

            next_known_index = next_known_indexes.min()
            next_value = buurt_values.iloc[next_known_index]

            if next_known_index < len(buurt_values):  # Ensure next_known_index is within bounds

                second_next_index = known_indices[known_indices > next_known_index]if any(known_indices > next_known_index) else None
                
                if second_next_index is not None:
                    second_next_value = buurt_values.iloc[second_next_index]
                    slope = weighted_slope(next_value, second_next_value, next_known_index, second_next_index)
                    buurt_values.iloc[i] = next_value - slope * (next_known_index - i)

    return buurt_values

# data1 = pd.Series([pd.NA, pd.NA, pd.NA, 308, pd.NA, 421, 479, pd.NA, pd.NA, 605]) # 479
# data2 = pd.Series([pd.NA, pd.NA, pd.NA, 308, pd.NA, 421, 479, pd.NA, pd.NA, pd.NA]) # 479

# consecutive_extrapolation(data1)


data3 = pd.Series([87, 87, 88, 94, 103, <NA>, <NA>, 163, 177, 181]])


consecutive_extrapolation(data3)

In [None]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from numpy.lib.stride_tricks import sliding_window_view

def mice(buurt_data, win_size=3):
    # Convert the input data to a DataFrame
    train = pd.DataFrame(buurt_data, columns=['y'])
    
    # Create rolling windows
    X = sliding_window_view(train['y'].values, win_size)
    
    # Add month info (assuming the index is a datetime index)
    # Here we create a mock index for demonstration purposes
    train.index = pd.date_range(start='2023-01-01', periods=len(train), freq='M')
    month_info = train.index.month.values[:len(X)]
    
    # Add month info to the windows
    X = np.concatenate((X, month_info.reshape(-1, 1)), axis=1)

    # Impute missing values using IterativeImputer
    imp = IterativeImputer(random_state=0)
    X_imp = imp.fit_transform(X)

    # Extract the imputed values
    matrix = X_imp[:, :win_size]
    
    # Collect diagonals for mean and std calculation
    diags = [matrix[::-1, :].diagonal(i) for i in range(-matrix.shape[0] + 1, matrix.shape[1])]

    # Prepare the DataFrame to store results
    train['y_mean'] = np.nan
    train['y_std'] = np.nan

    # Calculate mean and std for each diagonal
    for i, v in enumerate(diags):
        if len(v) > 0:  # Ensure there are values to calculate mean/std
            train.iloc[i, 1] = np.mean(v)
            train.iloc[i, 2] = np.std(v)

    return train

# Example usage
data1 = pd.Series([pd.NA, pd.NA, pd.NA, 308, 393, 421, 479, pd.NA, 598, 605])  # Example data
result = mice(data1)
print(result)

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures

def ridge_regression_impute(data, degree=2, alpha=1.0):
    # Prepare the data
    years = np.arange(len(data)).reshape(-1, 1)  # Create an array of indices (0, 1, 2, ...)
    observations = data.values.astype(float)  # Convert to float for regression

    # Create a mask for missing values
    mask = ~np.isnan(observations)

    # Fit a polynomial regression model only on non-missing values
    poly = PolynomialFeatures(degree=degree)
    X_poly = poly.fit_transform(years[mask])
    model = Ridge(alpha=alpha)
    model.fit(X_poly, observations[mask])

    # Predict missing values
    X_full_poly = poly.transform(years)
    predictions = model.predict(X_full_poly)

    # Create a new Series to hold the results
    imputed_data = pd.Series(observations, index=data.index)
    
    # Fill in the missing values with predictions
    imputed_data[~mask] = predictions[~mask]

    return imputed_data

# real values [299, 310, 315, 313, 217, 226, 233, 258, 316, 337]
data1 = pd.Series([299, np.nan, 315, np.nan, 217, np.nan, np.nan, 258, 316, 337])  
imputed_values = ridge_regression_impute(data1)

print(imputed_values)

In [None]:
            # elif buurt_data.isnull().all() == 0:
            #     #print(buurt_data)
                
            #     # identife x and y, for linregression
            #     x = buurt_data.dropna().index
            #     y = buurt_data.dropna()


                  
            #     slope, intercept, r_value, p_value, std_err = linregress(x, y)
            #     r_sqrt = r_value**2 

            #     if r_sqrt < 0.4:
            #         print(f'{buurt_code}, met R-squared: {r_sqrt}')
            #         print(list(buurt_data))



In [6]:
import pandas as pd


def linear_interpolation(buurt_data):
    buurt_values = buurt_data.astype(float)  # Use float to handle NaN values

    while buurt_values.isnull().sum() > 0:
        # Extrapolation: when there is a missing value at the beginning
        if pd.isna(buurt_values.iloc[0]) and pd.notna(buurt_values.iloc[-1]):
            if len(buurt_values) > 2 and pd.isna(buurt_values.iloc[2]):
                buurt_values = buurt_values.interpolate(method='linear', limit_direction='forward')
            if len(buurt_values) > 1:
                slope = buurt_values.iloc[1] - buurt_values.iloc[0]
                buurt_values.iloc[0] = buurt_values.iloc[1] - slope

        # Extrapolation: when there is a missing value at the end
        elif pd.isna(buurt_values.iloc[-1]) and pd.notna(buurt_values.iloc[0]):
            if len(buurt_values) > 2 and pd.isna(buurt_values.iloc[-3]):
                buurt_values = buurt_values.interpolate(method='linear', limit_direction='backward')
            if len(buurt_values) > 1:
                slope = buurt_values.iloc[-2] - buurt_values.iloc[-3]
                buurt_values.iloc[-1] = buurt_values.iloc[-2] + slope

        # Extrapolation: when there are missing values at both ends
        elif pd.isna(buurt_values.iloc[0]) and pd.isna(buurt_values.iloc[-1]):
            if len(buurt_values) > 2 and (pd.isna(buurt_values.iloc[2]) or pd.isna(buurt_values.iloc[-3])):
                interpolated_subset = buurt_values.iloc[1:-1].interpolate(method='linear', limit_direction='forward')
                buurt_values.iloc[1:-1] = interpolated_subset
            if len(buurt_values) > 2:
                slope1 = (buurt_values.iloc[2] - buurt_values.iloc[1])
                buurt_values.iloc[0] = buurt_values.iloc[1] - slope1
                slope2 = (buurt_values.iloc[-2] - buurt_values.iloc[-3])
                buurt_values.iloc[-1] = buurt_values.iloc[-2] + slope2

        # Otherwise, just interpolate
        else:
            buurt_values = buurt_values.interpolate(method='linear', limit_direction='both')

    return buurt_values


data1 = pd.Series([pd.NA, 2200, 2270, 2270, 2310, 2510, 2860, 3010, 3050, pd.NA])
result = linear_interpolation(data1)
print(result)

TypeError: float() argument must be a string or a real number, not 'NAType'