In [76]:
# The results may be similar or dissimilar based on various factors:
# - The custom power iteration method used to calculate eigenvectors might yield slightly different results
#   than the numerical methods employed by standard libraries. The number of iterations and the initial 
#   approximation can affect the accuracy.
# - Differences in the z-score calculation might lead to variations in the results.
# - Small variations in floating-point precision during calculations can lead to differences in the final 
#   output.

def parse_arff(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()

    # Extract attribute names
    attributes = [line.split()[1] for line in lines if line.startswith('@attribute')]

    # Extract data instances
    data_start = lines.index('@data\n') + 1
    data_list = [{attr: val if val != 'm' else None for attr, val in zip(attributes, line.strip().split(','))} for line in lines[data_start:]]
    
    return data_list

def lerp(data_list):
    for i in range(1, len(data_list) - 1):
        for key, value in data_list[i].items():
            if value is None and data_list[i - 1][key] is not None and data_list[i + 1][key] is not None:
                data_list[i][key] = (float(data_list[i - 1][key]) + float(data_list[i + 1][key])) / 2

    return data_list

def z_score(matrix):
    for i in range(2, len(matrix[0])):
        column = [float(row[i]) for row in matrix if row[i] is not None]

        if len(set(column)) == 1:
            continue

        mean_val = sum(column) / len(column)

        if len(column) > 1:
            std_dev = (sum((x - mean_val) ** 2 for x in column) / len(column)) ** 0.5
        else:
            std_dev = 0

        for row in matrix:
            if row[i] is not None:
                if std_dev != 0:
                    row[i] = (float(row[i]) - mean_val) / std_dev
                else:
                    row[i] = 0  

    return matrix

def dot_product(v1, v2):
    result = sum(x * y for x, y in zip(v1, v2) if isinstance(x, (int, float)) and isinstance(y, (int, float)))
    return result

def subtract(v1, v2):
    return [x - y for x, y in zip(v1, v2)]

def scale(vector, scalar):
    return [x * scalar for x in vector]

def multiply_matrix_vector(matrix, vector):
    return [dot_product(row, vector) for row in matrix]

def multiply_matrix(matrix1, matrix2):
    result = []
    for row in matrix1:
        new_row = []
        for col in transpose(matrix2):
            element = dot_product(row, col)
            new_row.append(element)
        result.append(new_row)
    return result

def transpose(matrix):
    return [[row[i] for row in matrix] for i in range(len(matrix[0]))]

def mean(column):
    values = [float(val) for val in column if val is not None]
    return sum(values) / len(values) if values else 0

def covariance_matrix(matrix):
    n = len(matrix)
    num_features = len(matrix[0])
    transposed_matrix = transpose(matrix)
    cov_matrix = [[0] * num_features for _ in range(num_features)]

    for i in range(num_features):
        for j in range(num_features):
            mean_i = mean(matrix[i])
            mean_j = mean(matrix[j])
            values_i = [float(val) for val in matrix[i] if val is not None]
            values_j = [float(val) for val in matrix[j] if val is not None]
            cov_matrix[i][j] = sum((val_i - mean_i) * (val_j - mean_j) for val_i, val_j in zip(values_i, values_j)) / (n - 1)

    return cov_matrix

def custom_random():
    seed = 1
    while True:
        seed = (seed * 1103515245 + 12345) & 0x7FFFFFFF
        yield seed / 0x7FFFFFFF


# Function to perform PCA
def pca(data_matrix, num_components):
    # Calculate the covariance matrix
    cov_matrix = covariance_matrix(data_matrix)

    # Calculate the eigenvalues and eigenvectors using power iteration
    num_features = len(data_matrix[0])
    eigenvalues = [0] * num_features
    eigenvectors = [[0] * num_features for _ in range(num_features)]

    random_generator = custom_random()

    for i in range(num_features):
        # Use a simple random number generator as the initial approximation
        vector = [next(random_generator) for _ in range(num_features)]

        for _ in range(1000):  # Adjust the number of iterations as needed
            new_vector = multiply_matrix_vector(cov_matrix, vector)
            magnitude = sum(x ** 2 for x in new_vector) ** 0.5
            vector = scale(new_vector, 1 / magnitude)

        eigenvalues[i] = dot_product(new_vector, vector)
        eigenvectors[i] = vector

    # Sort eigenvalues and corresponding eigenvectors in descending order
    sorted_indices = sorted(range(num_features), key=lambda k: eigenvalues[k], reverse=True)
    eigenvalues = [eigenvalues[i] for i in sorted_indices]
    eigenvectors = [[eigenvectors[j][i] for j in sorted_indices] for i in range(num_features)]

    # Select the top 'num_components' eigenvectors
    top_eigenvectors = eigenvectors[:num_components]

    # Project the data onto the new subspace defined by the top eigenvectors
    pca_result = multiply_matrix(data_matrix, transpose(top_eigenvectors))

    return pca_result

def display_matrix(matrix):
    for row in matrix:
        print("|".join(f"{cell * 100:^10}" for cell in row))

DATA_NAMES = [
    "2017",
    "2018",
    "2019",
    "2020",
    "2021 Q1",
]

data = sum([lerp(parse_arff(f'./V4 data/{DATA_NAME}.arff')) for DATA_NAME in DATA_NAMES], [])
attributes = list(data[0].keys())
matrix = pca(z_score([[entry[attr] for attr in attributes[2:]] for entry in data]), 2)
display_matrix(matrix)

0.00042434359187306633|0.005910242805800577
0.007918140301737474|0.11028358304433208
0.008814048012138092|0.1227617545107234
0.016238141362970887|0.22616426878617554
0.036145633903043095|0.5034351332927974
0.036145633903043095|0.5034351332927974
0.012101090906881|0.16854357375597173
0.002252509954044918|0.031372880387152
0.0061000193116031035|0.08496085705573984
-0.007579213610342429|-0.10556302386098386
0.017753745728384732|0.24727355373516707
0.00840928879292727|0.11712428216699634
0.036145633903043095|0.5034351332927974
0.036145633903043095|0.5034351332927974
0.012030204864644884|0.16755627542230678
0.01264059520588719|0.17605777089000935
0.013272200868134349|0.18485475261164164
-0.006435591696001708|-0.08963469756777111
-0.003975179686518455|-0.05536616457504582
-0.005170916129449089|-0.07202034021198656
0.10314222218469943|1.4365612874006866
0.0019364235834675616|0.02697044039867384
0.005601928193381652|0.0780234612649673
0.008829204919763287|0.12297285939357216
0.0133778369392878