In [18]:
def parse_data(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    data_start = lines.index('@data\n') + 1
    attributes = [line.split()[1] for line in lines if line.startswith('@attribute')]
    data_list = []

    for line in lines[data_start:]:
        values = line.strip().split(',')
        data_dict = {attr: val if val != 'm' else None for attr, val in zip(attributes, values)}
        data_list.append(data_dict)

    return data_list


def clean(data_list):
    for entry in data_list:
        for key, value in entry.items():
            if value == 'm':
                entry[key] = None

    return data_list


def linear_interp(data_list):
    for i in range(1, len(data_list) - 1):
        for key, value in data_list[i].items():
            if value is None and data_list[i - 1][key] is not None and data_list[i + 1][key] is not None:
                data_list[i][key] = (float(data_list[i - 1][key]) + float(data_list[i + 1][key])) / 2

    return data_list


def z_score_standardize(matrix):
    for i in range(2, len(matrix[0])):
        column = [float(row[i]) for row in matrix if row[i] is not None and row[i] != 'm']

        if len(set(column)) == 1:
            continue

        mean_val = sum(column) / len(column)

        if len(column) > 1:
            std_dev = (sum((x - mean_val) ** 2 for x in column) / len(column)) ** 0.5
        else:
            std_dev = 0

        for row in matrix:
            if row[i] is not None and row[i] != 'm':
                if std_dev != 0:
                    row[i] = (float(row[i]) - mean_val) / std_dev
                else:
                    row[i] = 0

    return matrix


def rng(initial_seed=1, multiplier=1664525, increment=1013904223, modulus=2**32):
    seed = initial_seed
    while True:
        seed = (seed * multiplier + increment) % modulus
        yield seed / modulus


def perform_pca(data_matrix, num_components):
    cov_matrix = calculate_covariance_matrix(data_matrix)

    num_features = len(data_matrix[0])
    eigenvalues = [0] * num_features
    eigenvectors = [[0] * num_features for _ in range(num_features)]

    random_generator = rng()

    for i in range(num_features):
        vector = [next(random_generator) for _ in range(num_features)]

        for _ in range(1000):
            new_vector = matrix_vector_multiply(cov_matrix, vector)
            magnitude = sum(x ** 2 for x in new_vector) ** 0.5
            vector = scale(new_vector, 1 / magnitude)

        eigenvalues[i] = dot_product(new_vector, vector)
        eigenvectors[i] = vector

    sorted_indices = sorted(range(num_features), key=lambda k: eigenvalues[k], reverse=True)
    eigenvalues = [eigenvalues[i] for i in sorted_indices]
    eigenvectors = [[eigenvectors[j][i] for j in sorted_indices] for i in range(num_features)]
    top_eigenvectors = eigenvectors[:num_components]
    pca_result = matrix_multiply(data_matrix, transpose(top_eigenvectors))

    return pca_result


def print_table(data_list):
    attributes = list(data_list[0].keys())
    column_widths = {attr: max(len(attr), max(len(str(entry[attr])) for entry in data_list)) for attr in attributes}

    header = "|".join(f"{attr:^{column_widths[attr]}}" for attr in attributes)
    print(header)
    print("-" * sum(column_widths.values()))

    for entry in data_list:
        row = "|".join(f"{str(entry[attr]):^{column_widths[attr]}}" if entry[attr] is not None else 'm' for attr in attributes)
        print(row)


def print_matrix(matrix):
    for row in matrix:
        formatted_row = "|".join(f"{float(cell):^14.4f}" if cell is not None and cell != 'm' else 'm' for cell in row)
        print(formatted_row)


file_paths = [
    r'.\V4 data\2017.arff',
    r'.\V4 data\2018.arff',
    r'.\V4 data\2019.arff',
    r'.\V4 data\2020.arff',
    r'.\V4 data\2021 Q1.arff'
]

data_list = [clean(linear_interp(parse_data(file_path))) for file_path in file_paths]

data_2017_preprocessed, data_2018_preprocessed, data_2019_preprocessed, data_2020_preprocessed, data_2021_preprocessed = data_list



combined_data = data_2017_processed + data_2018_processed + data_2019_processed + data_2020_processed + data_2021_processed
print(type(combined_data))
attributes_combined = list(combined_data[0].keys())

matrix_combined = []

for entry in combined_data:
    row_combined = [entry[attr_combined] for attr_combined in attributes_combined[2:]]
    matrix_combined.append(row_combined)

standardized_data_combined = z_score_standardize(matrix_combined)
print_matrix(standardized_data_combined)
num_components_pca = 2
pca_result_combined = perform_pca(standardized_data_combined, num_components_pca)

print("\n\nPCA Implementation:")
print_matrix(pca_result_combined)


<class 'list'>
    0.1400    |    0.5300    |    0.0033    |   -0.0625    |   -0.0209    |    0.0227    |   -0.0694    |   -0.0215    |   -0.0215    |    0.0227    |    0.0221    |    0.0212    |    0.0709    |    0.0217    |   -0.0673    |    0.0224    |    0.0234    |    0.0227    |    0.0088    |   -0.0215    |    0.0085    |   -0.0280    |    0.0105    |    1.3848    |   -0.0309    |   -0.0339    |   -0.0248    |   -0.0220    |   -0.5009    |   -0.0215    |   -0.0799    |   -0.0215    |   -0.8151    |   -0.0132    |    0.0275    |    0.0510    |   -0.0453    |   -0.0212    |    0.0242    |   -0.0508    |   -0.0227    |    0.0119    |    0.0110    |    2.1530    |   -0.0278    |   -0.0781    |    0.1673    |    0.3745    |   -0.0613    |   -0.0588    |    0.0242    |m|m|   -0.0209    |    0.0006    |    0.0208    |   -0.0099    |    0.0207    |m|   -0.0403    |   -0.0412    |m|m|m|m|   -0.0290    |   -0.0448    |   -0.0538    |   -0.0354    |   -0.0215    |   -0.1269    |   -0.0249 

In [19]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

def load_arff(file_path):
    with open(file_path, 'r') as f:
        lines = f.readlines()

    data_start = lines.index('@data\n') + 1


    attributes = [line.split()[1] for line in lines if line.startswith('@attribute')]

    data_list = []

    for line in lines[data_start:]:
        values = line.strip().split(',')
        data_dict = {attr: val if val != 'm' else None for attr, val in zip(attributes, values)}
        data_list.append(data_dict)

    return data_list


def preprocess_data(data_list):
    for entry in data_list:
        for key, value in entry.items():
            if value == 'm':
                entry[key] = None

    return data_list


def linear_interpolation(data_list):
    for i in range(1, len(data_list) - 1):
        for key, value in data_list[i].items():
            if value is None and data_list[i - 1][key] is not None and data_list[i + 1][key] is not None:
                data_list[i][key] = (float(data_list[i - 1][key]) + float(data_list[i + 1][key])) / 2

    return data_list


def z_score_standardization(matrix):
    for i in range(2, len(matrix[0])):
        column = [float(row[i]) for row in matrix if row[i] is not None and row[i] != 'm']

        if len(set(column)) == 1:
            continue

        mean_val = sum(column) / len(column)

        if len(column) > 1:
            std_dev = (sum((x - mean_val) ** 2 for x in column) / len(column)) ** 0.5
        else:
            std_dev = 0

        for row in matrix:
            if row[i] is not None and row[i] != 'm':
                if std_dev != 0:
                    row[i] = (float(row[i]) - mean_val) / std_dev
                else:
                    row[i] = 0 

    return matrix


def custom_random(initial_seed=1, multiplier=1664525, increment=1013904223, modulus=2**32):
    seed = initial_seed
    while True:
        seed = (seed * multiplier + increment) % modulus
        yield seed / modulus


def svd_sklearn(data_matrix, num_components):
    imputer = SimpleImputer(strategy='mean')
    data_matrix_imputed = imputer.fit_transform(data_matrix)

    scaler = StandardScaler()
    data_matrix_standardized = scaler.fit_transform(data_matrix_imputed)

    svd = TruncatedSVD(n_components=num_components)
    svd_result = svd.fit_transform(data_matrix_standardized)

    return svd_result


def pca_sklearn_with_imputation(data_matrix, num_components):
    imputer = SimpleImputer(strategy='mean')
    data_matrix_imputed = imputer.fit_transform(data_matrix)

    scaler = StandardScaler()
    data_matrix_standardized = scaler.fit_transform(data_matrix_imputed)

    pca = PCA(n_components=num_components)
    pca_result = pca.fit_transform(data_matrix_standardized)

    return pca_result


def display_matrix(matrix):
    for row in matrix:
        print("|".join(f"{str(cell):^10}" for cell in row))


file_path_2017 = r'.\V4 data\2017.arff'
file_path_2018 = r'.\V4 data\2018.arff'
file_path_2019 = r'.\V4 data\2019.arff'
file_path_2020 = r'.\V4 data\2020.arff'
file_path_2021 = r'.\V4 data\2021 Q1.arff'

data_2017 = load_arff(file_path_2017)
data_2018 = load_arff(file_path_2018)
data_2019 = load_arff(file_path_2019)
data_2020 = load_arff(file_path_2020)
data_2021 = load_arff(file_path_2021)

data_2017_preprocessed = preprocess_data(data_2017)
data_2018_preprocessed = preprocess_data(data_2018)
data_2019_preprocessed = preprocess_data(data_2019)
data_2020_preprocessed = preprocess_data(data_2020)
data_2021_preprocessed = preprocess_data(data_2021)

data_2017_preprocessed = linear_interpolation(data_2017_preprocessed)
data_2018_preprocessed = linear_interpolation(data_2018_preprocessed)
data_2019_preprocessed = linear_interpolation(data_2019_preprocessed)
data_2020_preprocessed = linear_interpolation(data_2020_preprocessed)
data_2021_preprocessed = linear_interpolation(data_2021_preprocessed)


data_combined = data_2017_preprocessed + data_2018_preprocessed + data_2019_preprocessed + data_2020_preprocessed + data_2021_preprocessed
display_data_table(data_combined)


attributes = list(data_combined[0].keys())


matrix = []
for entry in data_combined:
    row = [entry[attr] for attr in attributes[2:]] 
    matrix.append(row)


standardized_data = z_score_standardization(matrix)

num_components = 2
pca_result_sklearn = pca_sklearn_with_imputation(standardized_data, num_components)


print("\nScikit-learn PCA Result:")
display_matrix(pca_result_sklearn)

svd_result_sklearn = svd_sklearn(standardized_data, num_components)

print("\nScikit-learn SVD Result:")
display_matrix(svd_result_sklearn)

Num|    Country     |         X1          |        X2         |         X3          |        X4        |         X5          |         X6          |         X7          |        X8         |         X9          |         X10          |         X11          |         X12          |         X13         |         X14         |       X15        |         X16          |         X17          |         X18          |         X19          |        X20        |         X21          |        X22        |         X23         |       X24        |         X25         |         X26         |        X27        |        X28        |        X29         |        X30        |        X31        |         X32         |       X33        |        X34         |         X35         |         X36          |        X37        |         X38         |         X39          |        X40        |        X41        |        X42         |        X43        |       X44        |         X45         |         X46         

In comparing my implementation to the scikit-learn code, it's pretty evident that there are some noticeable differences in the results. One major factor is how we deal with missing values. I went with linear interpolation in my code, while scikit-learn prefers a mean imputation strategy (using SimpleImputer(strategy='mean')) during z-score standardization. This distinction in handling missing data is likely causing variations in the standardized datasets, leading to different PCA results.

Another point of contrast lies in how we generate random numbers for eigenvectors. I opted for a custom random number generator (rng), whereas scikit-learn might be doing something different or using a different seed for its internal random number generation. This dissimilarity in generating random vectors for PCA is likely contributing to the differences in the calculated eigenvectors and, consequently, the principal components.

Additionally, our approaches to eigenvalue and eigenvector calculations differ. I took a somewhat unconventional route involving random vectors, while scikit-learn sticks to a more standardized approach. This difference in the underlying calculation process is probably a factor in the observed discrepancies in the final PCA results.

It's worth noting that due to these implementation disparities, directly comparing the singular value decomposition (SVD) wasn't straightforward. My code doesn't have a dedicated SVD implementation, and scikit-learn uses Truncated SVD (TruncatedSVD) with a mean imputation strategy. The lack of a consistent SVD approach further contributes to the divergent results.