In [None]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine

### Function: Compute Cosine Dissimilarity and RMSE

In [None]:
def calculate_metrics(df_imputated, df_original):
    """Compare an imputed DataFrame with its reference, column by column.

    For every column of ``df_imputated`` this computes the cosine
    dissimilarity (``scipy.spatial.distance.cosine``) and the root mean
    squared error against the same-named column of ``df_original``.
    Rows are paired by position (via ``to_numpy``), not by index label, so
    both frames must list their rows in the same order.

    Parameters
    ----------
    df_imputated : pandas.DataFrame
        Imputed data. Every column must have a numeric NumPy dtype.
    df_original : pandas.DataFrame
        Reference data. Every column must have a numeric NumPy dtype, it
        must contain every column of ``df_imputated``, and it must have the
        same number of rows.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df_imputated`` followed by a ``'Total'``
        row holding the unweighted mean of each metric across columns.
        The columns are ``'Cosine Dissimilarity'`` and ``'RMSE'``.

    Raises
    ------
    ValueError
        If any column of either frame is non-numeric, or if the two frames
        have a different number of rows.
    KeyError
        If a column of ``df_imputated`` is absent from ``df_original``.
    """
    # Ensure numeric columns only
    if not all(df_imputated.dtypes.apply(lambda x: np.issubdtype(x, np.number))) or \
       not all(df_original.dtypes.apply(lambda x: np.issubdtype(x, np.number))):
        raise ValueError("All columns must be numeric.")

    # Fail early with a readable message instead of an opaque lookup error
    # raised from inside the loop.
    missing = [col for col in df_imputated.columns if col not in df_original.columns]
    if missing:
        raise KeyError(f"Columns missing from df_original: {missing}")

    # Values are compared position by position, so the lengths must agree.
    if len(df_imputated) != len(df_original):
        raise ValueError(
            "Row count mismatch: df_imputated has "
            f"{len(df_imputated)} rows, df_original has {len(df_original)}."
        )

    column_metrics = {}
    cosine_values = []
    rmse_values = []

    for col in df_imputated.columns:
        imputed = df_imputated[col].to_numpy()
        reference = df_original[col].to_numpy()

        # Fixed-width integer arithmetic wraps silently on overflow (e.g.
        # int8 100 - (-100)), which would corrupt both metrics; do the maths
        # in float64 instead. Float columns are left untouched.
        if np.issubdtype(imputed.dtype, np.integer):
            imputed = imputed.astype(np.float64)
        if np.issubdtype(reference.dtype, np.integer):
            reference = reference.astype(np.float64)

        # Compute cosine dissimilarity
        cos_dissimilarity = cosine(imputed, reference)
        cosine_values.append(cos_dissimilarity)

        # Compute RMSE for the column
        rmse = np.sqrt(np.mean((imputed - reference) ** 2))
        rmse_values.append(rmse)

        # Store the metrics for the column
        column_metrics[col] = {
            'Cosine Dissimilarity': cos_dissimilarity,
            'RMSE': rmse
        }

    # Overall metrics are plain averages of the per-column values, so RMSE
    # is dominated by whichever column has the largest scale.
    # NOTE: a data column literally named 'Total' would be overwritten here.
    column_metrics['Total'] = {
        'Cosine Dissimilarity': np.mean(cosine_values),
        'RMSE': np.mean(rmse_values)
    }

    # Convert results to a DataFrame (metrics as columns, one row per entry)
    df_result = pd.DataFrame(column_metrics).T
    return df_result

# Example usage:
# df_imputated = pd.read_csv("imputed_data.csv")
# df_original = pd.read_csv("original_data.csv")
# calculate_metrics(df_imputated, df_original)

### 10% Random GAN

In [81]:
# Reference dataset for the comparison. The path is absolute, so this cell
# only runs on a machine with this exact directory layout.
df_original = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/data/4051 dataset.csv")
# GAN-completed data from the "10% Random" model directory.
df_imputated = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/models/10% Random/GAN/completed_test_data.csv")

# Per-column cosine dissimilarity and RMSE plus a 'Total' row. Rows are
# paired by position, so both files must list their rows in the same order.
result = calculate_metrics(df_imputated, df_original)
# Bare last expression so the notebook renders the metrics table.
result

Unnamed: 0,Cosine Dissimilarity,RMSE
longitude,0.001922,7.86969
Latitude,0.02082,9.373923
Temperature,0.000508,9.874765
Wind speed,0.810617,2.966301
Evaporatio,0.939354,10.212889
Radiant quantity,0.081122,5.917499
Precipitation,0.040795,637.553016
vegetation coverage,0.382745,4.957103
Total,0.284735,86.090648


### 10% Horizontal GAN

In [82]:
# Reference dataset for the comparison. The path is absolute, so this cell
# only runs on a machine with this exact directory layout.
df_original = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/data/4051 dataset.csv")
# GAN-completed data from the "10% Horizontal" model directory.
df_imputated = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/models/10% Horizontal/GAN/completed_test_data.csv")

# Per-column cosine dissimilarity and RMSE plus a 'Total' row. Rows are
# paired by position, so both files must list their rows in the same order.
result = calculate_metrics(df_imputated, df_original)
# Bare last expression so the notebook renders the metrics table.
result

Unnamed: 0,Cosine Dissimilarity,RMSE
longitude,0.00271,8.835493
Latitude,0.014607,6.893556
Temperature,0.002661,21.984842
Wind speed,0.751277,5.328382
Evaporatio,0.556163,4.811254
Radiant quantity,0.012584,1.883661
Precipitation,0.014699,386.156088
vegetation coverage,0.935823,5.703166
Total,0.286316,55.199555


### 20% Random GAN

In [83]:
# Reference dataset for the comparison. The path is absolute, so this cell
# only runs on a machine with this exact directory layout.
df_original = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/data/4051 dataset.csv")
# GAN-completed data from the "20% Random" model directory.
df_imputated = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/models/20% Random/GAN/completed_test_data.csv")

# Per-column cosine dissimilarity and RMSE plus a 'Total' row. Rows are
# paired by position, so both files must list their rows in the same order.
result = calculate_metrics(df_imputated, df_original)
# Bare last expression so the notebook renders the metrics table.
result

Unnamed: 0,Cosine Dissimilarity,RMSE
longitude,0.002013,8.113691
Latitude,0.02137,8.512784
Temperature,0.00247,23.598292
Wind speed,1.071153,8.288655
Evaporatio,0.397254,10.325968
Radiant quantity,0.575143,13.791786
Precipitation,0.064477,796.616079
vegetation coverage,1.277404,11.07304
Total,0.42641,110.040037


### 20% Horizontal GAN

In [84]:
# Reference dataset for the comparison. The path is absolute, so this cell
# only runs on a machine with this exact directory layout.
df_original = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/data/4051 dataset.csv")
# GAN-completed data from the "20% Horizontal" model directory.
df_imputated = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/models/20% Horizontal/GAN/completed_test_data.csv")

# Per-column cosine dissimilarity and RMSE plus a 'Total' row. Rows are
# paired by position, so both files must list their rows in the same order.
result = calculate_metrics(df_imputated, df_original)
# Bare last expression so the notebook renders the metrics table.
result

Unnamed: 0,Cosine Dissimilarity,RMSE
longitude,0.000922,5.47336
Latitude,0.001705,2.692777
Temperature,0.015509,53.465474
Wind speed,1.065411,8.267014
Evaporatio,0.053475,1.424297
Radiant quantity,0.078109,6.368617
Precipitation,0.032594,573.433138
vegetation coverage,1.167618,7.588017
Total,0.301918,82.339087


### 50% Random GAN

In [85]:
# Reference dataset for the comparison. The path is absolute, so this cell
# only runs on a machine with this exact directory layout.
df_original = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/data/4051 dataset.csv")
# GAN-completed data from the "50% Random" model directory.
df_imputated = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/models/50% Random/GAN/completed_test_data.csv")

# Per-column cosine dissimilarity and RMSE plus a 'Total' row. Rows are
# paired by position, so both files must list their rows in the same order.
result = calculate_metrics(df_imputated, df_original)
# Bare last expression so the notebook renders the metrics table.
result

Unnamed: 0,Cosine Dissimilarity,RMSE
longitude,0.000193,3.253319
Latitude,0.006087,7.074635
Temperature,0.014528,60.502628
Wind speed,1.301363,16.982947
Evaporatio,0.305459,15.355674
Radiant quantity,0.122924,14.002663
Precipitation,0.195333,1340.955954
vegetation coverage,0.245896,14.171875
Total,0.273973,184.037462


### 50% Horizontal GAN

In [86]:
# Reference dataset for the comparison. The path is absolute, so this cell
# only runs on a machine with this exact directory layout.
df_original = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/data/4051 dataset.csv")
# GAN-completed data from the "50% Horizontal" model directory.
df_imputated = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/models/50% Horizontal/GAN/completed_test_data.csv")

# Per-column cosine dissimilarity and RMSE plus a 'Total' row. Rows are
# paired by position, so both files must list their rows in the same order.
result = calculate_metrics(df_imputated, df_original)
# Bare last expression so the notebook renders the metrics table.
result

Unnamed: 0,Cosine Dissimilarity,RMSE
longitude,0.233086,74.964788
Latitude,1.109309,117.123489
Temperature,0.243063,191.972595
Wind speed,0.675645,73.327404
Evaporatio,0.719915,156.851822
Radiant quantity,0.614404,12.989256
Precipitation,0.285044,1576.607902
vegetation coverage,0.728535,145.201013
Total,0.576125,293.629784


### Iteration SVD 10%

In [87]:
# Reference dataset (the "unsupervised" variant of the 4051 dataset file).
# The path is absolute and machine-specific.
df_original = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/data/4051 dataset unsupervised.csv")
# SVD-completed data from the "drop 10" directory. The "4k" suffix in the
# file name is not explained in this notebook.
df_imputated = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/data/data/svd/drop 10/complete_10_4k.csv")

# Per-column cosine dissimilarity and RMSE plus a 'Total' row. Rows are
# paired by position, so both files must list their rows in the same order.
result = calculate_metrics(df_imputated, df_original)
# Bare last expression so the notebook renders the metrics table.
result

Unnamed: 0,Cosine Dissimilarity,RMSE
longitude,3.731375e-07,0.100667
Latitude,9.982901e-07,0.056559
Temperature,3.351975e-07,0.239367
Wind speed,0.03230832,0.217241
Evaporatio,0.005376859,0.371477
Radiant quantity,0.0035233,0.989433
Precipitation,0.02935095,542.423933
vegetation coverage,0.0006814278,0.076423
Total,0.00890532,68.059388


### Iteration SVD 20%

In [88]:
# Reference dataset (the "unsupervised" variant of the 4051 dataset file).
# The path is absolute and machine-specific.
df_original = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/data/4051 dataset unsupervised.csv")
# SVD-completed data from the "drop 20" directory. The "3k" suffix in the
# file name is not explained in this notebook.
df_imputated = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/data/data/svd/drop 20/complete_20_3k.csv")

# Per-column cosine dissimilarity and RMSE plus a 'Total' row. Rows are
# paired by position, so both files must list their rows in the same order.
result = calculate_metrics(df_imputated, df_original)
# Bare last expression so the notebook renders the metrics table.
result

Unnamed: 0,Cosine Dissimilarity,RMSE
longitude,7.08719e-07,0.138693
Latitude,2.206528e-06,0.084097
Temperature,5.508426e-07,0.306736
Wind speed,0.05960714,0.2926
Evaporatio,0.01235928,0.561837
Radiant quantity,0.006809896,1.37449
Precipitation,0.05643346,748.115808
vegetation coverage,0.001422267,0.11054
Total,0.01707944,93.8731


### Iteration SVD 50%

In [97]:
# Reference dataset (the "unsupervised" variant of the 4051 dataset file).
# The path is absolute and machine-specific.
df_original = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/data/4051 dataset unsupervised.csv")
# SVD-completed data from the "drop 50" directory. The "4k" suffix in the
# file name is not explained in this notebook.
df_imputated = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/data/data/svd/drop 50/complete_50_4k.csv")

# Per-column cosine dissimilarity and RMSE plus a 'Total' row. Rows are
# paired by position, so both files must list their rows in the same order.
result = calculate_metrics(df_imputated, df_original)
# Bare last expression so the notebook renders the metrics table.
result

Unnamed: 0,Cosine Dissimilarity,RMSE
longitude,0.00064,4.186323
Latitude,0.003061,3.163673
Temperature,0.0008,11.757387
Wind speed,0.953104,10.94247
Evaporatio,0.911165,23.0607
Radiant quantity,0.749769,55.250192
Precipitation,0.933881,27381.02182
vegetation coverage,0.501128,4.076328
Total,0.506694,3436.682362


### ALS 10%

In [99]:
# Reference dataset (the "unsupervised" variant of the 4051 dataset file).
# The path is absolute and machine-specific.
df_original = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/data/4051 dataset unsupervised.csv")
# ALS-completed data; the "10" in the file name matches this section's 10% heading.
df_imputated = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/data/data/als/als_10.csv")

# Per-column cosine dissimilarity and RMSE plus a 'Total' row. Rows are
# paired by position, so both files must list their rows in the same order.
result = calculate_metrics(df_imputated, df_original)
# Bare last expression so the notebook renders the metrics table.
result

Unnamed: 0,Cosine Dissimilarity,RMSE
longitude,3.279734e-07,0.094343
Latitude,9.199036e-07,0.054312
Temperature,3.844547e-07,0.256272
Wind speed,0.03442184,0.223881
Evaporatio,0.007160721,0.428212
Radiant quantity,0.004266737,1.088575
Precipitation,0.04056019,635.722481
vegetation coverage,0.000733847,0.07932
Total,0.01089312,79.743425


### ALS 20%

In [100]:
# Reference dataset (the "unsupervised" variant of the 4051 dataset file).
# The path is absolute and machine-specific.
df_original = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/data/4051 dataset unsupervised.csv")
# ALS-completed data; the "20" in the file name matches this section's 20% heading.
df_imputated = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/data/data/als/als_20.csv")

# Per-column cosine dissimilarity and RMSE plus a 'Total' row. Rows are
# paired by position, so both files must list their rows in the same order.
result = calculate_metrics(df_imputated, df_original)
# Bare last expression so the notebook renders the metrics table.
result

Unnamed: 0,Cosine Dissimilarity,RMSE
longitude,8.23488e-07,0.149517
Latitude,3.013996e-06,0.09828
Temperature,7.2881e-07,0.353099
Wind speed,0.1049184,0.384725
Evaporatio,0.01237976,0.56232
Radiant quantity,0.01271554,1.876579
Precipitation,0.1004963,992.644563
vegetation coverage,0.001943524,0.129164
Total,0.02905725,124.524781


### ALS 50%

In [101]:
# Reference dataset (the "unsupervised" variant of the 4051 dataset file).
# The path is absolute and machine-specific.
df_original = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/data/4051 dataset unsupervised.csv")
# ALS-completed data; the "50" in the file name matches this section's 50% heading.
df_imputated = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/data/data/als/als_50.csv")

# Per-column cosine dissimilarity and RMSE plus a 'Total' row. Rows are
# paired by position, so both files must list their rows in the same order.
result = calculate_metrics(df_imputated, df_original)
# Bare last expression so the notebook renders the metrics table.
result

Unnamed: 0,Cosine Dissimilarity,RMSE
longitude,2e-06,0.210502
Latitude,6e-06,0.137414
Temperature,2e-06,0.570637
Wind speed,0.241853,0.560983
Evaporatio,0.043977,1.051504
Radiant quantity,0.029046,2.82295
Precipitation,0.207749,1376.912629
vegetation coverage,0.003832,0.181227
Total,0.065808,172.805981


### SVD softImpute 10%

In [104]:
# Reference dataset (the "unsupervised" variant of the 4051 dataset file).
# The path is absolute and machine-specific.
df_original = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/data/4051 dataset unsupervised.csv")
# softImpute-completed data; the "10" in the file name matches this section's 10% heading.
df_imputated = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/data/softImpute/softImpute_10.csv")

# Per-column cosine dissimilarity and RMSE plus a 'Total' row. Rows are
# paired by position, so both files must list their rows in the same order.
result = calculate_metrics(df_imputated, df_original)
# Bare last expression so the notebook renders the metrics table.
result

Unnamed: 0,Cosine Dissimilarity,RMSE
longitude,2.068905e-06,0.236823
Latitude,1.406118e-06,0.067131
Temperature,3.359194e-07,0.239586
Wind speed,0.04778788,0.269498
Evaporatio,0.005107231,0.362583
Radiant quantity,0.003522472,0.989845
Precipitation,0.03524772,596.938507
vegetation coverage,0.0008725833,0.086529
Total,0.01156771,74.898813


### SVD softImpute 20%

In [105]:
# Reference dataset (the "unsupervised" variant of the 4051 dataset file).
# The path is absolute and machine-specific.
df_original = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/data/4051 dataset unsupervised.csv")
# softImpute-completed data; the "20" in the file name matches this section's 20% heading.
df_imputated = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/data/softImpute/softImpute_20.csv")

# Per-column cosine dissimilarity and RMSE plus a 'Total' row. Rows are
# paired by position, so both files must list their rows in the same order.
result = calculate_metrics(df_imputated, df_original)
# Bare last expression so the notebook renders the metrics table.
result

Unnamed: 0,Cosine Dissimilarity,RMSE
longitude,1.608543e-06,0.208856
Latitude,4.3186e-06,0.11764
Temperature,6.334802e-07,0.328965
Wind speed,0.07690975,0.336678
Evaporatio,0.0334084,0.948433
Radiant quantity,0.0215563,2.487717
Precipitation,0.0695797,843.946296
vegetation coverage,0.002511391,0.14701
Total,0.02549651,106.065199


### SVD softImpute 50%

In [106]:
# Reference dataset (the "unsupervised" variant of the 4051 dataset file).
# The path is absolute and machine-specific.
df_original = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/data/4051 dataset unsupervised.csv")
# softImpute-completed data; the "50" in the file name matches this section's 50% heading.
df_imputated = pd.read_csv("/Users/william/Desktop/Assignment/24 Fall/STAT 4051/Project/data/softImpute/softImpute_50.csv")

# Per-column cosine dissimilarity and RMSE plus a 'Total' row. Rows are
# paired by position, so both files must list their rows in the same order.
result = calculate_metrics(df_imputated, df_original)
# Bare last expression so the notebook renders the metrics table.
result

Unnamed: 0,Cosine Dissimilarity,RMSE
longitude,9e-06,0.490453
Latitude,1e-05,0.183138
Temperature,4e-06,0.837615
Wind speed,0.251593,0.600638
Evaporatio,0.108162,1.752289
Radiant quantity,0.040889,3.394931
Precipitation,0.241026,1581.611322
vegetation coverage,0.007983,0.263716
Total,0.08121,198.641763
