# Importing Files

In [1]:
from google.colab import files
import pandas as pd

# Upload the files
print("\nUpload United Nations.xlsx:")
uploaded_UnitedNations = files.upload()


Upload United Nations.xlsx:


Saving UNdata_world_population.xlsx to UNdata_world_population.xlsx


In [2]:
# Define the file path for the UN dataset
UN_file_path = 'UNdata_world_population.xlsx'

# Import the UN dataset
UN_df = pd.read_excel(UN_file_path)

# Additional checks
print("\nColumn names in UN dataset:\n")
print(UN_df.columns)


Column names in UN dataset:

Index(['Country or Area', 'Year', 'Area', 'Sex', 'Record Type', 'Reliability',
       'Source Year', 'Value', 'Value Footnotes'],
      dtype='object')


# Schema Matching


### Schema Matching using Edit Distance for the United Nations Dataset

In [3]:
import pandas as pd
import numpy as np
from nltk.metrics import edit_distance

# Define the file path for the United Nations dataset
un_file_path = 'UNdata_world_population.xlsx'

# Import the United Nations dataset
un_df = pd.read_excel(un_file_path)

# Define the United Nations dataset columns and the mediated schema columns
un_columns = ['Country or Area', 'Year', 'Area', 'Sex', 'Record Type', 'Reliability', 'Source Year', 'Value', 'Value Footnotes']
mediated_schema_columns = ["CountryName", "Year", "Population", "Gender", "Continent"]

# Function to compute the Edit Distance and reverse normalize
def compute_reversed_normalized_edit_distance(col1, col2):
    distance = edit_distance(col1, col2)  # Compute Edit Distance
    max_len = max(len(col1), len(col2))
    reversed_normalized_distance = 1 - (distance / max_len)  # Reverse normalization
    return reversed_normalized_distance

# Initialize a dictionary to store reversed normalized distances
all_reversed_normalized_distances = {}

# Compute reversed normalized Edit Distance for each pair of columns
for mediated_col in mediated_schema_columns:
    all_reversed_normalized_distances[mediated_col] = {}
    for un_col in un_columns:
        reversed_normalized_distance = compute_reversed_normalized_edit_distance(mediated_col, un_col)
        all_reversed_normalized_distances[mediated_col][un_col] = reversed_normalized_distance

# Convert all_reversed_normalized_distances dictionary to a DataFrame
reversed_normalized_distances_df = pd.DataFrame(all_reversed_normalized_distances)

# Find the best matches
best_matches = {}
for mediated_col in mediated_schema_columns:
    best_match = max(all_reversed_normalized_distances[mediated_col], key=all_reversed_normalized_distances[mediated_col].get)
    best_matches[mediated_col] = best_match

# Display the best matches with reversed normalized distances
print("\nBest Matches with Reversed Normalized Edit Distances:")
for mediated_col, best_match in best_matches.items():
    print(f"{mediated_col} -> {best_match}: {all_reversed_normalized_distances[mediated_col][best_match]}")

# Display the DataFrame (optional)
# print(reversed_normalized_distances_df)

# Export all reversed normalized distances to Excel
output_file_reversed_normalized_distances = 'Reversed_Normalized_Edit_Distance_All_UN.xlsx'
reversed_normalized_distances_df.to_excel(output_file_reversed_normalized_distances, index=True)
print(f"\nReversed normalized distances exported to {output_file_reversed_normalized_distances}")



Best Matches with Reversed Normalized Edit Distances:
CountryName -> Country or Area: 0.5333333333333333
Year -> Year: 1.0
Population -> Value Footnotes: 0.19999999999999996
Gender -> Year: 0.33333333333333337
Continent -> Country or Area: 0.2666666666666667

Reversed normalized distances exported to Reversed_Normalized_Edit_Distance_All_UN.xlsx


### Schema Matching using Jaccard similarity for the United Naions Dataset

In [4]:
import pandas as pd
import numpy as np

# Define the file path for the United Nations dataset
un_file_path = 'UNdata_world_population.xlsx'

# Import the United Nations dataset
un_df = pd.read_excel(un_file_path)

# Define the United Nations dataset columns and the mediated schema columns
un_columns = ['Country or Area', 'Year', 'Area', 'Sex', 'Record Type', 'Reliability', 'Source Year', 'Value', 'Value Footnotes']
mediated_schema_columns = ["CountryName", "Year", "Population", "Gender", "Continent"]

# Function to compute the Jaccard Similarity
def compute_jaccard_similarity(col1, col2):
    set1 = set(col1)
    set2 = set(col2)
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union

# Initialize variables to store best matches and similarities
best_matches = {}
similarities = {}

# Compute Jaccard Similarity for each pair of columns and find best matches
for mediated_col in mediated_schema_columns:
    max_similarity = -1
    best_match = None
    for un_col in un_columns:
        similarity = compute_jaccard_similarity(mediated_col, un_col)
        if similarity > max_similarity:
            max_similarity = similarity
            best_match = un_col
    best_matches[mediated_col] = best_match
    similarities[mediated_col] = max_similarity

# Display the best matches with similarities
print("\nBest Matches with Jaccard Similarities:")
for mediated_col, best_match in best_matches.items():
    print(f"{mediated_col} -> {best_match}: {similarities[mediated_col]}")

# Compute Jaccard Similarity for each pair of columns
all_similarities = {}
for mediated_col in mediated_schema_columns:
    all_similarities[mediated_col] = {}
    for un_col in un_columns:
        similarity = compute_jaccard_similarity(mediated_col, un_col)
        all_similarities[mediated_col][un_col] = similarity

# Create a DataFrame from all_similarities dictionary
all_similarities_df = pd.DataFrame(all_similarities)

# Export all similarities to Excel
output_file_all_similarities = 'Jaccard_similarity_UNdata_all_similarities.xlsx'
all_similarities_df.to_excel(output_file_all_similarities, index=True)
print(f"\nCombined all similarities exported to {output_file_all_similarities}")

# Export best matches with similarities to Excel
output_best_matches_df = pd.DataFrame({
    'Mediated Column': list(similarities.keys()),
    'UN Column': [best_matches[col] for col in best_matches],
    'Jaccard Similarity': list(similarities.values())
})


Best Matches with Jaccard Similarities:
CountryName -> Country or Area: 0.6923076923076923
Year -> Year: 1.0
Population -> Value Footnotes: 0.42857142857142855
Gender -> Year: 0.2857142857142857
Continent -> Country or Area: 0.4166666666666667

Combined all similarities exported to Jaccard_similarity_UNdata_all_similarities.xlsx


### Schema Matching using Semantic similarity for the United Nations Dataset

In [5]:
import gdown

url = 'https://drive.google.com/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM'
output = 'GoogleNews-vectors-negative300.bin.gz'

gdown.download(url, output, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM
From (redirected): https://drive.google.com/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM&confirm=t&uuid=6a25e363-f42a-40ba-a86d-7f014bc627d4
To: /content/GoogleNews-vectors-negative300.bin.gz
100%|██████████| 1.65G/1.65G [00:20<00:00, 80.2MB/s]


'GoogleNews-vectors-negative300.bin.gz'

In [6]:
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

# Load the Word2Vec model
model_path = 'GoogleNews-vectors-negative300.bin.gz'
model = KeyedVectors.load_word2vec_format(model_path, binary=True, limit=500000)  # Adjust limit if necessary

# Define the file path for the United Nations dataset
un_file_path = 'UNdata_world_population.xlsx'

# Import the United Nations dataset
un_df = pd.read_excel(un_file_path)

# Define the United Nations dataset columns and the mediated schema columns
un_columns = ['Country or Area', 'Year', 'Area', 'Sex', 'Record Type', 'Reliability', 'Source Year', 'Value', 'Value Footnotes']
mediated_schema_columns = ["CountryName", "Year", "Population", "Gender", "Continent"]

# Function to compute the semantic similarity
def compute_semantic_similarity(col1, col2):
    col1_words = [word for word in col1.split() if word in model]
    col2_words = [word for word in col2.split() if word in model]

    if not col1_words or not col2_words:
        return 0  # Return 0 similarity if any column name has no valid words in the model

    vec1 = np.mean([model[word] for word in col1_words], axis=0)
    vec2 = np.mean([model[word] for word in col2_words], axis=0)

    similarity = cosine_similarity([vec1], [vec2])[0][0]
    return similarity

# Initialize variables to store best matches and similarities
best_matches = {}
similarities = {}

# Compute Semantic Similarity for each pair of columns and find best matches
for mediated_col in mediated_schema_columns:
    max_similarity = -1
    best_match = None
    for un_col in un_columns:
        similarity = compute_semantic_similarity(mediated_col, un_col)
        if similarity > max_similarity:
            max_similarity = similarity
            best_match = un_col
    best_matches[mediated_col] = best_match
    similarities[mediated_col] = max_similarity

# Normalize similarities to a range between 0 and 1
max_similarity = max(similarities.values())
for col, sim in similarities.items():
    normalized_similarity = sim / max_similarity
    similarities[col] = normalized_similarity

# Display the best matches with normalized similarities
print("\nBest Matches with Normalized Similarities:")
for mediated_col, best_match in best_matches.items():
    print(f"{mediated_col} -> {best_match}: {similarities[mediated_col]}")

# Compute Semantic Similarity for each pair of columns
all_similarities = {}
for mediated_col in mediated_schema_columns:
    all_similarities[mediated_col] = {}
    for un_col in un_columns:
        similarity = compute_semantic_similarity(mediated_col, un_col)
        all_similarities[mediated_col][un_col] = similarity

# Create a DataFrame from all_similarities dictionary
all_similarities_df = pd.DataFrame(all_similarities)

# Normalize all similarities to a range between 0 and 1
max_all_similarity = all_similarities_df.values.max()
all_similarities_df_normalized = all_similarities_df / max_all_similarity

# Export to Excel
output_file_all_similarities = 'Semantic_Similarity_UNdata_all_similarities_normalized.xlsx'
all_similarities_df_normalized.to_excel(output_file_all_similarities, index=True)
print(f"\nCombined normalized all similarities exported to {output_file_all_similarities}")

# Export normalized similarities to Excel
output_normalized_df = pd.DataFrame({
    'Mediated Column': list(similarities.keys()),
    'UN Column': [best_matches[col] for col in best_matches],
    'Normalized Similarity': list(similarities.values())
})



Best Matches with Normalized Similarities:
CountryName -> Country or Area: 0.0
Year -> Year: 1.0
Population -> Sex: 0.25272244215011597
Gender -> Sex: 0.3877527713775635
Continent -> Country or Area: 0.23518258333206177

Combined normalized all similarities exported to Semantic_Similarity_UNdata_all_similarities_normalized.xlsx


# Resultant Matrix 3 - Max Combiner

In [7]:
import pandas as pd

# Load the existing similarity results
# Edit Distance
reversed_normalized_distances_df = pd.read_excel('Reversed_Normalized_Edit_Distance_All_UN.xlsx', index_col=0)
# Jaccard Similarity
all_similarities_df = pd.read_excel('Jaccard_similarity_UNdata_all_similarities.xlsx', index_col=0)
# Semantic Similarity (normalized)
all_similarities_df_normalized = pd.read_excel('Semantic_Similarity_UNdata_all_similarities_normalized.xlsx', index_col=0)

# Initialize a DataFrame to store the combined maximum similarities
combined_similarity_df = pd.DataFrame(index=reversed_normalized_distances_df.index, columns=reversed_normalized_distances_df.columns)

# Iterate through each pair of columns and compute the maximum similarity
for col in combined_similarity_df.index:
    for col2 in combined_similarity_df.columns:
        max_similarity = max(reversed_normalized_distances_df.loc[col, col2],
                             all_similarities_df.loc[col, col2],
                             all_similarities_df_normalized.loc[col, col2])
        combined_similarity_df.loc[col, col2] = max_similarity

# Export to Excel
output_combined_similarity_file = 'ResultantMatrix3_MaxCombined_Similarity.xlsx'
combined_similarity_df.to_excel(output_combined_similarity_file, index=True)
print(f"\nCombined maximum similarity matrix exported to {output_combined_similarity_file}")

# Display the filtered DataFrame
print("\nCombined maximum similarity matrix:")
print(combined_similarity_df)



Combined maximum similarity matrix exported to ResultantMatrix3_MaxCombined_Similarity.xlsx

Combined maximum similarity matrix:
                CountryName      Year Population    Gender Continent
Country or Area    0.692308  0.283263   0.333333  0.230769  0.416667
Year                   0.25       1.0   0.129501  0.333333  0.162894
Area                   0.25       0.6   0.162981  0.285714  0.129565
Sex                0.076923      0.25   0.252722  0.387753     0.125
Record Type        0.235294  0.234283    0.16146      0.25  0.156111
Reliability        0.266667       0.2   0.307692  0.197197  0.272727
Source Year        0.333333  0.728169        0.2  0.272727  0.181818
Value              0.230769  0.285714   0.272727  0.218497   0.15037
Value Footnotes       0.375  0.153846   0.428571       0.2  0.307692


Threshold of 0.3 applied with One to One Cardinality

In [8]:
import pandas as pd
import numpy as np

# 'combined_similarity_df' is already available from previous computations

# Set the threshold
threshold = 0.3

# Filter the combined normalized DataFrame to include only values that meet or exceed the threshold
filtered_df = combined_similarity_df.applymap(lambda x: x if x >= threshold else np.nan)

# Export the filtered DataFrame to Excel
output_file_filtered = 'ResultantMatrix3_Threshold_Filtered.xlsx'
filtered_df.to_excel(output_file_filtered, index=True)
print(f"\nFiltered matching pairs that satisfy the threshold exported to {output_file_filtered}")

# Display the filtered DataFrame
print("\nFiltered Matching Pairs and Their Values (above threshold):")
print(filtered_df)


Filtered matching pairs that satisfy the threshold exported to ResultantMatrix3_Threshold_Filtered.xlsx

Filtered Matching Pairs and Their Values (above threshold):
                 CountryName      Year  Population    Gender  Continent
Country or Area     0.692308       NaN    0.333333       NaN   0.416667
Year                     NaN  1.000000         NaN  0.333333        NaN
Area                     NaN  0.600000         NaN       NaN        NaN
Sex                      NaN       NaN         NaN  0.387753        NaN
Record Type              NaN       NaN         NaN       NaN        NaN
Reliability              NaN       NaN    0.307692       NaN        NaN
Source Year         0.333333  0.728169         NaN       NaN        NaN
Value                    NaN       NaN         NaN       NaN        NaN
Value Footnotes     0.375000       NaN    0.428571       NaN   0.307692


In [9]:

# Ensure one-to-one cardinality
used_un_columns = set()
final_matches = {}

for mediated_col in mediated_schema_columns:
    best_match = None
    best_value = 0
    for un_col in un_columns:
        value = filtered_df.loc[un_col, mediated_col]
        if pd.notna(value) and value > best_value and un_col not in used_un_columns:
            best_match = un_col
            best_value = value
    if best_match:
        final_matches[mediated_col] = (best_match, best_value)
        used_un_columns.add(best_match)

# Create a DataFrame to store the final matches
final_df = pd.DataFrame(columns=mediated_schema_columns, index=un_columns)

for mediated_col, (un_col, value) in final_matches.items():
    final_df.loc[un_col, mediated_col] = value

# Export the final DataFrame to Excel
output_file_final = 'ResultantMatrix3_OneToOne_Final.xlsx'
final_df.to_excel(output_file_final, index=True)
print(f"\nFinal one-to-one matching pairs exported to {output_file_final}")

# Display the final DataFrame
print("\nFinal One-to-One Matching Pairs:")
print(final_df)



Final one-to-one matching pairs exported to ResultantMatrix3_OneToOne_Final.xlsx

Final One-to-One Matching Pairs:
                CountryName Year Population    Gender Continent
Country or Area    0.692308  NaN        NaN       NaN       NaN
Year                    NaN  1.0        NaN       NaN       NaN
Area                    NaN  NaN        NaN       NaN       NaN
Sex                     NaN  NaN        NaN  0.387753       NaN
Record Type             NaN  NaN        NaN       NaN       NaN
Reliability             NaN  NaN        NaN       NaN       NaN
Source Year             NaN  NaN        NaN       NaN       NaN
Value                   NaN  NaN        NaN       NaN       NaN
Value Footnotes         NaN  NaN   0.428571       NaN       NaN


## Performance Measurement - Resultant Matrix 3

In [10]:
# Import the Ground Truth File
from google.colab import files
import pandas as pd

# Upload the files
print("\nUpload United Nations Ground Truth.xlsx:")
uploaded_kaggle = files.upload()


Upload United Nations Ground Truth.xlsx:


Saving Ground Truth.xlsx to Ground Truth.xlsx


In [11]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

# Load Ground Truth data
ground_truth_file = 'Ground Truth.xlsx'
ground_truth_df = pd.read_excel(ground_truth_file, index_col=0)

# Load Final Matches data
final_file = 'ResultantMatrix3_OneToOne_Final.xlsx'
final_df = pd.read_excel(final_file, index_col=0)

# Ensure final_df and ground_truth_df have the same columns and indices
ground_truth_df = ground_truth_df[final_df.columns]

# Handle NaN values in final_df (replace NaN with 0 for simplicity, adjust as needed)
final_df = final_df.fillna(0)
ground_truth_df = ground_truth_df.fillna(0)

# Define a threshold for similarity scores
threshold = 0.5  # Adjust as needed based on your similarity scores

# Convert similarity scores to binary labels based on the threshold
y_pred = np.where(final_df.values >= threshold, 1, 0)
y_true = np.where(ground_truth_df.values > 0, 1, 0)

# Create the predicted match DataFrame
predicted_match_df = pd.DataFrame(y_pred, index=final_df.index, columns=final_df.columns)

# Print the predicted match DataFrame
print(predicted_match_df)
predicted_match_df.to_excel('ResultantMatrix3_Predicted_Match.xlsx')

# Calculate TP, FP, FN based on binary predictions and ground truth
TP = np.sum((y_pred == 1) & (y_true == 1))
FP = np.sum((y_pred == 1) & (y_true == 0))
FN = np.sum((y_pred == 0) & (y_true == 1))

# Calculate precision, recall, and F1 score
precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1 = 2 * (precision * recall) / (precision + recall)

print(f"True Positives: {TP}")
print(f"False Positives: {FP}")
print(f"False Negatives: {FN}")

# Print results
print(f"\nPrecision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")



                 CountryName  Year  Population  Gender  Continent
Country or Area            1     0           0       0          0
Year                       0     1           0       0          0
Area                       0     0           0       0          0
Sex                        0     0           0       0          0
Record Type                0     0           0       0          0
Reliability                0     0           0       0          0
Source Year                0     0           0       0          0
Value                      0     0           0       0          0
Value Footnotes            0     0           0       0          0
True Positives: 2
False Positives: 0
False Negatives: 2

Precision: 100.00%
Recall: 50.00%
F1 Score: 66.67%


In [12]:
print(predicted_match_df)
print(ground_truth_df)

                 CountryName  Year  Population  Gender  Continent
Country or Area            1     0           0       0          0
Year                       0     1           0       0          0
Area                       0     0           0       0          0
Sex                        0     0           0       0          0
Record Type                0     0           0       0          0
Reliability                0     0           0       0          0
Source Year                0     0           0       0          0
Value                      0     0           0       0          0
Value Footnotes            0     0           0       0          0
                 CountryName  Year  Population  Gender  Continent
Country or Area            1     0           0       0          0
Year                       0     1           0       0          0
Area                       0     0           0       0          0
Sex                        0     0           0       1          0
Record Typ

# Resultant Matrix 4 - Avg Combiner

In [13]:
import pandas as pd

# Load the existing similarity results
# Edit Distance
reversed_normalized_distances_df = pd.read_excel('Reversed_Normalized_Edit_Distance_All_UN.xlsx', index_col=0)
# Jaccard Similarity
all_similarities_df = pd.read_excel('Jaccard_similarity_UNdata_all_similarities.xlsx', index_col=0)
# Semantic Similarity (normalized)
all_similarities_df_normalized = pd.read_excel('Semantic_Similarity_UNdata_all_similarities_normalized.xlsx', index_col=0)

# Initialize a DataFrame to store the combined average similarities
combined_similarity_df = pd.DataFrame(index=reversed_normalized_distances_df.index, columns=reversed_normalized_distances_df.columns)

# Iterate through each pair of columns and compute the average similarity
for col in combined_similarity_df.index:
    for col2 in combined_similarity_df.columns:
        avg_similarity = (
            reversed_normalized_distances_df.loc[col, col2] +
            all_similarities_df.loc[col, col2] +
            all_similarities_df_normalized.loc[col, col2]
        ) / 3
        combined_similarity_df.loc[col, col2] = avg_similarity

# Export to Excel
output_combined_similarity_file = 'ResultantMatrix4_AvgCombined_Similarity.xlsx'
combined_similarity_df.to_excel(output_combined_similarity_file, index=True)
print(f"\nCombined average similarity matrix exported to {output_combined_similarity_file}")

# Display the filtered DataFrame
print("\nCombined maximum similarity matrix:")
print(combined_similarity_df)




Combined average similarity matrix exported to ResultantMatrix4_AvgCombined_Similarity.xlsx

Combined maximum similarity matrix:
                CountryName      Year Population    Gender Continent
Country or Area    0.408547  0.199977   0.209362  0.155368  0.306172
Year               0.113636       1.0   0.104278  0.214888  0.128372
Area               0.143939  0.355004   0.115438  0.166466  0.117262
Sex                0.025641   0.17089   0.084241  0.232426   0.09693
Record Type        0.108734  0.194256   0.093036   0.16327  0.129959
Reliability        0.088889  0.157641   0.195098  0.123813  0.148564
Source Year        0.171717  0.512083   0.134296  0.174843  0.160105
Value              0.107226  0.140374   0.174703  0.165425  0.120494
Value Footnotes    0.169444   0.12213    0.24236  0.177373  0.200669


Threshold of 0.2 applied with One to One Cardinality

In [14]:
import pandas as pd
import numpy as np

# 'combined_similarity_df' is already available from previous computations

# Set the threshold
threshold = 0.2

# Filter the combined normalized DataFrame to include only values that meet or exceed the threshold
filtered_df = combined_similarity_df.applymap(lambda x: x if x >= threshold else np.nan)

# Export the filtered DataFrame to Excel
output_file_filtered = 'ResultantMatrix4_Threshold_Filtered.xlsx'
filtered_df.to_excel(output_file_filtered, index=True)
print(f"\nFiltered matching pairs that satisfy the threshold exported to {output_file_filtered}")

# Display the filtered DataFrame
print("\nFiltered Matching Pairs and Their Values (above threshold):")
print(filtered_df)


Filtered matching pairs that satisfy the threshold exported to ResultantMatrix4_Threshold_Filtered.xlsx

Filtered Matching Pairs and Their Values (above threshold):
                 CountryName      Year  Population    Gender  Continent
Country or Area     0.408547       NaN    0.209362       NaN   0.306172
Year                     NaN  1.000000         NaN  0.214888        NaN
Area                     NaN  0.355004         NaN       NaN        NaN
Sex                      NaN       NaN         NaN  0.232426        NaN
Record Type              NaN       NaN         NaN       NaN        NaN
Reliability              NaN       NaN         NaN       NaN        NaN
Source Year              NaN  0.512083         NaN       NaN        NaN
Value                    NaN       NaN         NaN       NaN        NaN
Value Footnotes          NaN       NaN    0.242360       NaN   0.200669


In [15]:
# Ensure one-to-one cardinality
used_un_columns = set()
final_matches = {}

for mediated_col in mediated_schema_columns:
    best_match = None
    best_value = 0
    for un_col in un_columns:
        value = filtered_df.loc[un_col, mediated_col]
        if pd.notna(value) and value > best_value and un_col not in used_un_columns:
            best_match = un_col
            best_value = value
    if best_match:
        final_matches[mediated_col] = (best_match, best_value)
        used_un_columns.add(best_match)

# Create a DataFrame to store the final matches
final_df = pd.DataFrame(columns=mediated_schema_columns, index=un_columns)

for mediated_col, (un_col, value) in final_matches.items():
    final_df.loc[un_col, mediated_col] = value

# Export the final DataFrame to Excel
output_file_final = 'ResultantMatrix4_OneToOne_Final.xlsx'
final_df.to_excel(output_file_final, index=True)
print(f"\nFinal one-to-one matching pairs exported to {output_file_final}")

# Display the final DataFrame
print("\nFinal One-to-One Matching Pairs:")
print(final_df)



Final one-to-one matching pairs exported to ResultantMatrix4_OneToOne_Final.xlsx

Final One-to-One Matching Pairs:
                CountryName Year Population    Gender Continent
Country or Area    0.408547  NaN        NaN       NaN       NaN
Year                    NaN  1.0        NaN       NaN       NaN
Area                    NaN  NaN        NaN       NaN       NaN
Sex                     NaN  NaN        NaN  0.232426       NaN
Record Type             NaN  NaN        NaN       NaN       NaN
Reliability             NaN  NaN        NaN       NaN       NaN
Source Year             NaN  NaN        NaN       NaN       NaN
Value                   NaN  NaN        NaN       NaN       NaN
Value Footnotes         NaN  NaN    0.24236       NaN       NaN


## Performance Measurement - Resultant Matrix 2

In [None]:
# Import the Ground Truth File
from google.colab import files
import pandas as pd

# Upload the files
print("\nUpload United Nations Ground Truth.xlsx:")
uploaded_kaggle = files.upload()


Upload United Nations Ground Truth.xlsx:


In [16]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

# Load Ground Truth data
ground_truth_file = 'Ground Truth.xlsx'
ground_truth_df = pd.read_excel(ground_truth_file, index_col=0)

# Load Final Matches data
final_file = 'ResultantMatrix4_OneToOne_Final.xlsx'
final_df = pd.read_excel(final_file, index_col=0)

# Ensure final_df and ground_truth_df have the same columns and indices
ground_truth_df = ground_truth_df[final_df.columns]

# Handle NaN values in final_df (replace NaN with 0 for simplicity, adjust as needed)
final_df = final_df.fillna(0)
ground_truth_df = ground_truth_df.fillna(0)

# Define a threshold for similarity scores
threshold = 0.2  # Adjust as needed based on your similarity scores

# Convert similarity scores to binary labels based on the threshold
y_pred = np.where(final_df.values >= threshold, 1, 0)
y_true = np.where(ground_truth_df.values > 0, 1, 0)

# Create the predicted match DataFrame
predicted_match_df = pd.DataFrame(y_pred, index=final_df.index, columns=final_df.columns)

# Print the predicted match DataFrame
print(predicted_match_df)
predicted_match_df.to_excel('ResultantMatrix4_Predicted_Match.xlsx')

# Calculate TP, FP, FN based on binary predictions and ground truth
TP = np.sum((y_pred == 1) & (y_true == 1))
FP = np.sum((y_pred == 1) & (y_true == 0))
FN = np.sum((y_pred == 0) & (y_true == 1))

# Calculate precision, recall, and F1 score
precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1 = 2 * (precision * recall) / (precision + recall)

print(f"True Positives: {TP}")
print(f"False Positives: {FP}")
print(f"False Negatives: {FN}")

# Print results
print(f"\nPrecision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")


                 CountryName  Year  Population  Gender  Continent
Country or Area            1     0           0       0          0
Year                       0     1           0       0          0
Area                       0     0           0       0          0
Sex                        0     0           0       1          0
Record Type                0     0           0       0          0
Reliability                0     0           0       0          0
Source Year                0     0           0       0          0
Value                      0     0           0       0          0
Value Footnotes            0     0           1       0          0
True Positives: 3
False Positives: 1
False Negatives: 1

Precision: 75.00%
Recall: 75.00%
F1 Score: 75.00%
