In [44]:
import pandas as pd
import os 
import glob

# Root of the local checkout. Set CITIES_LEARNING_DIR to run the notebook on
# another machine; the default is the original hard-coded location.
dir_path = os.environ.get("CITIES_LEARNING_DIR", "/Users/simon/Documents/repo/cities-learning")

# Glob pattern matching every feather chunk of the input data
file_pattern = os.path.join(dir_path, "data/OpenAlex/02_NA_added/city_works_df_NA_abstr_added_*.feather")

# glob returns matches in a filesystem-dependent order. Sort them so the row
# order of combined_df (and therefore which row survives a later
# drop_duplicates) is the same on every run.
files = sorted(glob.glob(file_pattern))
if not files:
    # Fail here with the pattern in the message instead of inside pd.concat
    raise FileNotFoundError(f"No feather files match {file_pattern!r}")

# Read each chunk and stack them into one DataFrame with a fresh 0..n-1 index
dfs = [pd.read_feather(file) for file in files]
combined_df = pd.concat(dfs, ignore_index=True)


In [46]:
# NOTE(review): this whole cell is a single bare triple-quoted string, so none of
# the statements inside it run; the only effect is that the notebook echoes the
# string's repr as the cell output. It holds a disabled summary of the values
# in the "abstract_filtering" column.
'''
print(combined_df.columns)

combined_df["abstract"][combined_df["abstract_filtering"] == "no translatable lang"]
print(combined_df["abstract_filtering"].unique())
print(f"abstracts entirely english: " + str(sum(combined_df["abstract_filtering"] == "already english")))
print(f"abstracts mostly english: " + str(sum(combined_df["abstract_filtering"] == "mostly english")))
print(f"abstracts filtered english part: " + str(sum(combined_df["abstract_filtering"] == "filtered english")))
print(f"abstracts translated: " + str(sum(combined_df["abstract_filtering"] == "translated full text")))
print(f"abstracts with no translatable lang: " + str(sum(combined_df["abstract_filtering"] == "no translatable lang")))
print(f"abstracts unknown: " + str(sum(combined_df["abstract_filtering"] == "unknown")))

'''

'\nprint(combined_df.columns)\n\ncombined_df["abstract"][combined_df["abstract_filtering"] == "no translatable lang"]\nprint(combined_df["abstract_filtering"].unique())\nprint(f"abstracts entirely english: " + str(sum(combined_df["abstract_filtering"] == "already english")))\nprint(f"abstracts mostly english: " + str(sum(combined_df["abstract_filtering"] == "mostly english")))\nprint(f"abstracts filtered english part: " + str(sum(combined_df["abstract_filtering"] == "filtered english")))\nprint(f"abstracts translated: " + str(sum(combined_df["abstract_filtering"] == "translated full text")))\nprint(f"abstracts with no translatable lang: " + str(sum(combined_df["abstract_filtering"] == "no translatable lang")))\nprint(f"abstracts unknown: " + str(sum(combined_df["abstract_filtering"] == "unknown")))\n\n'

In [47]:
# Exact deduplication: keep the first row for every "id" and report the effect.
pre_count = len(combined_df)
# The original printed str() here, i.e. an empty string instead of the count
print("observations before exact deduplication:" + str(pre_count))
combined_df = combined_df.drop_duplicates(subset=["id"])
duplicates_dropped = pre_count - len(combined_df)
print("exact deduplicates dropped:" + str(duplicates_dropped))
print("observations after exact deduplication:" + str(len(combined_df)))

# Missing values per column after deduplication; each label names the column it counts
print("Observations with missing abstracts after exact deduplication: " + str(combined_df["abstract"].isna().sum()))
print("Observations with missing titles after exact deduplication: " + str(combined_df["title"].isna().sum()))
print("Observations with missing authors after exact deduplication: " + str(combined_df["authors"].isna().sum()))
print("Observations with missing publication years after exact deduplication: " + str(combined_df["publication_year"].isna().sum()))

# Show the rows that have no abstract
print(combined_df[combined_df["abstract"].isna()])

observations before exact deduplication:
exact deduplicates dropped:27392
observations after exact deduplication:254161
Observations with missing abstracts after exact deduplication: 11208
Observations with missing titles after exact deduplication: 339
Observations with missing titles after exact deduplication: 4957
Observations with missing titles after exact deduplication: 0
                                      id  \
2       https://openalex.org/W2541041155   
10      https://openalex.org/W2749538309   
14      https://openalex.org/W2275251747   
39      https://openalex.org/W1596891619   
55      https://openalex.org/W4378596524   
...                                  ...   
274235  https://openalex.org/W4234346229   
274494  https://openalex.org/W4241822235   
275274  https://openalex.org/W2793045527   
280902  https://openalex.org/W4200003656   
280931  https://openalex.org/W2000072941   

                                                    title abstract  \
2       Framing as so

In [50]:
from semhash import SemHash
import numpy as np

# Work on a copy so combined_df keeps its original dtypes and missing values
filtered_df = combined_df.copy()

cols_to_string = ["title", "authors", "abstract", "publication_year"]

# Count missing values BEFORE the string cast. After astype(str) a missing value
# is only recognisable by its text, and that text differs by kind (None -> "None",
# NaN -> "nan", pd.NA -> "<NA>"), so comparing to "None" alone can undercount.
missing_counts = {}
for col in cols_to_string:
    is_missing = filtered_df[col].isna()
    missing_counts[col] = int(is_missing.sum())
    # Give every kind of missing value the single sentinel string "None"
    filtered_df[col] = filtered_df[col].astype(str).mask(is_missing, "None")

# Convert to list of dictionaries (one dict per work)
records = filtered_df[["id", "title", "abstract", "authors", "publication_year"]].to_dict(orient="records")

missing_titles = missing_counts["title"]
missing_abstracts = missing_counts["abstract"]
missing_authors = missing_counts["authors"]
missing_pub_years = missing_counts["publication_year"]

print(f"Missing abstracts: {missing_abstracts}")
print(f"Missing titles: {missing_titles}")
print(f"Missing authors: {missing_authors}")
print(f"Missing publication years: {missing_pub_years}")


Missing abstracts: 11208
Missing titles: 339
Missing authors: 4957
Missing publication years: 0


In [None]:
from sentence_transformers import SentenceTransformer
import torch

# Add the field that gets embedded: title and abstract joined by one space.
for entry in records:
    entry["text"] = f"{entry['title']} {entry['abstract']}"

# Sentence-embedding model, placed on the GPU when one is available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# Build SemHash over the "text" field only, using the model above
semhash = SemHash.from_records(
    records=records,
    columns=["text"],
    model=model,
)

# Threshold passed to SemHash's self-deduplication
threshold = 0.88
dedup_result = semhash.self_deduplicate(threshold=threshold)

In [None]:
# Small container class so the filtered results can be read by attribute (record, exact, duplicates)
from dataclasses import dataclass

@dataclass
class DuplicateRecord:
    """One original record together with the duplicates accepted for it.

    Instances are created in this notebook with ``record=``, ``duplicates=``
    and ``exact=`` keyword arguments.
    """
    record: dict  # the original record the duplicates were matched against
    exact: bool  # exact-match flag
    duplicates: list  # list of (record, score) tuples
    
def _has_text(rec, *keys):
    """Return True if any of `keys` holds a non-empty value other than the string "none" (any case)."""
    for key in keys:
        value = rec.get(key)
        if value and str(value).strip().lower() != "none":
            return True
    return False


def _parse_year(rec):
    """Return the record's publication year as an int, or None if it cannot be parsed."""
    try:
        return int(float(rec.get("publication_year")))
    except (TypeError, ValueError, OverflowError):
        return None


# Originals for which no candidate survives the stricter rules
non_matching_records = []

# Originals that keep at least one candidate, wrapped as DuplicateRecord
filtered_duplicates = []

# Re-check every duplicate group reported by SemHash
for dup in dedup_result.duplicates:
    original = dup.record
    original_year = _parse_year(original)

    # Candidates of this group that pass the stricter rules
    valid_matches = []

    for rec, score in dup.duplicates:
        duplicate_year = _parse_year(rec)

        # The records are built with the key "abstract". The original code read
        # "abstract_en", which those records do not contain, so has_abstract was
        # always falsy. "abstract_en" is still accepted in case a record uses it.
        has_abstract = _has_text(rec, "abstract", "abstract_en")
        has_title = _has_text(rec, "title")
        # has_authors = _has_text(rec, "authors")

        # An unparsable year can never count as "close"
        close_years = (
            original_year is not None
            and duplicate_year is not None
            and abs(duplicate_year - original_year) <= 1
        )

        # Required score depends on what text the candidate has:
        #   abstract present (with or without title) -> score > 0.9
        #   title only                               -> score > 0.92
        #   neither (no such observations here, kept for completeness) -> score > 0.99
        if has_abstract:
            min_score = 0.9
        elif has_title:
            min_score = 0.92
        else:
            min_score = 0.99

        if close_years and score > min_score:
            valid_matches.append((rec, score))

    # Store the group only if it keeps valid matches; otherwise return the original
    if valid_matches:
        filtered_duplicates.append(
            DuplicateRecord(
                record=original,
                duplicates=valid_matches,
                # carry the group's exact flag over when it has one instead of forcing False
                exact=getattr(dup, "exact", False),
            )
        )
    else:
        non_matching_records.append(original)

In [None]:
# Sanity check: every SemHash duplicate group must land in exactly one of the two lists.
n_restricted = len(filtered_duplicates)
n_rejected = len(non_matching_records)
n_initial = len(dedup_result.duplicates)

print(f"Duplicates based on more restrictive criteria with individual thresholds: {n_restricted}")
print(f"Number of duplicates that are likely falsely detected as such: {n_rejected}")
print(f"Restricted duplicates and those duplicates that do not belong to the restricted duplicates but had initially been predicted as such = {n_restricted + n_rejected} must be equal to the initial number of duplicates = {n_initial}")

In [None]:
# Records rejected by the stricter rules are not duplicates after all:
# build a new list holding SemHash's deduplicated records followed by them.
filtered_deduplicated = [*dedup_result.deduplicated, *non_matching_records]

In [None]:
## Convert the result lists to DataFrames
# SemHash output: the original record of every duplicate group, and the kept records
dup_records = list(map(lambda group: group.record, dedup_result.duplicates))
duplicates_hash_df = pd.DataFrame(dup_records)
deduplicated_hash_df = pd.DataFrame(dedup_result.deduplicated)

# Output of the additional, stricter filtering
filtered_deduplicated_df = pd.DataFrame(filtered_deduplicated)
filtered_duplicates_df = pd.DataFrame(filtered_duplicates)

In [None]:
# Summary: how many records SemHash and the stricter rules keep or flag as duplicates
print("----------SemHash numbers-------------------------------------------------------------------------")
print(f"SemHash deduplicated: {len(deduplicated_hash_df)}")
print(f"SemHash duplicates: {len(duplicates_hash_df)}")
print("----------Restricted filtering numbers------------------------------------------------------------")
# The original printed these two values under each other's label
print(f"Restricted deduplicated: {len(filtered_deduplicated)}")
print(f"Restricted duplicates: {len(filtered_duplicates)}")
print("----------Additional------------------------------------------------------------------------------")
print(f"SemHash duplicates that are no duplicates according to restricted criteria: {len(duplicates_hash_df)- len(filtered_duplicates)}")
# Consistency check: the growth of the deduplicated set equals the number of rejected duplicates
print(f"All wrongly assigned SemHash duplicates are correctly reasssigned: {len(filtered_deduplicated)-len(deduplicated_hash_df) == len(duplicates_hash_df) - len(filtered_duplicates)}")
print(f"Final percentage of removed: {(len(filtered_duplicates_df)/ (len(deduplicated_hash_df) + len(duplicates_hash_df))):.2%}")

In [None]:
def filter_duplicates_by_na_fields(duplicates, require_abstract=True, require_title=True):
    """
    Print the 10 duplicate groups with the lowest average score whose original
    record matches the requested abstract/title availability.

    Args:
        duplicates (list): Objects exposing ``record`` (dict), ``duplicates``
            (list of ``(record, score)`` tuples) and ``exact``.
        require_abstract (bool): If True the original record must have an
            abstract; if False it must NOT have one.
        require_title (bool): If True the original record must have a title;
            if False it must NOT have one.

    Returns:
        None. All results are printed.
    """
    def _abstract_of(rec):
        # The records in this notebook store the abstract under "abstract";
        # "abstract_en" is accepted as a fallback for records using that key.
        return rec.get("abstract") or rec.get("abstract_en") or ""

    def _is_present(value):
        # Always returns a real bool so it can be compared with the require_* flags.
        # Empty values and the string "none" (any case) count as missing.
        return bool(value) and str(value).strip().lower() != "none"

    def _clip(value):
        # At most 100 characters of the value as text; None is shown as empty
        return str(value if value is not None else "")[:100]

    if not duplicates:
        print("No duplicates to check.")
        return

    filtered = []

    for dup in duplicates:
        has_abstract = _is_present(_abstract_of(dup.record))
        has_title = _is_present(dup.record.get("title"))
        # has_authors = _is_present(dup.record.get("authors"))

        condition = (has_abstract == require_abstract) and (has_title == require_title)

        # Groups without candidates have no average score and are skipped
        if condition and dup.duplicates:
            avg_score = sum(score for _, score in dup.duplicates) / len(dup.duplicates)
            filtered.append((avg_score, dup))

    if not filtered:
        print("\nNo duplicates found based on the specified NA conditions.")
        return

    # Sort by average score and take the 10 lowest
    filtered.sort(key=lambda x: x[0])
    filtered = filtered[:10]

    for avg_score, dup in filtered:
        print("\n=== DUPLICATE RECORD ===")
        print(f"Original Title : {_clip(dup.record.get('title', ''))}")
        print(f"Authors        : {_clip(dup.record.get('authors', ''))}")
        print(f"Abstract       : {_clip(_abstract_of(dup.record))}")
        print(f"Year           : {_clip(dup.record.get('publication_year', ''))}")
        print(f"Exact Match    : {dup.exact}")
        print(f"Avg. Score     : {avg_score:.3f}")
        print("Duplicates:")
        for rec, score in dup.duplicates:
            print(f"  - Title   : {_clip(rec.get('title', ''))}")
            print(f"    Authors : {_clip(rec.get('authors', ''))}")
            print(f"    Abstract: {_clip(_abstract_of(rec))}")
            print(f"    Year    : {_clip(rec.get('publication_year', ''))}")
            print(f"    Score   : {score:.3f}")


In [None]:
# Lowest-scoring groups whose original record has both an abstract and a title
filter_duplicates_by_na_fields(
    duplicates=filtered_duplicates,
    require_abstract=True,
    require_title=True,
)

In [None]:
# Lowest-scoring groups whose original record has an abstract but no title
filter_duplicates_by_na_fields(
    duplicates=filtered_duplicates,
    require_abstract=True,
    require_title=False,
)

In [None]:
# Lowest-scoring groups whose original record has a title but no abstract.
# The original call repeated the "abstract but no title" arguments of the
# previous cell, so this combination was never inspected.
filter_duplicates_by_na_fields(filtered_duplicates, require_abstract=False, require_title=True)

In [None]:
import pandas as pd
import math
import os

# All paths below are relative to the repository root
os.chdir(dir_path)

chunk_size = 20000
output_dir = "data/OpenAlex/05_deduplicated"
# Create the target directory first so the first write does not fail on a fresh checkout
os.makedirs(output_dir, exist_ok=True)

total_records = len(filtered_deduplicated_df)
num_chunks = math.ceil(total_records / chunk_size)

# Write the deduplicated records in consecutive chunks of at most chunk_size rows
for i in range(num_chunks):
    start = i * chunk_size
    end = min((i + 1) * chunk_size, total_records)
    chunk = filtered_deduplicated_df.iloc[start:end]

    # Chunk files are numbered from 1
    filename = f"{output_dir}/city_works_df_NA_abstr_added_dedup_{i+1}.parquet"
    chunk.to_parquet(filename)
    print(f"Saved {filename} with records {start} to {end-1}")
