In [356]:
import pandas as pd
import os 
import glob

dir_path = "/Users/simon/Documents/repo/cities-learning"

# Create the full search pattern
file_pattern = os.path.join(dir_path, "data/OpenAlex/city_works_df_NA_abstr_added*.feather")

# Find all matching files
files = glob.glob(file_pattern)

# Read all the files into a list of DataFrames
dfs = [pd.read_feather(file) for file in files]

# Optionally, concatenate them into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# Optionally concatenate them into one DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

In [332]:
pre_count = len(combined_df)
print("observations before exact deduplication:" +  str())
combined_df = combined_df.drop_duplicates(subset=["id"])
duplicates_dropped = pre_count-len(combined_df)
print("exact deduplicates dropped:" + str(duplicates_dropped))
print("observations after exact deduplication:" + str(len(combined_df)))
print("Observations with missing abstracts after exact deduplication: " + str(combined_df["abstract"].isna().sum()))
print("Observations with missing titles after exact deduplication: " + str(combined_df["title"].isna().sum()))
print("Observations with missing titles after exact deduplication: " + str(combined_df["authors"].isna().sum()))
print("Observations with missing titles after exact deduplication: " + str(combined_df["publication_year"].isna().sum()))
print(combined_df[combined_df["abstract"].isna()])

observations before exact deduplication:
exact deduplicates dropped:676
observations after exact deduplication:251124
Observations with missing abstracts after exact deduplication: 11094
Observations with missing titles after exact deduplication: 337
Observations with missing titles after exact deduplication: 4878
Observations with missing titles after exact deduplication: 0
                                      id  \
8       https://openalex.org/W2796122528   
24      https://openalex.org/W2411062836   
45      https://openalex.org/W4286840151   
87      https://openalex.org/W3201109727   
184     https://openalex.org/W2473466218   
...                                  ...   
251604  https://openalex.org/W2611687965   
251611  https://openalex.org/W3202014191   
251613  https://openalex.org/W4312362558   
251640  https://openalex.org/W4404214858   
251736  https://openalex.org/W2785144348   

                                                    title abstract  \
8       Prikaz knjige F

In [333]:
from semhash import SemHash
import numpy as np

filtered_df = combined_df.copy()

cols_to_string = ["title", "authors", "abstract", "publication_year"]

for col in cols_to_string:
    filtered_df[col] = filtered_df[col].astype(str)

# Convert to list of dictionaries
records = filtered_df[["id", "title", "abstract", "authors", "publication_year"]].to_dict(orient="records")

# Now count missing values
missing_titles = sum(1 for rec in records if rec["title"] == "None")
missing_abstracts = sum(1 for rec in records if rec["abstract"] == "None")
missing_authors = sum(1 for rec in records if rec["authors"] == "None")
missing_pub_years = sum(1 for rec in records if rec["publication_year"] == "None")

print(f"Missing abstracts: {missing_abstracts}")
print(f"Missing titles: {missing_titles}")
print(f"Missing authors: {missing_authors}")
print(f"Missing publication years: {missing_pub_years}")


Missing abstracts: 11094
Missing titles: 337
Missing authors: 4878
Missing publication years: 0


In [334]:
from sentence_transformers import SentenceTransformer

# Load multilingual model
# model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# initialize SemHash with that model
semhash = SemHash.from_records(records=records, columns=["title", "abstract", "authors", "publication_year"], 
                               #model=model
                              )

threshold = 0.85
dedup_result = semhash.self_deduplicate(threshold=threshold)

In [335]:
# Create a class so that I can call attributes 
@dataclass
class DuplicateRecord:
    record: dict
    exact: bool
    duplicates: list
    
# A list to collect records with no valid duplicates
non_matching_records = []

# A new list to store filtered duplicates that meet the match criteria
filtered_duplicates = []

# Iterate through all duplicate records
for dup in dedup_result.duplicates:
    original = dup.record
    original_year = int(original.get("publication_year"))

    # List to hold valid matches for this original record
    valid_matches = []

    for rec, score in dup.duplicates:
        duplicate_year = int(rec.get("publication_year"))

        has_abstract = rec.get('abstract') and rec['abstract'].strip().lower() != "none"
        has_title = rec.get('title') and rec['title'].strip().lower() != "none"
        has_authors = rec.get('authors') and rec['authors'].strip().lower() != "none"
        
        close_years = abs(duplicate_year - original_year) <= 1
        match = False

        # Apply matching rules
        if close_years and has_abstract and has_authors and has_title and score > 0.85:
            match = True
        elif close_years and has_abstract and has_authors and not has_title and score > 0.88:
            match = True
        elif close_years and has_abstract and not has_authors and has_title and score > 0.92:
            match = True
        elif close_years and not has_abstract and has_authors and has_title and score > 0.90:
            match = True
        elif close_years and has_abstract and not has_title and has_authors and score > 0.90:
            match = True
        elif close_years and has_abstract and not has_title and not has_authors and score > 0.94:
            match = True
        elif close_years and not has_abstract and has_title and not has_authors and score > 0.97:
            match = True
        elif close_years and not has_abstract and not has_title and has_authors and score > 0.99:
            match = True
        elif close_years and not has_abstract and not has_authors and not has_title and score > 0.99:
            match = True

        if match:
            valid_matches.append((rec, score))

    # Store record only if it has valid matches
    if valid_matches:
        filtered_duplicates.append(
            DuplicateRecord(record=original, duplicates=valid_matches, exact=False)
        )
    else:
        non_matching_records.append(original)

In [337]:
# check that the numbers are right:
print(f"Duplicates based on more restrictive criteria with individual thresholds: {sum(1 for record in filtered_duplicates)}")
print(f"Number of duplicates that are likely falsely detected as such: {len(non_matching_records)}")
print(f"Restricted duplicates and those duplicates that do not belong to the restricted duplicates but had initially been predicted as such = {sum(1 for record in filtered_duplicates) + len(non_matching_records)} must be equal to the initial number of duplicates = {len(dedup_result.duplicates)}")

Duplicates based on more restrictive criteria with individual thresholds: 17384
Number of duplicates that are likely falsely detected as such: 1775
Restricted duplicates and those duplicates that do not belong to the restricted duplicates but had initially been predicted as such = 19159 must be equal to the initial number of duplicates = 19159


In [339]:
# add false positive duplicates to the deduplicated list
filtered_deduplicated = dedup_result.deduplicated.copy()
filtered_deduplicated.extend(non_matching_records)

In [340]:
## transform to dfs 
# SemHash: duplicates and the deduplicated records from the semhashing as dataframes
deduplicated_hash_df = pd.DataFrame(dedup_result.deduplicated)
dup_records = [dup.record for dup in dedup_result.duplicates]
duplicates_hash_df = pd.DataFrame(dup_records)
# customized additional filtering:
filtered_duplicates_df = pd.DataFrame(filtered_duplicates)
filtered_deduplicated_df = pd.DataFrame(filtered_deduplicated)

In [329]:
print(total_records)
print(total_records)

{'id': 'https://openalex.org/W4244269897', 'title': 'Fig. 5. Map displaying the slope (colour scale) and adjusted R2 (size scale) for the linear regression of green infrastructure variables on per capita income. A separate linear model was performed for each municipal district (see Fig. 1), and points reflect the municipality centroids. The reader can interpret these values as the slope and fit of the linear trend lines plotted in Fig. 3 except here they are stratified by district municipality and not race.', 'abstract': 'Urban green infrastructure provides ecosystem services that are essential to human wellbeing. A dearth of national-scale assessments in the Global South has precluded the ability to explore how political regimes, such as the forced racial segregation in South Africa during and after Apartheid, have influenced the extent of and access to green infrastructure over time. We investigate whether there are disparities in green infrastructure distributions across race and in

In [375]:
# remove all those ids that are insided 
print("----------SemHash numbers-------------------------------------------------------------------------")
print(f"SemHash deduplicated: {len(deduplicated_hash_df)}")
print(f"SemHash duplicates: {len(duplicates_hash_df)}")
print("----------Restricted filtering numbers------------------------------------------------------------")
print(f"Restricted deduplicated: {len(filtered_duplicates)}")
print(f"Restricted duplicates: {len(filtered_deduplicated)}")
print("----------Additional------------------------------------------------------------------------------")
print(f"SemHash duplicates that are no duplicates according to restricted criteria: {len(duplicates_hash_df)- len(filtered_duplicates)}")
print(f"All wrongly assigned SemHash duplicates are correctly reasssigned: {len(filtered_deduplicated)-len(deduplicated_hash_df) == len(duplicates_hash_df) - len(filtered_duplicates)}")
print(f"Final percentage of removed: {(len(filtered_duplicates_df)/ (len(deduplicated_hash_df) + len(duplicates_hash_df))):.2%}")

----------SemHash numbers-------------------------------------------------------------------------
SemHash deduplicated: 231965
SemHash duplicates: 19159
----------Restricted filtering numbers------------------------------------------------------------
Restricted deduplicated: 17384
Restricted duplicates: 233740
----------Additional------------------------------------------------------------------------------
SemHash duplicates that are no duplicates according to restricted criteria: 1775
All wrongly assigned SemHash duplicates are correctly reasssigned: True
Final percentage of removed: 6.92%


In [363]:
def filter_duplicates_by_na_fields(duplicates, require_abstract=True, require_title=True, require_authors=True):
    """
    Filters and prints the 10 duplicate groups with the lowest average score,
    where the original record meets the specified NA conditions.
    
    Args:
        duplicates (list): List of DuplicateRecord objects.
        require_abstract (bool): If True, abstract must NOT be NA.
        require_title (bool): If True, title must NOT be NA.
        require_authors (bool): If True, authors must NOT be NA.
    """
    if not duplicates:
        print("No duplicates to check.")
        return

    filtered = []

    for dup in duplicates:
        has_abstract = dup.record.get('abstract') and dup.record['abstract'].strip().lower() != "none"
        has_title = dup.record.get('title') and dup.record['title'].strip().lower() != "none"
        has_authors = dup.record.get('authors') and dup.record['authors'].strip().lower() != "none"

        condition = (has_abstract == require_abstract) and (has_title == require_title) and (has_authors == require_authors)

        if condition and dup.duplicates:
            avg_score = sum(score for _, score in dup.duplicates) / len(dup.duplicates)
            filtered.append((avg_score, dup))

    if not filtered:
        print("\nNo duplicates found based on the specified NA conditions.")
        return

    # Sort by average score and take the 10 lowest
    filtered.sort(key=lambda x: x[0])
    filtered = filtered[:10]

    for avg_score, dup in filtered:
        print("\n=== DUPLICATE RECORD ===")
        print(f"Original Title : {dup.record.get('title', '')[:100]}")
        print(f"Authors        : {dup.record.get('authors', '')[:100]}")
        print(f"Abstract       : {dup.record.get('abstract', '')[:100]}")
        print(f"Year           : {str(dup.record.get('publication_year', ''))[:100]}")
        print(f"Exact Match    : {dup.exact}")
        print(f"Avg. Score     : {avg_score:.3f}")
        print("Duplicates:")
        for rec, score in dup.duplicates:
            print(f"  - Title   : {rec.get('title', '')[:100]}")
            print(f"    Authors : {rec.get('authors', '')[:100]}")
            print(f"    Abstract: {rec.get('abstract', '')[:100]}")
            print(f"    Year    : {str(rec.get('publication_year', ''))[:100]}")
            print(f"    Score   : {score:.3f}")


In [364]:
filter_duplicates_by_na_fields(filtered_duplicates, require_abstract=True, require_title=True, require_authors=True)


=== DUPLICATE RECORD ===
Original Title : Temperature of Paved Streets in Urban Mockups and Its Implication of Reflective Cool Pavements
Authors        : Yi Zhang, Peiyuan Wei, Lei Wang, Yinghong Qin
Abstract       : In summer, urban heat islands increase building cooling demands, aggravate air pollution, and cause 
Year           : 2021
Exact Match    : False
Avg. Score     : 0.850
Duplicates:
  - Title   : Experimental Study on the Thermal Characteristics of Urban Mockups With Different Paved Streets
    Authors : Yinghong Qin, Peiyuan Wei, Junsong Wang, Kanghao Tan
    Abstract: Abstract Pavements in urban area absorb more sunlight due to the canyon-like geomorphology of the ur
    Year    : 2021
    Score   : 0.850

=== DUPLICATE RECORD ===
Original Title : Enhancing Sustainable Urban Design and Development Through Data-Driven Approaches and Interpretable 
Authors        : Nasim Eslamirad, Mahdi Rasoulinezhad, Francesco De Luca, Sadok Ben Yahia, Kimmo Lylykangas
Abstract       : W

In [352]:
filter_duplicates_by_na_fields(filtered_duplicates, require_abstract=True, require_title=True, require_authors=False)


=== DUPLICATE RECORD ===
Original Title : Preface
Authors        : None
Abstract       : Organized by Beijing Jiaotong University (Beijing, China), the 2021 2nd International Symposium on W
Year           : 2022
Exact Match    : False
Avg. Score     : 0.921
Duplicates:
  - Title   : Preface
    Authors : None
    Abstract: 3 rd International Conference on Environmental Design (ICED2022) is focused on the most recent scien
    Year    : 2022
    Score   : 0.921

=== DUPLICATE RECORD ===
Original Title : Probability distributions for changes in regional employment shares of manufacturing
Authors        : None
Abstract       : Manufacturing, a cornerstone of economic growth and development, has undergone profound changes in r
Year           : 2023
Exact Match    : False
Avg. Score     : 0.921
Duplicates:
  - Title   : Distribution of manufacturing employment in TL3 regions, 2018
    Authors : None
    Abstract: Manufacturing, a cornerstone of economic growth and development, has undergon

In [365]:
filter_duplicates_by_na_fields(filtered_duplicates, require_abstract=True, require_title=False, require_authors=True)


No duplicates found based on the specified NA conditions.


In [366]:
filter_duplicates_by_na_fields(filtered_duplicates, require_abstract=True, require_title=False, require_authors=False)


=== DUPLICATE RECORD ===
Original Title : None
Authors        : None
Abstract       : ASEAN is regarded as an economically dynamic region with notable policies towards economic openness,
Year           : 2019
Exact Match    : False
Avg. Score     : 0.943
Duplicates:
  - Title   : None
    Authors : None
    Abstract: This study examines the impacts of crucial factors associated with Vietnam's socio-economic developm
    Year    : 2018
    Score   : 0.943

=== DUPLICATE RECORD ===
Original Title : None
Authors        : None
Abstract       : This study examines the impacts of crucial factors associated with Vietnam's socio-economic developm
Year           : 2018
Exact Match    : False
Avg. Score     : 0.943
Duplicates:
  - Title   : None
    Authors : None
    Abstract: ASEAN is regarded as an economically dynamic region with notable policies towards economic openness,
    Year    : 2019
    Score   : 0.943

=== DUPLICATE RECORD ===
Original Title : None
Authors        : None
Abstract  

In [367]:
filter_duplicates_by_na_fields(filtered_duplicates, require_abstract=False, require_title=True, require_authors=True)


=== DUPLICATE RECORD ===
Original Title : Spatial Effects and Influencing Factors of Urban Carbon Emissions in the Yangtze River Economic Belt
Authors        : 欣玥 庞
Abstract       : None
Year           : 2023
Exact Match    : False
Avg. Score     : 0.900
Duplicates:
  - Title   : Analysis of the Spatial and Temporal Distribution of Carbon Footprints and Influencing Factors in Sm
    Authors : 兰 陈
    Abstract: None
    Year    : 2023
    Score   : 0.900

=== DUPLICATE RECORD ===
Original Title : To Opt-in or to Cop Out: COP26 and the Policy Dynamics of Decarbonising African Cities
Authors        : Ayodele Asekomeh, Obindah Gershon, Smith I. Azubuike
Abstract       : None
Year           : 2022
Exact Match    : False
Avg. Score     : 0.901
Duplicates:
  - Title   : Introduction: Decarbonising African Cities in a Carbon-Constrained World
    Authors : Smith I. Azubuike, Obindah Gershon, Ayodele Asekomeh
    Abstract: None
    Year    : 2022
    Score   : 0.901

=== DUPLICATE RECORD ===
O

In [368]:
filter_duplicates_by_na_fields(filtered_duplicates, require_abstract=False, require_title=True, require_authors=False)


=== DUPLICATE RECORD ===
Original Title : Climate-Resilient City
Authors        : None
Abstract       : None
Year           : 2022
Exact Match    : False
Avg. Score     : 0.973
Duplicates:
  - Title   : Urban Climate Resilience
    Authors : None
    Abstract: None
    Year    : 2022
    Score   : 0.973

=== DUPLICATE RECORD ===
Original Title : Urban Climate Resilience
Authors        : None
Abstract       : None
Year           : 2022
Exact Match    : False
Avg. Score     : 0.973
Duplicates:
  - Title   : Climate-Resilient City
    Authors : None
    Abstract: None
    Year    : 2022
    Score   : 0.973

=== DUPLICATE RECORD ===
Original Title : CO2 emissions from urban transport by mode and scenario
Authors        : None
Abstract       : None
Year           : 2019
Exact Match    : False
Avg. Score     : 0.973
Duplicates:
  - Title   : CO2 emissions from domestic non-urban passenger transport by mode and scenario
    Authors : None
    Abstract: None
    Year    : 2019
    Score   : 0

In [369]:
filter_duplicates_by_na_fields(filtered_duplicates, require_abstract=False, require_title=False, require_authors=True)


No duplicates found based on the specified NA conditions.


In [370]:
filter_duplicates_by_na_fields(filtered_duplicates, require_abstract=False, require_title=False, require_authors=False)


No duplicates found based on the specified NA conditions.


In [382]:
import pandas as pd
import math
import os

os.chdir(dir_path)

chunk_size = 20000
total_records = len(filtered_deduplicated_df)
num_chunks = math.ceil(total_records / chunk_size)

for i in range(num_chunks):
    start = i * chunk_size
    end = min((i + 1) * chunk_size, total_records)
    chunk = filtered_deduplicated_df.iloc[start:end]
    
    filename = f"data/OpenAlex/04_deduplicated/city_works_df_NA_abstr_added_dedup_{i+1}.parquet"
    chunk.to_parquet(filename)
    print(f"Saved {filename} with records {start} to {end-1}")


Saved data/OpenAlex/deduplicated_and_translated/city_works_df_NA_abstr_added_dedup_1.parquet with records 0 to 19999
Saved data/OpenAlex/deduplicated_and_translated/city_works_df_NA_abstr_added_dedup_2.parquet with records 20000 to 39999
Saved data/OpenAlex/deduplicated_and_translated/city_works_df_NA_abstr_added_dedup_3.parquet with records 40000 to 59999
Saved data/OpenAlex/deduplicated_and_translated/city_works_df_NA_abstr_added_dedup_4.parquet with records 60000 to 79999
Saved data/OpenAlex/deduplicated_and_translated/city_works_df_NA_abstr_added_dedup_5.parquet with records 80000 to 99999
Saved data/OpenAlex/deduplicated_and_translated/city_works_df_NA_abstr_added_dedup_6.parquet with records 100000 to 119999
Saved data/OpenAlex/deduplicated_and_translated/city_works_df_NA_abstr_added_dedup_7.parquet with records 120000 to 139999
Saved data/OpenAlex/deduplicated_and_translated/city_works_df_NA_abstr_added_dedup_8.parquet with records 140000 to 159999
Saved data/OpenAlex/deduplicat