In [60]:
import pandas as pd
import re

# Load dataset
df = pd.read_excel("Journal_Articles.xlsx")  # Replace with your dataset

# Convert to lowercase and remove special characters for authors
def clean_text(text):
    text = str(text).lower().strip()  # Convert to lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters
    return text

df["Standardized_Name"] = df["Author_Name"].apply(clean_text)

print(df.head())  # Check cleaned names


      ID                                                URL  \
0  10681  https://www.sciencedirect.com/science/article/...   
1   9394  https://www.sciencedirect.com/science/article/...   
2   2575  https://www.sciencedirect.com/science/article/...   
3   5696  https://www.sciencedirect.com/science/article/...   
4  10114  https://www.sciencedirect.com/science/article/...   

              Journal_Title                  Volume_Issue  month_year  \
0  Decision Support Systems             Volume 3, Issue 2        1987   
1  Decision Support Systems            Volume 26, Issue 2        1999   
2  Decision Support Systems                    Volume 112        2018   
3  Decision Support Systems            Volume 52, Issue 3        2012   
4  Decision Support Systems  Volume 12, Issues 4Ã¢â‚¬â€œ5        1994   

                                            Abstract Keywords  \
0  A message based model of a Distributed Decisio...     "[]"   
1  In theories of strategic management, organizat...

In [64]:
df=df.fillna('NULL')

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert names to TF-IDF vectors (character-based n-grams)
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2,3))
tfidf_matrix = vectorizer.fit_transform(df["Author_Name"])

print(tfidf_matrix.shape)  # Check matrix size


(10740, 8692)


In [66]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute similarity scores
similarity_matrix = cosine_similarity(tfidf_matrix)

# Convert to DataFrame
similarity_df = pd.DataFrame(similarity_matrix, index=df["Author_Name"], columns=df["Author_Name"])

print(similarity_df.head())  # Check similarity scores


Author_Name          A Burns  A Rutges  A. Ãƒâ€“zaygen  A. Almansoori  \
Author_Name                                                             
A Burns             1.000000  0.026172        0.000000       0.042872   
A Rutges            0.026172  1.000000        0.025127       0.000000   
A. Ãƒâ€“zaygen      0.000000  0.025127        1.000000       0.067599   
A. Almansoori       0.042872  0.000000        0.067599       1.000000   
LuÃƒÂ­s A. Almeida  0.000000  0.000000        0.093744       0.299299   

Author_Name         LuÃƒÂ­s A. Almeida  A. Bosman  A. Bourouis  A. Dailianas  \
Author_Name                                                                    
A Burns                       0.000000   0.032677     0.069250      0.000000   
A Rutges                      0.000000   0.000000     0.000000      0.000000   
A. Ãƒâ€“zaygen                0.093744   0.082106     0.069012      0.075934   
A. Almansoori                 0.299299   0.177989     0.093751      0.110565   
LuÃƒÂ­s 

In [68]:
# Set similarity threshold (0.75 means 75% similar)
threshold = 0.75  

# Store matched names
matches = {}

for i, name in enumerate(df["Author_Name"]):
    for j, other_name in enumerate(df["Author_Name"]):
        if i != j and similarity_matrix[i][j] > threshold:
            matches.setdefault(name, []).append(other_name)

# Convert to DataFrame
matches_df = pd.DataFrame(list(matches.items()), columns=["Original Name", "Similar Names"])
print(matches_df)


              Original Name                                      Similar Names
0          Andy Philippakis                                   [A. Philippakis]
1     Andrew S. Philippakis                                   [A. Philippakis]
2       A. Tomasz Jarmoszko         [A. Tomasz Jarmoszko, A. Tomasz Jarmoszko]
3             A.B. Whinston  [A.B. Whinston, A.B. Whinston, A.B. Whinston, ...
4               A. Whinston  [A.B. Whinston, A.B. Whinston, A.B. Whinston, ...
...                     ...                                                ...
1980             Zhou Zhang  [Xiangzhou Zhang, Xiangzhou Zhang, Xiangzhou Z...
1981      Zhuo (June) Cheng             [Zhuo (June) Cheng, Zhuo (June) Cheng]
1982             Zilong Liu                           [Zilong Liu, Zilong Liu]
1983   Justin Zuopeng Zhang  [Zuopeng Zhang, Justin Zuopeng Zhang, Justin Z...
1984                   NULL  [NULL, NULL, NULL, NULL, NULL, NULL, NULL, NUL...

[1985 rows x 2 columns]


In [69]:
import pandas as pd

# Function to get the shortest similar name
def standardize_name(name, matches):
    if name in matches and isinstance(matches[name], list):  # Ensure it's a list
        valid_names = [str(n) for n in matches[name] if pd.notna(n)]  # Convert to strings & remove NaNs
        if valid_names:
            return min(valid_names, key=len)  # Choose the shortest version
    return name  # Keep original if no match

df["Standardized_Name"] = df["Author_Name"].apply(lambda x: standardize_name(x, matches))
df.to_csv("cleaned_authors.csv", index=False)

print(df[["Author_Name", "Standardized_Name"]].head())


          Author_Name   Standardized_Name
0             A Burns             A Burns
1            A Rutges            A Rutges
2      A. Ãƒâ€“zaygen      A. Ãƒâ€“zaygen
3       A. Almansoori       A. Almansoori
4  LuÃƒÂ­s A. Almeida  LuÃƒÂ­s A. Almeida
