In [20]:
import pandas as pd
from rapidfuzz import process, fuzz

# Sample data loading step
data = pd.read_csv('2008_2022_한국현대문학_인문_복합학_추가데이터_참고문헌_박완서.xlsx - Sheet1.csv')

nan_count = data['title'].isnull().sum()
print(f"Number of NaN values in the 'title' column: {nan_count}")

# Drop rows where 'title' or 'author' is NaN
data = data.dropna(subset=['title', 'author'])

Number of NaN values in the 'title' column: 7


In [21]:
data.groupby('author').size().sort_values(ascending=False).head(10)

author
박완서    863
김윤식    120
이선미    110
김은하     89
김양선     80
권명아     72
김은정     56
최인욱     55
김미영     51
신수정     51
dtype: int64

In [22]:
# Group data by author
for author, group in data.groupby('author'):
    titles = group['title'].unique()

    print(titles)

['1.20 학병사기, 1권']
['5․18 민중항쟁과 정치․역사․사회 2']
['Necropolitics']
['통과의례']
['감각의 박물학']
['호모 사케르']
['호모 사케르-주권 권력과 벌거벗은 생명' '예외상태']
['Le "Cycle du barrage" dans l\'oeuvre de Marguerite Duras']
['Willful subject']
['말, 살, 흙']
['박정희 정권과 농민의 연계성']
['The Cultural Politics of Emotion']
['Desire and Domestic Fiction: A Political History of the Novel']
['Untranslatability: The Case of Pronouns of Address in Literature']
['상상의 공동체']
['최근 여성학의 쟁점과여성주의윤리학']
['Modernity and Self-Identity : Self and Identity in the late Modern Age']
['Gender Roles and Wpmen’s Sleep in Mid and Later Life: a Quantitative Approach']
['To Save the Children of Korea : The Cold War Origins of International Adoption']
["l'autobiographie"]
['諷刺(Satire)']
['Touching ; The Human Significance of the skin']
['기억의 공간']
['변신']
['에티카']
['에티카']
['Idiomatisme et traduction']
['Le nom propre en traduction anglais ↔ francais, Gap']
['Introduction to Semantics and Translation, Horsleys Green, England']
['La traduction et la lettre ou l\'a

In [23]:
# Create a dictionary to hold the best match for each title based on the author
title_mapping = {}

# Group data by author
for author, group in data.groupby('author'):
    titles = group['title'].unique()  # unique titles for this author

    for title in titles:
        # Generate a list of other titles by the same author to compare against
        other_titles = [t for t in titles if t != title]

        if other_titles:
            # Find the most similar title to the current one (excluding the same title)
            best_match, score, _ = process.extractOne(title, other_titles, scorer=fuzz.token_sort_ratio)
            
            # Apply a threshold to consider as a match
            if score > 85:  # threshold of 80 out of 100
                title_mapping[(author, title)] = best_match
            else:
                title_mapping[(author, title)] = title  # no similar title found, map to itself
        else:
            # If no other titles are available, map the title to itself
            title_mapping[(author, title)] = title
            
# Apply the mapping to the 'title' column
data['unified_title'] = data.apply(lambda row: title_mapping[(row['author'], row['title'])], axis=1)



In [24]:
# 'unified_title' 열에서 각 제목의 빈도 계산
title_counts = data['unified_title'].value_counts()

# 빈도가 2 이상인 제목만 필터링
repeated_titles = title_counts[title_counts > 1].index.tolist()

# 'title' 열이 repeated_titles 리스트에 해당하는 경우, 'unified_title' 열을 'title' 열의 값으로 설정
data['unified_title'] = data.apply(lambda row: row['title'] if row['title'] in repeated_titles else row['unified_title'], axis=1)

In [25]:
# Save the results to a new CSV file
data.to_csv('p2_solved_thres85_token_sort_ratio.csv', encoding='utf-8-sig', index=False)

In [27]:
data['unified_title'].value_counts().head(50)

unified_title
한국 현대 노년소설 연구                                                                                                34
그 많던 싱아는 누가 다 먹었을까                                                                                           31
박완서 문학 길찾기                                                                                                   29
소설, 노년을 말하다                                                                                                  24
그 산이 정말 거기 있었을까                                                                                              22
나목                                                                                                           21
부끄러움을 가르칩니다                                                                                                  20
엄마의 말뚝 2                                                                                                     20
한국문학에 나타난 노인의식                                                                            

In [13]:
data[data['unified_title'] == '1990년대 페미니즘의 대중화, 그 직전의 풍경']
data[data['unified_title'] == '1990년대 페미니즘의 대중화, 그 직전의 풍경'].to_csv('unified_a_book.csv', encoding='utf-8-sig', index=False)