In [90]:
import pandas as pd
import numpy as np
import math
from fuzzywuzzy import fuzz
from difflib import SequenceMatcher
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [58]:
bess_tags = pd.read_csv("CBW_Bess_tags_final.csv")

### Persona Mapping

In [255]:
### Getting a list of unique persona
unique_personas = bess_tags.personaName.unique()
unique_personas = list(unique_personas.astype('str'))

#Arranging based on the length of the string
unique_personas.sort(key = lambda s: len(s))

unique_personas = [each for each in unique_personas if each not in ['nan']]

persona_mapping = pd.DataFrame(columns=['Persona_main','Alternate_name'])
persona_mapping

Unnamed: 0,Persona_main,Alternate_name


In [None]:
## Creating Persona Mapping

persona_mapping = pd.DataFrame(columns=['Persona_main','Alternate_name','Score'])

for P1 in unique_personas:
    for P2 in unique_personas:
        
        persona_temp = pd.DataFrame()
        
        if((pd.isnull(P1))  | (pd.isnull(P2))):
            continue
        
        ## Checking if the Persona has already been added to the mapping file
        if( any(persona_mapping.Persona_main.isin([P2])) or any(persona_mapping.Alternate_name.isin([P2]))):
            continue
        
        fuzz_sort_score = fuzz.token_sort_ratio(P1,P2)
        fuzz_score = fuzz.partial_ratio(P1,P2)
        
        if((P1 != P2) and (( fuzz_score> 70) or ( fuzz_sort_score > 70))):
            #print(P1," and ", P2, ": ",fuzz_score) 
            
            persona_temp.loc[0,'Persona_main'] = P1
            persona_temp.loc[0,'Alternate_name'] = P2
            persona_temp.loc[0,'Score'] = np.mean([fuzz_score,fuzz_sort_score])
            
        persona_mapping = pd.concat([persona_mapping,persona_temp],axis = 0)

In [None]:
persona_mapping.sort_values(['Score'])

#### Note: 
Saving these results to a csv file and removing all the rows which are not actually matches

In [239]:
persona_mapping.to_csv("Persona_mapping.csv",index = False)

## Authors Mapping

In [244]:
### Getting a list of unique persona
unique_authors = bess_tags.author.unique()
unique_authors = list(unique_authors.astype('str'))

#Arranging based on the length of the string
unique_authors.sort(key = lambda s: len(s))

unique_authors = [each for each in unique_authors if each not in [' ','nan']]

author_mapping = pd.DataFrame(columns=['Author_main','Alternate_name','Score'])
author_mapping

Unnamed: 0,Author_main,Alternate_name,Score


In [252]:
## Creating Persona Mapping
for P1 in unique_authors:
    for P2 in unique_authors:
        
        author_temp = pd.DataFrame()
        
        if((pd.isnull(P1))  | (pd.isnull(P2))):
            continue
        
        ## Checking if the Persona has already been added to the mapping file
        if( any(author_mapping.Author_main.isin([P2])) or any(author_mapping.Alternate_name.isin([P2]))):
            continue
        
        fuzz_score = fuzz.token_sort_ratio(P1,P2)
        
        if((P1 != P2) and ( fuzz_score> 60)):
            #print(P1," and ", P2, ": ",fuzz_score) 
            
            author_temp.loc[0,'Author_main'] = P1
            author_temp.loc[0,'Alternate_name'] = P2
            author_temp.loc[0,'Score'] = fuzz_score
            
        author_mapping = pd.concat([author_mapping,author_temp],axis = 0)

In [253]:
author_mapping.sort_values('Score')

Unnamed: 0,Author_main,Alternate_name,Score
0,Sabine Baring-Gould,George Barnett Smith,51.0
0,Willis John Abbot,William Henry Davenport Adams,52.0
0,Myrtle Reed,A. J. Green Armytage,55.0
0,William Bolitho,Willis John Abbot,62.0
0,Jennie Chappell,"Cochrane, Jeanie",67.0
0,Hamilton Wright Mabie,Hamilton Wright Mable and Kate Stephens,67.0
0,William Bolitho,William Horton Foster,67.0
0,Millicent Garrett Fawcett,Millicent Garrett Fawcett and Steuart Beatrice,70.0
0,Hamilton Wright Mabie,"Hamilton Wright Mabie, Kate Stephens, eds.",70.0
0,Edmund B. D'Auvergne,Edmund Basil Francid D'Auvergne,76.0


#### Note: 
Saving these results to a csv file and removing all the rows which are not actually matches

In [248]:
author_mapping.to_csv("Author_mapping.csv",index = False)