In [178]:
import pandas as pd
import numpy as np 
import preprocess
import ast
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import random
from sklearn.metrics import precision_score, accuracy_score, f1_score

In [4]:
# Tag inventory taken from the `preprocess` module; the next cell's output shows it is a list of tag names.
entity_types=preprocess.literal2tag

In [6]:
# Display the available entity tags.
entity_types

['age',
 'birth_date',
 'civil_status',
 'education_level',
 'employer',
 'firstname',
 'link',
 'lob',
 'maiden_name',
 'nationality',
 'observation',
 'occupation',
 'surname',
 'surname_household']

In [164]:
# Load the annotated records; the `text` and `labels` columns hold stringified
# Python lists (they are parsed with `ast.literal_eval` further down).
df=pd.read_csv('data.csv')
# Fix the seed so the 80/20 split -- and every metric computed from it -- is
# reproducible under Restart Kernel -> Run All.
train,test=train_test_split(df,test_size=0.2,random_state=42)

In [151]:
# Preview the training split.
train

Unnamed: 0,text,labels
13820,"['Badet', 'Eugène', 'leur fils', '6']","['surname', 'firstname', 'link', 'age']"
15666,"['Lacroix', 'Marie', 'couturière', 'chef']","['surname', 'firstname', 'occupation', 'link']"
9352,"['Quinsau', 'Marie', 'femme']","['surname', 'firstname', 'link']"
20240,"['Michot', 'Jean', 'propriétaire', '71']","['surname', 'firstname', 'occupation', 'age']"
12967,"['Jacquinot', 'Marcel', 'enfant', '2']","['surname', 'firstname', 'link', 'age']"
...,...,...
20975,"['Coz', 'Yves', 'tysserand', '52', 'Homme marié']","['surname_household', 'firstname', 'occupation..."
4914,"['Pensivy', 'Jean', 'cultiv', 'domestique', 'D...","['surname', 'firstname', 'occupation', 'link',..."
22497,"['St Ropts', 'Michel', 'tuilier', '36']","['surname', 'firstname', 'occupation', 'age']"
22766,"['Peltreau', 'Marie', 'aubergiste', 'femme', '...","['surname', 'firstname', 'occupation', 'link',..."


In [165]:
def get_token_values(df):
    """Collect, for every label, the set of distinct token strings tagged with it.

    The ``labels`` and ``text`` cells of each row are parsed with
    ``ast.literal_eval`` and paired by position.

    Returns a dict mapping each label to the set of token values seen with it.
    """
    values_by_label = {}

    for _, record in df.iterrows():
        row_labels = ast.literal_eval(record['labels'])
        row_tokens = ast.literal_eval(record['text'])

        # Walk the labels and look up the token at the same position;
        # tokens beyond the last label are never visited.
        for position, label in enumerate(row_labels):
            values_by_label.setdefault(label, set()).add(row_tokens[position])

    return values_by_label

In [166]:
# Lexicon of token values per label, built from the training split only;
# `naive_ner_baseline` reads it as a global.
entities_by_type=get_token_values(train)

In [175]:
def naive_ner_baseline(text, entities=None):
    """Tag a tokenised record with hand-written rules (first matching rule wins).

    Rules, tried in this order for every token:

    1. decimal number strictly between 0 and 120      -> ``age``
    2. decimal number strictly between 1725 and 2000  -> ``birth_date``
    3. token listed under ``civil_status``            -> ``civil_status``
    4. token starting with an upper-case letter: the first such token is a
       surname (``surname`` or ``surname_household``, chosen at random), the
       second a ``firstname``, every later one an ``employer``
    5. token listed under ``nationality``             -> ``nationality``
    6. token listed under ``occupation``              -> ``occupation``
    7. any space-separated word is a kinship keyword  -> ``link``

    Tokens matching no rule keep the empty tag ``''``.

    Parameters
    ----------
    text : list of str
        Tokens of one record.
    entities : mapping of label -> collection of str, optional
        Lexicon used by rules 3, 5 and 6. Defaults to the module-level
        ``entities_by_type``. Labels missing from the lexicon are treated as
        empty instead of raising ``KeyError``.

    Returns
    -------
    str
        ``str()`` of the list of predicted tags, one per input token.
    """
    lexicon = entities_by_type if entities is None else entities
    civil_statuses = lexicon.get('civil_status', ())
    nationalities = lexicon.get('nationality', ())
    occupations = lexicon.get('occupation', ())
    link_keywords = {'fils', 'fille', 'enfant', 'ménage', 'de', 'époux', 'épouse'}

    predicted_tags = []
    capital_letters = 0  # number of capitalised tokens consumed so far (capped at 2)
    for token in text:
        # `isdecimal` only accepts characters that `int()` can parse, whereas
        # `isdigit` also accepts e.g. superscripts, which make `int()` raise.
        number = int(token) if token.isdecimal() else None

        tag = ''
        if number is not None and 0 < number < 120:
            tag = 'age'
        elif number is not None and 1725 < number < 2000:
            tag = 'birth_date'
        elif token in civil_statuses:
            tag = 'civil_status'
        elif token[:1].isupper():  # slicing keeps empty tokens from raising
            if capital_letters == 0:
                tag = random.choice(['surname', 'surname_household'])
                capital_letters = 1
            elif capital_letters == 1:
                tag = 'firstname'
                capital_letters = 2
            else:
                tag = 'employer'
        elif token in nationalities:
            tag = 'nationality'
        elif token in occupations:
            tag = 'occupation'
        elif any(word in link_keywords for word in token.split(' ')):
            tag = 'link'
        predicted_tags.append(tag)

    return str(predicted_tags)
        

In [176]:
# Seed the RNG behind the baseline's surname / surname_household coin flip so
# that re-running this cell yields the same predictions (and the same metrics).
random.seed(42)
# Build the whole column in one go instead of writing cell-by-cell through
# `.loc`; `assign` returns a new frame, so the split is not mutated in place.
test=test.assign(
    predicted_tags=[naive_ner_baseline(ast.literal_eval(text)) for text in tqdm(test['text'])]
)

100%|██████████| 4611/4611 [00:00<00:00, 7269.22it/s]


In [177]:
# Preview the test split with the baseline's predictions next to the gold labels.
test

Unnamed: 0,text,labels,predicted_tags
297,"['Le Meur', 'Marie', 'couturière', 'seule', '27']","['surname_household', 'firstname', 'occupation...","['surname_household', 'firstname', 'occupation..."
2618,"['Robichon', 'Marguerite', 'employée', 'femme'...","['surname', 'firstname', 'occupation', 'link',...","['surname_household', 'firstname', 'occupation..."
3714,"['Goupil', 'Almire', 'chauffeur livreur', 'che...","['surname_household', 'firstname', 'occupation...","['surname', 'firstname', '', 'occupation', '',..."
2540,"['Armandin', 'Marie', 'cuisinière', 'domestiqu...","['surname', 'firstname', 'occupation', 'link',...","['surname', 'firstname', 'occupation', 'occupa..."
20338,"['Paquereau', 'Henri', 'domestique', 'chef', '...","['surname', 'firstname', 'occupation', 'link',...","['surname_household', 'firstname', 'occupation..."
...,...,...,...
11700,"['Niel', 'André', 's.p', 'fils 2ème lit', '192...","['surname', 'firstname', 'occupation', 'link',...","['surname', 'firstname', 'occupation', 'link',..."
15296,"['Dufieux', 'Marcel', 'Cultivateur', 'Chef', '...","['surname_household', 'firstname', 'occupation...","['surname_household', 'firstname', 'employer',..."
3543,"['Gallet', 'Amélie', 's.p', 'épouse', '36']","['surname', 'firstname', 'occupation', 'link',...","['surname', 'firstname', 'occupation', 'occupa..."
11624,"['Bulon', 'Marie', 'épouse', '41']","['surname', 'firstname', 'link', 'age']","['surname', 'firstname', 'occupation', 'age']"


In [200]:
# Parse the stringified tag lists back into Python lists, one list per test record.
predictions=list(map(ast.literal_eval, test['predicted_tags']))
true_values=list(map(ast.literal_eval, test['labels']))

In [236]:
def calculate_accuracy(predictions,true_values):
    """Return token-level accuracy over a collection of tagged sequences.

    Parameters
    ----------
    predictions : list of list of str
        Predicted tags, one inner list per record.
    true_values : list of list of str
        Gold tags, aligned with ``predictions`` record by record and token
        by token.

    Returns
    -------
    float
        Number of predicted tags equal to the gold tag at the same position,
        divided by the total number of predicted tags. A predicted tag with
        no gold counterpart counts as wrong. Returns ``0.0`` when there is
        nothing to score, instead of dividing by zero.
    """
    total = sum(len(sequence) for sequence in predictions)
    if total == 0:
        return 0.0
    # `zip` stops at the shorter side, so unmatched predictions stay in
    # `total` but never reach `correct`.
    correct = sum(
        predicted == gold
        for predicted_sequence, gold_sequence in zip(predictions, true_values)
        for predicted, gold in zip(predicted_sequence, gold_sequence)
    )
    return correct / total
            

In [239]:
from sklearn.metrics import precision_score, f1_score

def calculate_precision(predictions, true_values):
    """Return the support-weighted token-level precision.

    Parameters
    ----------
    predictions : list of list of str
        Predicted tags, one inner list per record.
    true_values : list of list of str
        Gold tags, aligned with ``predictions``.

    Returns
    -------
    float
        ``precision_score(..., average='weighted')`` over the flattened tags.
    """
    # Flatten the predictions and true values
    flat_predictions = [item for sublist in predictions for item in sublist]
    flat_true_values = [item for sublist in true_values for item in sublist]
    # Precision is undefined for a class that is never predicted. Score such
    # classes as 0 explicitly (the value scikit-learn falls back to anyway)
    # so the choice is visible here and no warning is emitted.
    precision = precision_score(
        flat_true_values, flat_predictions, average='weighted', zero_division=0
    )
    return precision

def calculate_f1_score(predictions, true_values):
    """Return the support-weighted token-level F1 score.

    Both arguments are lists of tag sequences; they are flattened into two
    flat tag lists before being handed to ``f1_score``.
    """
    predicted_tags = []
    for sequence in predictions:
        predicted_tags.extend(sequence)

    gold_tags = []
    for sequence in true_values:
        gold_tags.extend(sequence)

    return f1_score(gold_tags, predicted_tags, average='weighted')


In [238]:
# Token-level accuracy of the rule-based baseline on the test split.
calculate_accuracy(predictions,true_values)

0.7179417493318838

In [240]:
# Weighted precision of the rule-based baseline on the test split.
calculate_precision(predictions,true_values)

  _warn_prf(average, modifier, msg_start, len(result))


0.8271476211472977

In [241]:
# Weighted F1 score of the rule-based baseline on the test split.
calculate_f1_score(predictions,true_values)

0.7109272450702819