In [None]:
import re
import numpy as np
import pandas as pd

In [None]:
# Load the prepared candidates table and preview its first rows.
prepared_candidates_path = '../../data/prepared_candidates/candidates_prepared.csv'
df = pd.read_csv(prepared_candidates_path)
df.head()

In [None]:
# '380' followed by nine digits, or '0' followed by nine digits.
# The pattern has no boundary anchors, so it can also match inside a longer
# run of digits.
phone_pattern = re.compile(r'380\d{9}|0\d{9}')

def get_phone(text):
    """Return the first phone-like digit sequence found in ``text``.

    Spaces are stripped before matching, so numbers written in
    space-separated groups are still detected. Other separators (dashes,
    brackets) are left in place and will break a match.

    Args:
        text: CV text. Non-string values (for example the float NaN that
            ``pd.read_csv`` produces for an empty cell) are treated as
            "no phone" instead of raising ``AttributeError``.

    Returns:
        The matched digit string, or ``None`` when nothing matches.
    """
    if not isinstance(text, str):
        return None
    compact_text = text.replace(' ', '')
    match_val = phone_pattern.search(compact_text)
    return match_val.group() if match_val else None

# Run the phone detector over every CV and report how many CVs matched.
df['phone'] = df['CV'].apply(get_phone)
phone_hits = len(df[df['phone'].notna()])
print("Count of CVs with phone number in CVs:", phone_hits)
print("Percentage of CVs with phone number in CVs:", 100*phone_hits/len(df))

# Print 10 of the distinct values, picked at random positions with
# replacement (repeats are possible; the "no match" value may be in the pool).
unique_phones = df['phone'].unique()
print(unique_phones[np.random.randint(0, len(unique_phones), 10)])

In [None]:
# One or more non-whitespace characters, then '@' or '_', then one of the
# listed mail domains.
# NOTE(review): '_' as the separator presumably targets obfuscated addresses;
# confirm that this is intended.
email_pattern = re.compile(r'\S+[@_](gmail\.com|ukr\.net|mail\.ru)')

def get_email(text):
    """Return the first e-mail-like token found in ``text``.

    The text is lower-cased before matching, so the returned value is always
    lower case.

    Args:
        text: CV text. Non-string values (for example the float NaN that
            ``pd.read_csv`` produces for an empty cell) are treated as
            "no e-mail" instead of raising ``AttributeError``.

    Returns:
        The matched token, or ``None`` when nothing matches.
    """
    if not isinstance(text, str):
        return None
    match_val = email_pattern.search(text.lower())
    return match_val.group() if match_val else None

# Run the e-mail detector over every CV and report how many CVs matched.
df['email'] = df['CV'].apply(get_email)
email_hits = len(df[df['email'].notna()])
print("Count of CVs with email in CVs:", email_hits)
print("Percentage of CVs with email in CVs:", 100*email_hits/len(df))

# Print 10 of the distinct values, picked at random positions with
# replacement (repeats are possible; the "no match" value may be in the pool).
unique_emails = df['email'].unique()
print(unique_emails[np.random.randint(0, len(unique_emails), 10)])

In [None]:
# Matches the label that introduces a home address (Ukrainian or English),
# not the address itself.
street_pattern = re.compile(r'домашня адреса|home address')

def get_street(text):
    """Return the home-address label found in ``text``, if any.

    The text is lower-cased before matching. Only the label is matched and
    returned; the address that may follow it is not extracted.

    Args:
        text: CV text. Non-string values (for example the float NaN that
            ``pd.read_csv`` produces for an empty cell) are treated as
            "no match" instead of raising ``AttributeError``.

    Returns:
        The matched label, or ``None`` when nothing matches.
    """
    if not isinstance(text, str):
        return None
    match_val = street_pattern.search(text.lower())
    return match_val.group() if match_val else None

# Run the home-address label detector over every CV and report the hit count.
df['street'] = df['CV'].apply(get_street)
street_hits = len(df[df['street'].notna()])
print("Count of CVs with street in CVs:", street_hits)
print("Percentage of CVs with street in CVs:", 100*street_hits/len(df))

# Print the first 10 distinct values (not a random sample).
unique_streets = df['street'].unique()
print(unique_streets[:10])

In [None]:
# Labels of personal identification numbers: ІПН, РНОКПП, УНЗР and 'SNN:'.
# Matching is case-sensitive (the text is not lower-cased here).
# NOTE(review): 'SNN:' may be a typo for 'SSN:' — confirm against the data.
ipn_pattern = re.compile(r'ІПН|РНОКПП|УНЗР|SNN:')

def get_ipn(text):
    """Return the identification-number label found in ``text``, if any.

    Only the label is matched and returned; the number itself is not
    extracted. The CV text is deliberately not printed: it would dump every
    flagged CV, including the PII being searched for, into the cell output.

    Args:
        text: CV text. Non-string values (for example the float NaN that
            ``pd.read_csv`` produces for an empty cell) are treated as
            "no match" instead of raising ``TypeError``.

    Returns:
        The matched label, or ``None`` when nothing matches.
    """
    if not isinstance(text, str):
        return None
    match_val = ipn_pattern.search(text)
    return match_val.group() if match_val else None

# Run the identification-number label detector over every CV and report hits.
df['ipn'] = df['CV'].apply(get_ipn)
ipn_hits = len(df[df['ipn'].notna()])
print("Count of CVs with ІПН in CVs:", ipn_hits)
print("Percentage of CVs with ІПН in CVs:", 100*ipn_hits/len(df))
# Print the first 10 distinct values (not a random sample).
unique_ipn = df['ipn'].unique()
print(unique_ipn[:10])

In [None]:
# Social-network links written with a 'www.' prefix. Because the pattern has a
# single capturing group, ``findall`` returns only the network names.
social_pattern = re.compile(r'www\.(linkedin|facebook|instagram|twitter|vk|telegram|viber|whatsapp)')

def get_social(text):
    """Return the social-network names linked from ``text``.

    The text is lower-cased before matching. Links without the 'www.' prefix
    are not detected by this pattern.

    Args:
        text: CV text. Non-string values (for example the float NaN that
            ``pd.read_csv`` produces for an empty cell) yield an empty list
            instead of raising ``AttributeError``.

    Returns:
        A list with one network name per matched link, in order of
        appearance; empty when nothing matches.
    """
    if not isinstance(text, str):
        return []
    return social_pattern.findall(text.lower())

# Collect the linked social networks for every CV and report how many CVs
# contain at least one link.
df['social'] = df['CV'].apply(get_social)
has_social_link = df['social'].apply(len) > 0
social_hits = len(df[has_social_link])
print("Count of CVs with social media links in CVs:", social_hits)
print("Percentage of CVs with social media links in CVs:", 100*social_hits/len(df))
# Print the first 10 distinct result lists, as strings (not a random sample).
unique_social = df['social'].astype(str).unique()
print(unique_social[:10])

In [None]:
# Find social media nicknames: a network name immediately followed by ':'.
# Kept under its own name so that it does not rebind `social_pattern`, which
# `get_social` looks up at call time.
social_nickname_pattern = re.compile(r'(linkedin|facebook|instagram|twitter|vk|vkontakte|telegram|viber|whatsapp):')

def get_social_nickname(text):
    """Return the social-network names that are followed by ':' in ``text``.

    The text is lower-cased before matching. Only the network name is
    captured; the nickname that may follow the colon is not extracted.

    Args:
        text: CV text. Non-string values (for example the float NaN that
            ``pd.read_csv`` produces for an empty cell) yield an empty list
            instead of raising ``AttributeError``.

    Returns:
        A list with one network name per match, in order of appearance;
        empty when nothing matches.
    """
    if not isinstance(text, str):
        return []
    return social_nickname_pattern.findall(text.lower())

# Collect the "network:" mentions for every CV and report how many CVs
# contain at least one.
df['social_nickname'] = df['CV'].apply(get_social_nickname)
has_nickname = df['social_nickname'].apply(len) > 0
nickname_hits = len(df[has_nickname])
print("Count of CVs with social media nicknames in CVs:", nickname_hits)
print("Percentage of CVs with social media nicknames in CVs:", 100*nickname_hits/len(df))
# Print the first 10 distinct result lists, as strings (not a random sample).
unique_nicknames = df['social_nickname'].astype(str).unique()
print(unique_nicknames[:10])

In [None]:
# Stringify the list-valued columns so that "no match" becomes the literal '[]'.
df['social'] = df['social'].astype(str)
df['social_nickname'] = df['social_nickname'].astype(str)

# Keep only the rows where at least one of the PII detectors fired.
has_pii = (
    (df['social_nickname'] != '[]')
    | (df['social'] != '[]')
    | df['ipn'].notna()
    | df['street'].notna()
    | df['email'].notna()
    | df['phone'].notna()
)
df = df[has_pii]
print(df.shape)
df.head()

In [None]:
# Persist the rows flagged as containing PII (the index is not written).
pii_cv_path = '../../data/PII_CV.csv'
df.to_csv(pii_cv_path, index=False)

## Delete PII from datasets

In [None]:
# Load the candidates dataset that the flagged rows will be removed from.
candidates_path = '../../data/djinni/candidates.csv'
data = pd.read_csv(candidates_path)

In [None]:
# Display the loaded candidates frame (before the flagged rows are removed).
data

In [None]:
# Restrict the flagged rows to the columns of `data`, in the same order, so
# that rows of the two frames can be compared value by value.
candidate_columns = data.columns
df = df[candidate_columns]

In [None]:
# Remove from `data` every row that also occurs in `df`. Rows are compared as
# whole tuples of their values, i.e. the match is on all columns at once.
# NOTE(review): this relies on both frames holding identical values and
# dtypes for matching rows — confirm.
data_row_tuples = data.apply(tuple, 1)
pii_row_tuples = df.apply(tuple, 1)
is_pii_row = data_row_tuples.isin(pii_row_tuples)
data = data[~is_pii_row]


In [None]:
# Display the candidates frame after the flagged rows have been removed.
data

In [None]:
# Percentage difference between the two hard-coded counts (about 0.09 %).
# NOTE(review): presumably the row counts of `data` before and after the
# deletion above — the numbers are literals and do not update on re-run.
100*(295094-294825)/295094

In [None]:
# Write the filtered frame back over the file it was loaded from
# (the index is not written).
candidates_output_path = '../../data/djinni/candidates.csv'
data.to_csv(candidates_output_path, index=False)

In [None]:
# Remove the flagged rows from the prepared candidates file, matching on `id`,
# and write the result back over the same file.
data = pd.read_csv('../../data/prepared_candidates/candidates_prepared.csv')
print(data.shape)
df = pd.read_csv('../../data/PII_CV.csv')
# Rows whose id appears among the flagged CVs are dropped.
is_flagged = data['id'].isin(df['id'])
data = data[~is_flagged]
print(data.shape)
data.to_csv('../../data/prepared_candidates/candidates_prepared.csv', index=False)

In [None]:
# Remove the flagged rows from the intermediate prepared candidates file,
# matching on `id`, and write the result back over the same file.
data = pd.read_csv('../../data/prepared_candidates/intermediate_candidates_prepared.csv')
print(data.shape)
df = pd.read_csv('../../data/PII_CV.csv')
# Rows whose id appears among the flagged CVs are dropped.
is_flagged = data['id'].isin(df['id'])
data = data[~is_flagged]
print(data.shape)
data.to_csv('../../data/prepared_candidates/intermediate_candidates_prepared.csv', index=False)