# RL FUNCTIONS VINGETTE

### Authors: Karam Mukhtar, Pranay Lolabattu

#### Part 1A: Modified Levenstein distance measurement.

In [2]:
%%time
import stringdist
import numpy as np
import threading
from fuzzywuzzy import fuzz

def levenstein(seq1, seq2):
    distances = []
    for i in range(len(seq1)):
        a = seq1[i]
        b = seq2[i]
        max_len = max(len(a), len(b))
        distances.append((max_len - stringdist.levenshtein(a, b)) / max_len)
    return distances

def lev_single(a, b):
    max_len = max(len(a), len(b))
    return (max_len - stringdist.levenshtein(a, b)) / max_len 
#test:
s_1 = ['antelope', 'koor', 'croatia']
s_2 = ['jakarta', 'roo', 'lithuania']
levenstein(s_1, s_2)

Wall time: 333 ms




#### Part 1B: Find The Indices of missing values in the dataframe

In [2]:
%%time
import pandas as pd

#create pandas dataframes with our data
S1 = pd.read_csv('S1.csv') 
S2 = pd.read_csv('S2.csv')

#Takes in a pandas dataframe and returns a dictionary mapping column names to the missing value indices for that column
def missing_value_indices(df): #df is a pandas dataframe with columns such as first name, last name, DOB, sex, etc.
    index_dict = {}
    columns = df.columns #names of each of the columns in the dataframe i.e. 'sex'
    for i in columns:
        index_dict[i] = list(df[i][df[i].isna()].index)
    return index_dict

Wall time: 577 ms


#### Part 1C: Find the Indices of all unique values in the Dataframe

In [3]:
%%time
# A dataframe has to be made before passing it into the function. simply name the dataframe and then insert that name into the paranthesis of the function unique.
def unique(dataframe):
    df_dict = {}
    for column in dataframe.columns:
        df_dict[column] = {}
        for i in np.arange(len(dataframe[column])):
            value = dataframe[column].get(i)
            if value in df_dict[column]:
                df_dict[column][value] += [i]
            else:
                df_dict[column][value] = []
                df_dict[column][value] += [i]
    return df_dict
print(unique(S1))

{'sex': {0: [0, 3, 4, 5, 8, 9, 12, 15, 16, 19, 21, 23, 25, 26, 29, 30, 32, 35, 39, 41, 42, 43, 44, 46, 51, 52, 53, 55, 57, 58, 62, 65, 66, 67, 68, 70, 71, 73, 74, 76, 77, 79, 81, 82, 83, 84, 85, 86, 89, 91, 92, 93, 97, 99, 101, 102, 103, 104, 105, 106, 109, 110, 111, 116, 117, 119, 120, 121, 123, 125, 126, 128, 131, 132, 134, 136, 137, 138, 139, 143, 144, 148, 151, 152, 153, 154, 155, 156, 158, 159, 161, 162, 163, 164, 166, 168, 170, 173, 176, 177, 179, 186, 188, 191, 192, 193, 194, 195, 196, 199, 203, 204, 206, 208, 209, 210, 219, 221, 222, 224, 226, 228, 233, 234, 235, 237, 240, 242, 246, 248, 252, 253, 254, 255, 256, 257, 259, 263, 273, 281, 285, 286, 288, 290, 292, 299, 300, 301, 302, 303, 304, 310, 312, 313, 316, 317, 321, 322, 323, 324, 325, 326, 327, 330, 331, 334, 335, 337, 338, 341, 342, 344, 345, 346, 349, 353, 356, 357, 358, 365, 367, 368, 369, 370, 375, 376, 377, 379, 382, 383, 387, 388, 390, 392, 394, 395, 396, 397, 399, 400, 402, 403, 405, 406, 411, 412, 414, 418, 419, 42

#### Part 1D: pylib2 and FClib2

In [4]:
%%time
pylib2 = pd.read_csv('pylib2.csv')
pylib_dict = {}
for i in np.arange(pylib2.shape[0]):
    if ',' in pylib2['values'][i]:
        pylib_dict[pylib2['ind'][i]] = '[' + pylib2['values'][i] + ']'
    else:
        pylib_dict[pylib2['ind'][i]] = pylib2['values'][i]

def str_to_py(input):
    a = list(input)
    result_list = [pylib_dict[i] if i in pylib_dict else i for i in a]
    return ''.join(result_list)

def list_to_py(input):
    return [str_to_py(i) for i in input]

Wall time: 916 ms


In [5]:
%%time
FClib2 = pd.read_csv('FClib2.csv')
FC_dict = {}
for i in np.arange(FClib2.shape[0]):
    FC_dict[FClib2['ind'][i]] = FClib2['values'][i]

def str_to_FC(input):
    a = list(input)
    result_list = [FC_dict[i] if i in FC_dict else i for i in a]
    return ''.join(result_list)

def list_to_FC(input):
    return [str_to_FC(i) for i in input]

Wall time: 2.62 s


#### Part 1E: A Function that removes all the elements inside the inputted variable lst (which is of list type) from the inputted string variable.

In [6]:
%%time
def remove_any_from_str(string, lst):
    for i in string:
        if i in lst:
            string = string.replace(i, '')
    return string

Wall time: 0 ns


In [32]:
remove_any_from_str("alphabetic", ['a', 'b', 'c'])

'lpheti'

 #### Part 1F: building the sparse martrix.

In [7]:
%%time
def build_sparse_matrix(list1, list2):
    df = pd.DataFrame()
    df['index'] = list1
    for i in list2:
        df[i] = [1 if i == j else 0 for j in df['index'].tolist()]
    df.set_index('index', inplace=True)
    return df

Wall time: 0 ns


In [8]:
%%time
def compare_unique(list1, list2, sim_func):
    sim_dict = {}
    for i in list1:
        for j in list2:
            comparison = sim_func(i, j)
            if comparison in sim_dict:
                sim_dict[comparison].add((i, j))
            else:
                sim_dict[comparison] = set()
                sim_dict[comparison].add((i, j))
    return sim_dict

Wall time: 0 ns


#### Part 2A: Building the sparse matrix for each column in the dataframe:
* This function takes in 5 variables 2 lists of unique values corrisponding to the 2 dataframes S1 and S2. 
* sim variable which corrispondes to the method of measuring the similarity of the strings, all of the following examples employ the use of the fuzz.ratio() funtion from the fuzzywuzzy library which must be installed prior to importing.
* threshold represents the threshold value for the similarity calculation. In otherwords the degree of similarity above which all values will be considered matching.
* indices_1 and indices_2 represent the indices corresponding to the unique values inside all of the columns. 

In [9]:
def build_sparse_from_sim(unique_1, unique_2, sim, threshold, indices_1, indices_2):
    result = set()
    for i in unique_1:
        for j in unique_2:
            if sim(i, j) >= threshold:
                for a in indices_1[i]:
                    for b in indices_2[j]:
                        result.add((a, b))
    return result

#### Part 2B: Unique dictionary functions that return dictionary class items of the unique variables and their indices

In [10]:
%%time
unique_dict_1 = unique(S1)
unique_dict_2 = unique(S2)
print(type(unique_dict_1))

<class 'dict'>
Wall time: 2.84 s


In [11]:
%%time
a = np.unique(S1['lname.py'].tolist())
b = np.unique(S2['lname.py'].tolist())
c = fuzz.ratio
d = 85
e = unique_dict_1['lname.py']
f = unique_dict_2['lname.py']

['[bai4,bai5]' '[bao4,bao1]' '[bian5,bian1]' '[bie4,bie2]' '[bin1,bang1]'
 '[bo2,bo5]' '[bo5,bu3]' '[bu5,bu3]' '[chong2,zhong4]' '[chong4,chong1]'
 '[chuang3,chen4]' '[chuang4,cang1]' '[dang5,dang1]' '[de5,di4]'
 '[diao4,diao5]' '[ding5,ding1]' '[du2,dou4]' '[feng1,fan2]' '[fu2,fei4]'
 '[fu5,fu4]' '[gan4,gan1]' '[ge3,gai4]' '[guan4,guan1]' '[guang4,an1]'
 '[guo5,guo1]' '[he2,he4]' '[heng2,geng4]' '[heng4,heng2]' '[hong2,gong1]'
 '[hu5,hu4]' '[hua4,hua1]' '[huan4,huan1]' '[huan5,huan1]' '[ji3,ji1]'
 '[ji5,ji3]' '[jia3,gu3]' '[jian4,jian1]' '[jie5,ji4]' '[ju1,che1]'
 '[ju4,ju1]' '[jue2,jiao4]' '[kuai4,hui4]' '[lan5,lan2]' '[li3,feng1]'
 '[ling2,ling1]' '[ling5,ling2]' '[lu4,liu4]' '[lu5,lu2]' '[luo4,jia4]'
 '[luo5,luo1]' '[lv4,lu4]' '[mai5,mai4]' '[mang2,long2]' '[mao2,mao1]'
 '[mi3,er3]' '[mi4,jiao3]' '[mo2,me5]' '[mo4,mei2]' '[mou2,miao4]'
 '[nan5,nan2]' '[nei4,na4]' '[ni2,er2]' '[ning4,ning2]' '[pan5,pan2]'
 '[pei2,fei2]' '[ping2,feng2]' '[po1,bo1]' '[pu3,pu2]' '[qi2,ji4]'
 '[qian3,ji

In [12]:
%%time
last_names = build_sparse_from_sim(a, b, c, d, e, f)

Wall time: 14.3 s


In [13]:
%%time
fname_S1 = []
for i in S1['fname.py']:
    if isinstance(i,str):
        fname_S1.append(i)
fname_S2 = []
for i in S2['fname.py']:
    if isinstance(i,str):
        fname_S2.append(i)

while len(S1) > len(fname_S1):
    fname_S1.append('0')
while len(S2) > len(fname_S2):
    fname_S2.append('0')

fname_new1 = pd.DataFrame({'fname_fixed':fname_S1})
fname_new2 = pd.DataFrame({'fname_fixed':fname_S2})

S1 = S1.join(fname_new1)
S2 = S2.join(fname_new2)

Wall time: 14 ms


In [14]:
%%time
a = np.unique(fname_S1)
b = np.unique(fname_S2)
c = fuzz.ratio
d = 85
e = unique(fname_new1).get('fname_fixed')
f = unique(fname_new2).get('fname_fixed')

Wall time: 221 ms


In [15]:
%%time
first_name = build_sparse_from_sim(a, b, c, d, e, f)

Wall time: 17min 22s


In [16]:
%%time
def exact_match(a, b):
    if a == b:
        return 1
    return 0

a = np.unique(S1['sex'].tolist())
b = np.unique(S2['sex'].tolist())
c = exact_match
d = 1
e = unique_dict_1['sex']
f = unique_dict_2['sex']

Wall time: 4.99 ms


In [17]:
%%time
sex = build_sparse_from_sim(a, b, c, d, e, f)

<class 'set'>
Wall time: 13.6 s


In [18]:
%%time
a = np.unique(S1['yob'].tolist())
b = np.unique(S2['yob'].tolist())
c = exact_match
d = 1
e = unique_dict_1['yob']
f = unique_dict_2['yob']

yobs = build_sparse_from_sim(a, b, c, d, e, f)

Wall time: 627 ms


In [19]:
%%time
a = np.unique(S1['mob'].tolist())
b = np.unique(S2['mob'].tolist())
c = exact_match
d = 1
e = unique_dict_1['mob']
f = unique_dict_2['mob']

mobs = build_sparse_from_sim(a, b, c, d, e, f)

Wall time: 2.03 s


In [20]:
%%time
a = np.unique(S1['dob'].tolist())
b = np.unique(S2['dob'].tolist())
c = exact_match
d = 1
e = unique_dict_1['dob']
f = unique_dict_2['dob']

dobs = build_sparse_from_sim(a, b, c, d, e, f)

Wall time: 712 ms


**DEMO STARTS HERE**

In [21]:
%%time
ls = []
i = 0
j = 0
for i in range(0,len(S1)):
    for j in range(0,len(S2)):
        ls.append((i,j))

Wall time: 28 s


In [22]:
%%time
lsy = []
for i in ls:
    if i in yobs:
        lsy.append(1)
    else:
        lsy.append(0)

Wall time: 42.6 s


In [23]:
%%time
lsm = []
for i in ls:
    if i in mobs:
        lsm.append(1)
    else:
        lsm.append(0)

Wall time: 50.7 s


In [24]:
%%time
lsd = []
for i in ls:
    if i in dobs:
        lsd.append(1)
    else:
        lsd.append(0)

Wall time: 45.1 s


In [25]:
%%time
lsx = []
for i in ls:
    if i in sex:
        lsx.append(1)
    else:
        lsx.append(0)

Wall time: 2min 6s
Parser   : 2.01 s


In [26]:
%%time
lsn = []
for i in ls:
    if i in last_names:
        lsn.append(1)
    else:
        lsn.append(0)

Wall time: 49.1 s


In [27]:
%%time
lsf = []
for i in ls:
    if i in first_name:
        lsf.append(1)
    else:
        lsf.append(0)

Wall time: 40.3 s


## **END**

In [28]:
%%time
df_master = pd.DataFrame({"Indices":ls, "L.names":lsn, "F.names":lsf, "Sex":lsx, "Dob":lsd, "Mob":lsm, "Yob":lsy})

Wall time: 2min 46s


In [29]:
%%time
df_master.set_index('Indices', inplace=True)
df_master

Wall time: 1.56 s


In [30]:
%%time
dfm = df_master.values
list(df_master.columns.values)

Wall time: 997 µs


In [31]:
%%time
unique_rows, counts = np.unique(dfm, return_counts = True, axis=0)
unique_rows = unique_rows.view(dfm.dtype).reshape(-1, dfm.shape[1])

dfm2 = pd.DataFrame(unique_rows, columns = [df_master.columns.values])

counts = pd.DataFrame(counts, columns = ['counts'])
print(dfm2.join(counts))

    (L.names,)  (F.names,)  (Sex,)  (Dob,)  (Mob,)  (Yob,)    counts
0            0           0       0       0       0       0  26617977
1            0           0       0       0       0       1    779136
2            0           0       0       0       1       0   2413021
3            0           0       0       0       1       1     70875
4            0           0       0       1       0       0    884844
5            0           0       0       1       0       1     26091
6            0           0       0       1       1       0     80650
7            0           0       0       1       1       1      2261
8            0           0       1       0       0       0  26612839
9            0           0       1       0       0       1    779484
10           0           0       1       0       1       0   2417676
11           0           0       1       0       1       1     70407
12           0           0       1       1       0       0    884472
13           0           0       1

In [4]:
import multiprocessing
print("Processor Core Count: ", multiprocessing.cpu_count())

Processor Core Count:  8
