### <font color=black> Below Script is for group_by fuzzy match

#### <font color=orange> Sample Example
https://stackoverflow.com/questions/12307099/modifying-a-subset-of-rows-in-a-pandas-dataframe

In [39]:
import pandas as pd 
import numpy as np 
df = pd.DataFrame({"A":[0,1,0], "B":[2,0,5]}, columns=list('AB'))
df

Unnamed: 0,A,B
0,0,2
1,1,0
2,0,5


In [40]:
df.loc[df.A == 0, 'B'] = np.nan
df

Unnamed: 0,A,B
0,0,
1,1,0.0
2,0,


#### <font color=orange> Implemented Example

In [9]:
import pandas as pd
df_rawdata = pd.read_csv("S:\\SandeepG\\Official\\DataScience\\SimpliLearn\\testdata.csv",  sep=',', engine='python', header='infer')
df_rawdata

# fuzz is used to compare TWO strings
from fuzzywuzzy import fuzz

# process is used to compare a string to MULTIPLE other strings
from fuzzywuzzy import process

In [10]:
df_rawdata

Unnamed: 0,compId,compType
0,C1,3 comp Type
1,C2,COMP 3 TYpe
2,C3,comp 2 Type
3,C4,COMP TYpe 2
4,C5,4 COMP TYpe
5,C6,comp Type 4
6,C7,COMP tYpe 3
7,C8,CoMp Type 2
8,C9,comp Type 1
9,C10,comp TYpe 4


In [11]:
df_rawdata['compCat'] = df_rawdata['compType']

In [12]:
df_rawdata

Unnamed: 0,compId,compType,compCat
0,C1,3 comp Type,3 comp Type
1,C2,COMP 3 TYpe,COMP 3 TYpe
2,C3,comp 2 Type,comp 2 Type
3,C4,COMP TYpe 2,COMP TYpe 2
4,C5,4 COMP TYpe,4 COMP TYpe
5,C6,comp Type 4,comp Type 4
6,C7,COMP tYpe 3,COMP tYpe 3
7,C8,CoMp Type 2,CoMp Type 2
8,C9,comp Type 1,comp Type 1
9,C10,comp TYpe 4,comp TYpe 4


In [13]:
unique_compType = df_rawdata['compType'].unique()

In [14]:
unique_compType.shape

(17,)

In [15]:
unique_compType

array(['3 comp Type', 'COMP 3 TYpe', 'comp 2 Type', 'COMP TYpe 2',
       '4 COMP TYpe', 'comp Type 4', 'COMP tYpe 3', 'CoMp Type 2',
       'comp Type 1', 'comp TYpe 4', 'COMP Type 1', '4 COMP Type',
       '4COMP Type ', '1 COMP TYpe', 'comp 4 Type ', 'comp TYpe 2',
       'CoMp Type 4'], dtype=object)

In [48]:
comptype3_sort_variants = process.extract("3 comp Type", unique_compType, limit=100, scorer=fuzz.token_sort_ratio)
comptype3_set_variants = process.extract("3 comp Type", unique_compType, limit=100, scorer=fuzz.token_set_ratio)

print('\ncomptype3_sort_variants', comptype3_sort_variants)
print('\ncomptype3_set_variants', comptype3_set_variants)

matchthreshold = 100
comptype3_sort_var_selected = [comptype3_sort_variant for comptype3_sort_variant in comptype3_sort_variants if comptype3_sort_variant[1] == matchthreshold]
comptype3_set_var_selected = [comptype3_set_variant for comptype3_set_variant in comptype3_set_variants if comptype3_set_variant[1] == matchthreshold]

print('\ncomptype3_sort_var_selected', comptype3_sort_var_selected)
print('\ncomptype3_set_var_selected', comptype3_set_var_selected)

comptype3_var_selected = comptype3_sort_var_selected + comptype3_set_var_selected

print('\ncomptype3_var_selected', comptype3_var_selected)

print('\ncomptype3_var_selected[0][0]', comptype3_var_selected[0][0])

print('\ncomptype3_var_selected', set(comptype3_var_selected))


comptype3_sort_variants [('3 comp Type', 100), ('COMP 3 TYpe', 100), ('COMP tYpe 3', 100), ('comp 2 Type', 91), ('COMP TYpe 2', 91), ('4 COMP TYpe', 91), ('comp Type 4', 91), ('CoMp Type 2', 91), ('comp Type 1', 91), ('comp TYpe 4', 91), ('COMP Type 1', 91), ('4 COMP Type', 91), ('1 COMP TYpe', 91), ('comp 4 Type ', 91), ('comp TYpe 2', 91), ('CoMp Type 4', 91), ('4COMP Type ', 86)]

comptype3_set_variants [('3 comp Type', 100), ('COMP 3 TYpe', 100), ('COMP tYpe 3', 100), ('comp 2 Type', 91), ('COMP TYpe 2', 91), ('4 COMP TYpe', 91), ('comp Type 4', 91), ('CoMp Type 2', 91), ('comp Type 1', 91), ('comp TYpe 4', 91), ('COMP Type 1', 91), ('4 COMP Type', 91), ('1 COMP TYpe', 91), ('comp 4 Type ', 91), ('comp TYpe 2', 91), ('CoMp Type 4', 91), ('4COMP Type ', 86)]

comptype3_sort_var_selected [('3 comp Type', 100), ('COMP 3 TYpe', 100), ('COMP tYpe 3', 100)]

comptype3_set_var_selected [('3 comp Type', 100), ('COMP 3 TYpe', 100), ('COMP tYpe 3', 100)]

comptype3_var_selected [('3 comp Ty

In [34]:
comptype3_catName = comptype3_var_selected[0][0]

In [35]:
comptype3_catName

'3 comp Type'

In [38]:
comptype3_var_selected

[('3 comp Type', 100),
 ('COMP 3 TYpe', 100),
 ('COMP tYpe 3', 100),
 ('3 comp Type', 100),
 ('COMP 3 TYpe', 100),
 ('COMP tYpe 3', 100)]

In [63]:
matchthreshold = 100
for comptype in unique_compType:
    
    print('\n\n--------------')
    print('comptype : ', comptype)
    
    # fuzzy sort search
    comptype_sort_variants = process.extract(comptype, unique_compType, limit=5, scorer=fuzz.token_sort_ratio)
    # fuzzy set search
    comptype_set_variants = process.extract(comptype, unique_compType, limit=5, scorer=fuzz.token_set_ratio)

    # filter out fuzzy search below threshold value
    comptype_sort_var_selected = [comptype_sort_variant for comptype_sort_variant in comptype_sort_variants if comptype_sort_variant[1] == matchthreshold]
    comptype_set_var_selected = [comptype_set_variant for comptype_set_variant in comptype_set_variants if comptype_set_variant[1] == matchthreshold]

    # combine the filtered comptype
    comptype_var_selected = comptype_sort_var_selected + comptype_set_var_selected

    # use the first value to assign category
    comptype_catName = comptype_var_selected[0][0]
    
    for comptype_var in comptype_var_selected:
       
        #print('df_rawdata.compCat : ' , df_rawdata.compCat)
        print('comptype_var : ', comptype_var[0])
        print('comptype_catName : ', comptype_catName)
        #df.loc[selection criteria, columns I want] = value
        df_rawdata.loc[df_rawdata.compCat == comptype_var[0], 'compCat'] = comptype_catName
    



--------------
comptype :  3 comp Type
comptype_var :  3 comp Type
comptype_catName :  3 comp Type
comptype_var :  COMP 3 TYpe
comptype_catName :  3 comp Type
comptype_var :  COMP tYpe 3
comptype_catName :  3 comp Type
comptype_var :  3 comp Type
comptype_catName :  3 comp Type
comptype_var :  COMP 3 TYpe
comptype_catName :  3 comp Type
comptype_var :  COMP tYpe 3
comptype_catName :  3 comp Type


--------------
comptype :  COMP 3 TYpe
comptype_var :  3 comp Type
comptype_catName :  3 comp Type
comptype_var :  COMP 3 TYpe
comptype_catName :  3 comp Type
comptype_var :  COMP tYpe 3
comptype_catName :  3 comp Type
comptype_var :  3 comp Type
comptype_catName :  3 comp Type
comptype_var :  COMP 3 TYpe
comptype_catName :  3 comp Type
comptype_var :  COMP tYpe 3
comptype_catName :  3 comp Type


--------------
comptype :  comp 2 Type
comptype_var :  comp 2 Type
comptype_catName :  comp 2 Type
comptype_var :  COMP TYpe 2
comptype_catName :  comp 2 Type
comptype_var :  CoMp Type 2
comptype_

In [67]:
df_rawdata.compType.unique()

array(['3 comp Type', 'COMP 3 TYpe', 'comp 2 Type', 'COMP TYpe 2',
       '4 COMP TYpe', 'comp Type 4', 'COMP tYpe 3', 'CoMp Type 2',
       'comp Type 1', 'comp TYpe 4', 'COMP Type 1', '4 COMP Type',
       '4COMP Type ', '1 COMP TYpe', 'comp 4 Type ', 'comp TYpe 2',
       'CoMp Type 4'], dtype=object)

In [65]:
df_rawdata.compCat.unique()

array(['3 comp Type', 'comp 2 Type', '4 COMP TYpe', 'comp Type 1',
       '4COMP Type ', 'CoMp Type 4'], dtype=object)

In [68]:
df_rawdata

Unnamed: 0,compId,compType,compCat
0,C1,3 comp Type,3 comp Type
1,C2,COMP 3 TYpe,3 comp Type
2,C3,comp 2 Type,comp 2 Type
3,C4,COMP TYpe 2,comp 2 Type
4,C5,4 COMP TYpe,4 COMP TYpe
5,C6,comp Type 4,4 COMP TYpe
6,C7,COMP tYpe 3,3 comp Type
7,C8,CoMp Type 2,comp 2 Type
8,C9,comp Type 1,comp Type 1
9,C10,comp TYpe 4,4 COMP TYpe
