## Install the libraries

In [None]:
# Manual snippet to upgrade any package IF the code fails to update by itself 
#!pip install --upgrade openpyxl

In [1]:
import sys
import subprocess

# List of libraries:
list_lib = ['googletrans==3.1.0a0','python-Levenshtein','fuzzywuzzy','gTTS','unidecode']
upd_lib = ['openpyxl']

for lib in upd_lib:
  subprocess.check_call([sys.executable, '-m', 'pip', 'install','--upgrade', lib])

# implement pip as a subprocess:
for lib in list_lib:
  subprocess.check_call([sys.executable, '-m', 'pip', 'install',lib])

# process output with an API in the subprocess module:
reqs = subprocess.check_output([sys.executable, '-m', 'pip',
'freeze'])

for lib in list_lib:
  installed_packages = [r.decode().split('==')[0] for r in reqs.split()]
  if lib in installed_packages:
    print(lib)

for lib in upd_lib:
  installed_packages = [r.decode().split('==')[0] for r in reqs.split()]
  if lib in installed_packages:
    print(lib)




python-Levenshtein
fuzzywuzzy
gTTS
openpyxl


## Import the libraries

In [2]:
#  ========================================= Basic Imports ====================================================================== 
import pandas as pd
import os
import io
import sys
from datetime import datetime, timedelta
import re
import glob
import numpy as np
import ssl
import warnings
from google.colab import drive
from google.colab import files
import chardet
import unicodedata
from unidecode import unidecode

warnings.filterwarnings("ignore")

#========================================= Processing Bar notification ====================================================
from googletrans  import Translator
translator = Translator(service_urls = ['translate.googleapis.com']) 

#========================================= Processing Bar notification ====================================================
from tqdm.notebook import tqdm
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

#========================================= Model Imports ==================================================================
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn import preprocessing
from sklearn.preprocessing import RobustScaler,MinMaxScaler,MaxAbsScaler

#========================================= For sound notification =========================================================
from gtts import gTTS

    

In [3]:
process_start_time =  datetime.now()
scaling = MaxAbsScaler
tfidf_transformer = TfidfTransformer()
count_vect = CountVectorizer()
SGD = SGDClassifier(loss='hinge', penalty='l2', max_iter=1000, learning_rate='constant',eta0 = 0.1)



### All definations

In [4]:
def reading_master_files():
  uploads = files.upload()
  tot_files = len(uploads.keys())

  if tot_files>1:
    frame=[]
    total = 0
    for fn in uploads.keys():
        file = pd.read_excel(io.BytesIO(uploads[fn]))
        rep_col = 'Predicted Names/ Model Cleansed'
        rep_col_dict = {'Predicted Names/ Model Cleansed':'New Supplier'} # new supplier as manually harmonized names/previously harmonized/model cleansed
        for col in file.columns:
            if col == rep_col:
                file.rename(columns = rep_col_dict, inplace = True)
        file = file[['DataSource','Vendor number','Vendor name', 'New Supplier']]
        dimen = file.shape
        nr = dimen[0]
        nc = dimen[1]
        total=total+nr
        file.reset_index(inplace=True,drop=True)
        frame.append(file)
    consolidated_df = pd.concat(frame,sort=False)
    consolidated_df.reset_index(inplace=True,drop=True)
    raw_df = consolidated_df.copy()

    for fn in uploads.keys():
      with open(fn, 'rb') as rawdata:
        result = chardet.detect(rawdata.read(18))
        print(f'Filename: {fn}\nInfo: {result} \n')

  else:
    filename = uploads.keys()
    filename = str(filename)
    filename = re.findall(r"\'.*?\'", filename)[0]
    filename = re.sub("'","",filename)
    with open(filename, 'rb') as rawdata:
      result = chardet.detect(rawdata.read(18))
      print(f'Filename: {filename}\nInfo: {result} \n')
    file = pd.read_excel(io.BytesIO(uploads[filename]))
    dimen = file.shape
    nr = dimen[0]
    nc = dimen[1]
    total=nr
    file = file[['DataSource','Vendor number','Vendor name', 'New Supplier']]
    raw_df = file.copy()

  return(raw_df,total,tot_files)


def vendor_harmonized_stop_word(name):
    s = " ".join([i for i in re.split(r' ', name) if not i in vendor_harmonized_stopwords]).strip()
    return s


def trim_spaces(name):
#    s = " ".join([i for i in re.split(r' ', name)]).strip()
#    return s
    s = " ".join(name.split()).strip()
    return s


def new_old_match_stopword(name):
    s = "".join([i for i in re.split(r' ', name) if not i in new_old_match_stopwords]).strip()
    return s


def Clean_Vendor_Name_stopword(name):
    s = " ".join([i for i in re.split(r' ', name) if not i in new_old_match_stopwords]).strip()
    return s

    

def rep(name): # same function name has been used twice recheck the dependency and change accordingly
    return(name.replace("`","",1))


def norm_vendorNames(vname): # same function name has been used twice recheck the dependency and change accordingly
  normalized = unicodedata.normalize('NFD', vname)
  norm_vname = u"".join([c for c in normalized if not unicodedata.combining(c)])
  return(norm_vname)


def decode_vnames(vname):
  return(unidecode(vname))


def data_clean(file):
    proc_df = file
    proc_df['Vendor name'] = proc_df['Vendor name'].apply(lambda x: str(x)) # added this line to remove suhong type of city names from vendor names. Do the same with other defs 
    proc_df= final_output_sup_name_Cleaning(proc_df,'Vendor name')  # added function to remove city names
    proc_df['Vendor name'] = proc_df['Vendor name'].apply(decode_vnames)
    proc_df['Clean_Vendor_Name'] = proc_df['Vendor name'].apply(lambda x: str(x))
    proc_df['Clean_Vendor_Name'] = proc_df['Clean_Vendor_Name'].apply(lambda x: x.replace('DO NOT USE', ''))
    proc_df['Clean_Vendor_Name'] = proc_df['Clean_Vendor_Name'].apply(decode_vnames)
    proc_df['Clean_Vendor_Name'] = proc_df['Clean_Vendor_Name'].apply(lambda x: re.sub("([^\x00-\x7F])+"," ",x))
    proc_df= final_output_sup_name_Cleaning(proc_df,'Clean_Vendor_Name') # added function to remove city names
    proc_df['Clean_Vendor_Name'] = proc_df['Clean_Vendor_Name'].apply(lambda x: re.sub('[#,.,/,]', ' ',x))
    proc_df['Clean_Vendor_Name'] = proc_df['Clean_Vendor_Name'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
    proc_df['Clean_Vendor_Name'] = proc_df['Clean_Vendor_Name'].apply(lambda x: x.lower())
    proc_df['Clean_Vendor_Name'] = proc_df['Clean_Vendor_Name'].apply(vendor_harmonized_stop_word)
    
    proc_df['Clean_Vendor_Name'] = proc_df['Clean_Vendor_Name'].apply(Clean_Vendor_Name_stopword) # added this new line
    proc_df['Clean_Vendor_Name'] = proc_df['Clean_Vendor_Name'].apply(trim_spaces) # added this new line

    proc_df['New Supplier'] = proc_df['New Supplier'].apply(decode_vnames)
    
    return proc_df


def data_clean_initials(file):
    proc_df = file
    proc_df['Vendor_name_Initials'] = proc_df['Clean_Vendor_Name'].apply(lambda x: str(x))
    #proc_df['Vendor_name_Initials'] = proc_df['Vendor_name_Initials'].apply(lambda x: re.sub("([^\x00-\x7F])+"," ",x))
    #print()
    proc_df['Vendor_name_Initials'] = proc_df['Vendor_name_Initials'].apply(lambda x: x[0])
    proc_df['Vendor_name_Initials'] = proc_df['Vendor_name_Initials'].apply(lambda x: re.sub("([^\x00-\x7F])+"," ",x))
    proc_df['Vendor_name_Initials'] = proc_df['Vendor_name_Initials'].apply(lambda x: re.sub('[@,?,#,(,),*,+,.,-,_,`,~]', ' ',x))
    #proc_df['Clean_Vendor_Name'] = proc_df['Clean_Vendor_Name'].apply(vendor_harmonized_stop_word)
    
    proc_df['New_Supplier'] = proc_df['New Supplier'].apply(lambda x: str(x))
    proc_df['New_Supplier'] = proc_df['New Supplier'].apply(lambda x: re.sub("([^\x00-\x7F])+"," ",x))
    proc_df['New_Supplier'] = proc_df['New_Supplier'].apply(lambda x: re.sub('[@?#()*_~]', '',x))
    proc_df['New_Supplier'] = proc_df['New_Supplier'].apply(lambda x: x.lower())
    proc_df['New_Supplier'] = proc_df['New_Supplier'].apply(lambda x: re.sub('"', '',x))
    proc_df['New_Supplier'] = proc_df['New_Supplier'].apply(lambda x: re.sub("'", '',x))
    #proc_df['New_Supplier'] = proc_df['New_Supplier'].apply(lambda x: re.sub(r'-', ' ', x))
    #proc_df['New_Supplier'] = proc_df['New_Supplier'].apply(lambda x: re.sub(r'[^\s]', '', x))
    proc_df['New_Supplier'] = proc_df['New_Supplier'].apply(vendor_harmonized_stop_word) 
    return proc_df



def input_df_data_clean(file):
    proc_df = file
    
    # Below steps are the cleaning for vendors:
    proc_df['Vendor name'] = proc_df['Vendor name'].apply(lambda x: str(x))  
    proc_df= final_output_sup_name_Cleaning(proc_df,'Vendor name')
    proc_df['Vendor name'] = proc_df['Vendor name'].apply(decode_vnames)
    proc_df['Clean_Vendor_Name'] = proc_df['Vendor name'].apply(lambda x: str(x))
    proc_df['Clean_Vendor_Name'] = proc_df['Clean_Vendor_Name'].apply(decode_vnames)
    proc_df['Clean_Vendor_Name'] = proc_df['Clean_Vendor_Name'].apply(lambda x: re.sub("([^\x00-\x7F])+"," ",x))
    proc_df['Clean_Vendor_Name'] = proc_df['Clean_Vendor_Name'].apply(lambda x: re.sub('[#,.,/,]', ' ',x))
    proc_df['Clean_Vendor_Name'] = proc_df['Clean_Vendor_Name'].apply(vendor_harmonized_stop_word)
    proc_df['Clean_Vendor_Name'] = proc_df['Clean_Vendor_Name'].apply(lambda x: re.sub("([^\x00-\x7F])+"," ",x))
    proc_df= final_output_sup_name_Cleaning(proc_df,'Clean_Vendor_Name')
    proc_df['Clean_Vendor_Name'] = proc_df['Clean_Vendor_Name'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
    proc_df['Clean_Vendor_Name'] = proc_df['Clean_Vendor_Name'].apply(lambda x: x.lower())
    proc_df['Clean_Vendor_Name'] = proc_df['Clean_Vendor_Name'].apply(Clean_Vendor_Name_stopword)# added this line extra
    proc_df['Clean_Vendor_Name'] = proc_df['Clean_Vendor_Name'].apply(trim_spaces)
    proc_df.replace("", np.nan, inplace=True) #try with commenting this line and below too to check the difference
    proc_df.dropna(subset = ["Clean_Vendor_Name"],inplace=True)
    return proc_df


def crisp_vendor_name(file):
    proc_df = file
    proc_df['crisp_vendor_name'] = proc_df['Clean_Vendor_Name'].apply(lambda x: str(x))
    #proc_df['crisp_vendor_name'] = proc_df['crisp_vendor_name'].apply(lambda x: re.sub("([^\x00-\x7F])+"," ",x))
    #proc_df['crisp_vendor_name'] = proc_df['crisp_vendor_name'].apply(lambda x: re.sub(' ', '',x))
    proc_df['crisp_vendor_name'] = proc_df['crisp_vendor_name'].apply(new_old_match_stopword)
    return proc_df


   
def input_df_data_ini(file):
    proc_df = file
    proc_df['Vendor_name_Initials'] = proc_df['Clean_Vendor_Name'].apply(lambda x: re.sub("([^\x00-\x7F])+"," ",x))
    proc_df['Vendor_name_Initials'] = proc_df['Vendor_name_Initials'].apply(lambda x: x[0])
    proc_df['Vendor_name_Initials'] = proc_df['Vendor_name_Initials'].apply(lambda x: re.sub("([^\x00-\x7F])+"," ",x))
    proc_df['Vendor_name_Initials'] = proc_df['Vendor_name_Initials'].apply(lambda x: re.sub('[@,?,#,(,),*,+,.,-,_,`,~]', ' ',x))
    #proc_df['Clean_Vendor_Name'] = proc_df['Clean_Vendor_Name'].apply(vendor_stop_word)
    return proc_df


def inp_CleaningChamber(file):
    proc_df = file
    proc_df['Vendor_Name_cleansed'] = proc_df['Vendor name'].apply(lambda x: str(x))
    proc_df['Vendor_Name_cleansed'] = proc_df['Vendor_Name_cleansed'].apply(lambda x: re.sub("([^\x00-\x7F])+"," ",x))
    proc_df= final_output_sup_name_Cleaning(proc_df,'Vendor_Name_cleansed') # Added function to remove city names
    proc_df['Vendor_Name_cleansed'] = proc_df['Vendor_Name_cleansed'].apply(lambda x: re.sub('[@?#()*_~/]', ' ',x))
    proc_df['Vendor_Name_cleansed'] = proc_df['Vendor_Name_cleansed'].apply(lambda x: x.lower())
    proc_df['Vendor_Name_cleansed'] = proc_df['Vendor_Name_cleansed'].apply(lambda x: re.sub('"', '',x))
    #proc_df['Vendor_Name_cleansed'] = proc_df['Vendor_Name_cleansed'].apply(lambda x: re.sub("'", '',x))
    #proc_df['Vendor_Name_cleansed'] = proc_df['Vendor_Name_cleansed'].apply(lambda x: re.sub(r'-', ' ', x))
    #proc_df['New_Supplier'] = proc_df['New_Supplier'].apply(lambda x: re.sub(r'[^\s]', '', x))
    proc_df['Vendor_Name_cleansed'] = proc_df['Vendor_Name_cleansed'].apply(vendor_harmonized_stop_word)
    proc_df['Vendor_Name_cleansed'] = proc_df['Vendor_Name_cleansed'].apply(trim_spaces)
    return proc_df


def bucket_df(df,initials_lists):
    bucket_df_list =[]
    grp_df = df.groupby(df.Vendor_name_Initials)
    for char in initials_lists:
        bucket_df_list.append(grp_df.get_group(char))
    return(bucket_df_list)


def bucket_df_dup(dframe,unique_counts):
    bucket_df_list =[]
    grp_df = dframe.groupby(dframe['#VN'])
    for char in unique_counts:
        bucket_df_list.append(grp_df.get_group(char))
    return(bucket_df_list)



def time_util(pred_time):
    # total_seconds = pred_time.seconds
    seconds = int(pred_time.total_seconds())
    secs_in_a_min = 60
    minutes, seconds = divmod(seconds, secs_in_a_min)

    time_fmt = f"{minutes:02d} {seconds:02d} secs"

    if minutes > 0:
        suffix = 's' if minutes > 1 else ""
        return f'{minutes} min{suffix} & {seconds} secs'

    return time_fmt


def sgd_clf(i):
    Tot_St_Time = datetime.now()
    grp_number = np.unique(grp_clean_main_df_list[i]['Vendor_name_Initials'])[0]
    Train_X,Test_X,Train_Y,Test_Y = train_test_split(grp_clean_main_df_list[i]['Clean_Vendor_Name'],grp_clean_main_df_list[i]['New_Supplier'],test_size=0.00001, random_state=0)
    X_train_counts = count_vect.fit_transform(Train_X)
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    sgd_clf=SGD.fit(X_train_tfidf,Train_Y)
    Tot_End_Time = datetime.now()
    Tot_time_taken = Tot_End_Time - Tot_St_Time
    return(time_util(Tot_time_taken),grp_number,sgd_clf)

def Match_wordLength(i):
    if (merged_final['Model_pred_Supplier'][i] == merged_final['Prev_Predicted_Supplier'][i]):
        merged_final['Match_word_len'].loc[i] = 'Same Length'
    else:
        merged_final['Match_word_len'].loc[i] = 'Different Length'
    return(merged_final['Match_word_len'])

def rep(name):
    return(name.replace("-","",1))


def trans_name(name):
  srch = re.search("([^\x00-\x7F])+",name)==None
  if srch:
    return(name)
  else:
    return(translator.translate(str(name), dest = 'en').text)


punc = '''!(-[](){};`:)/'"\,<>.?@#$%^*+_~'''  
invalid_punc = ['*','#']
def rem_first_char_punc(name):  # check this whether it is implemented or not
    count = 0
    frame=[]
    for i in range(len(name)):
        if name[i] in punc:
            frame.append(name[i])
            #print(name[i])
            count = count+1
    unq_name_punc = np.unique(frame)
    unq_name_punc = unq_name_punc.tolist()
    for i in range(len(invalid_punc)):
        if invalid_punc[i] in unq_name_punc:
            name=name.replace(invalid_punc[i],'')

    limit = 0
    for i in range(len(name)):
        if name[i] not in punc:
            #print(name[i])
            break
        else:
            #print(name[i])
            limit = limit + 1
            #break
    upper_limit = limit  
    #print(upper_limit) 
    for i in range(upper_limit):
        if name[0] in punc:
            #print(f'the found punctuation is "{name[0]}"')
            name=name.replace(name[0],'',1)
    return(name)


def ds_cleansing(name):
  #print(type(name))
  if type(name) is np.str:
    punc_list = [',','.']
    for c in name:
        if c not in punc_list:
            pass
        else:
            split_string = re.split(f"[{c}]", name)
            name = split_string[len(split_string)-1].strip()
  else:
    pass
  return(name)


def rep_harmonized_sup_name(name):
  #name = str(name)
  for i in re.split(r' ', name):
    if str.upper(i) in supplier_stopwords:
      idx = name.index(i)
      if idx == 0:
        return(name.replace(i,"",1))
    else:
      return(name)


def final_output_sup_name_Cleaning(file,column_name):
    proc_df = file.copy()
    proc_df[column_name]= proc_df[column_name].apply(rep_harmonized_sup_name)
    return(proc_df)



def vendorFlag_wo_VN(i):
    #if (input_df['Clean_Vendor_Name'].iloc[i] in clean_main_df['Clean_Vendor_Name']):
    if (input_df['Clean_Vendor_Name'][i] in clean_main_df['Clean_Vendor_Name'].values)|(input_df['crisp_vendor_name'][i] in clean_main_df['crisp_vendor_name'].values):
      input_df['Vendor_Flag'].iloc[i] = 'Old'
    #elif (input_df['crisp_Vendor_Name'][i] in clean_main_df['crisp_Vendor_Name'].values):
    #  input_df['Vendor_Flag'].iloc[i] = 'Old'
    else:
      input_df['Vendor_Flag'].iloc[i] = 'New'
    return(input_df)



def vendorFlag(i): # New logic provided Daniel
    #if (input_df['Clean_Vendor_Name'].iloc[i] in clean_main_df['Clean_Vendor_Name']):
    if ((input_df['Clean_Vendor_Name'][i] in clean_main_df['Clean_Vendor_Name'].values) and\
        (input_df['DataSource'][i] in clean_main_df['DataSource'].values) and\
        (input_df['Vendor number'][i] in clean_main_df['Vendor number'].values))\
    or ((input_df['crisp_vendor_name'][i] in clean_main_df['crisp_vendor_name'].values) and (input_df['DataSource'][i] in clean_main_df['DataSource'].values) and (input_df['Vendor number'][i] in clean_main_df['Vendor number'].values)):
      #if(input_df['DataSource'][i] in clean_df['DataSource'].values) and (input_df['Vendor number'][i] in clean_df['Vendor number'].values):
      input_df['Vendor_Flag'].iloc[i] = 'Old'
    #elif (input_df['crisp_Vendor_Name'][i] in clean_main_df['crisp_Vendor_Name'].values):
    #  input_df['Vendor_Flag'].iloc[i] = 'Old'
    else:
      input_df['Vendor_Flag'].iloc[i] = 'New'
    return(input_df)


def lingual_check(file):
  proc_df = file
  for n in proc_df[proc_df.columns[0]]:
#  #print(n)
    srch = re.search("([^\x00-\x7F])+",n)==None
    if srch:
      print('no match')
      break
    else:
      print('There are other format of languages present in the file')
      break



# Drop unwanted column names by giving the list of column names in  a list
def drop_cols_df(col_list,dataframe):
  for col in dataframe.columns:
    if col not in col_list:
      dataframe.drop(columns=col,axis=1,inplace=True)
                

# For checking whether there are any duplicates or not

def dup_count(check_dataframe):
  dup_count= check_dataframe[check_dataframe.duplicated()==True].count().unique()[0]
  dup_dataframe = check_dataframe[check_dataframe.duplicated()==True]

  return(dup_count,dup_dataframe)


def check_dup_count(check_df):
  count_dup, dup_df = dup_count(check_df)
  #globals()['count_duplicates'] = count_dup
  if count_dup > 1 and count_dup < 8:
      print('Total Duplicates in Dataset: ',count_dup)
  else:
    print('Total Duplicates in Dataset: ',count_dup)

  return(count_dup,dup_df)


# To count foreign names
def count_foreign(df,col_name):
  count = 0
  for n in df[col_name]:
    srch = re.search("([^\x00-\x7F])+",n)==None
    if srch:
      pass
    else:
      count = count + 1
  #print(f'Other name formats in "new supplier" column : {count}')
  return(count)



def unique_vendor_flag(n):
  if n==2:
    x,y = input_df.Vendor_Flag.unique()
    return(x,y)
  else:
    return(input_df.Vendor_Flag.unique())




In [5]:
master_file,length,no_of_files = reading_master_files()

Saving Master data.xlsx to Master data.xlsx
Filename: Master data.xlsx
Info: {'encoding': None, 'confidence': 0.0, 'language': None} 



In [8]:
print('User uploaded master file with {length} records'.format(length=length))

User uploaded master file with 228994 records


In [9]:
# Master file
master_file.head(4)

Unnamed: 0,DataSource,Vendor number,Vendor name,New Supplier
0,68. 3rd Party Security,6412,� Leeuwenburgh Mark,LEEUWENBURGH MARK
1,10. SAP C11,30117052,"""ANTIESTATICA DE MEXICO, SA de CV""",ANTIESTATICA DE MEXICO
2,15. SAP SP1,102156,"""Le Bleu, Phillip""",LE BLEU PHILLIP
3,12. SAP SEP,110302,# 3W # DO NOT USE,MICROSOFT


In [10]:
raw_df = master_file.copy()
raw_df.drop_duplicates(inplace=True) # Drop duplicates for the combination of DataSource, Vendor number nd Vendor name
#raw_df['Vendor name'].drop_duplicates(inplace=True)
raw_df.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 107913 entries, 0 to 228993
Columns: 4 entries, DataSource to New Supplier
dtypes: object(4)
memory usage: 4.1+ MB


#### Dropping the null rows:

In [11]:
# Use raw_df to get vendor number and DataSource at the end. So copy only Vendor name and new supplier to df
df = raw_df.copy()
df.replace("", np.nan, inplace=True)
df.dropna(inplace=True)  # Here is first drop NA
df.drop_duplicates(inplace=True) # Dropping duplicates
#df['Vendor name'].drop_duplicates(inplace=True) # Dropping duplicates Distinguish between NA and duplicates
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 107854 entries, 0 to 228993
Data columns (total 4 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   DataSource     107854 non-null  object
 1   Vendor number  107854 non-null  object
 2   Vendor name    107854 non-null  object
 3   New Supplier   107854 non-null  object
dtypes: object(4)
memory usage: 4.1+ MB


In [12]:
#df[df['Vendor name'].str.contains('Suzhou')]

In [13]:
after_drop_na_1 = df.shape[0]

In [14]:
no_of_null_records1= length - after_drop_na_1
no_of_null_records1

121140

# Analysis of minimum injection of duplicate vendor names

In [15]:
df['Vendor name'].nunique()

93389

In [16]:
df_dup_analysis = df['Vendor name'].value_counts()
df_dup_analysis

EMPLOYEE                                                     233
ULINE                                                         35
GRAINGER                                                      29
AT&T                                                          22
MOTION INDUSTRIES INC                                         18
                                                            ... 
GOWLING STRATHY & HENDERSON/ATTN 160 ELGIN ST  SUITE 2600      1
Gowin Mechanical, Inc. dba Mid-South Metal Works               1
GOWAN MOTORS NAVAN ROAD LTD                                    1
GOWAN MOTORS NAVAN ROAD                                        1
龙岩市财富贸易有限公司                                                    1
Name: Vendor name, Length: 93389, dtype: int64

In [17]:
df_dup_analysis = pd.DataFrame(df_dup_analysis)

In [18]:
df_dup_analysis.reset_index(inplace=True)
df_dup_analysis.rename(columns = {'index':'Vendor name','Vendor name': '#VN'}, inplace = True)


In [19]:
df_dup_analysis[df_dup_analysis['Vendor name']=='SMITH MACHINERY COMPANY']

Unnamed: 0,Vendor name,#VN
47971,SMITH MACHINERY COMPANY,1


In [20]:
df_dup_analysis[df_dup_analysis['#VN']>=4].median() # this shows that each sample should have minimum 4 dummy entries. 

# Now write a function to increase the Master data dynamically only that group where the values are less than 4

#VN    4.0
dtype: float64

In [21]:
df_dup_analysis_less_4 = df_dup_analysis[df_dup_analysis['#VN']<4]

In [22]:
df_dup_analysis_less_4

Unnamed: 0,Vendor name,#VN
632,UNIVERSAL PUNCH CORP,3
633,RDO EQUIPMENT CO,3
634,Genetec Inc.,3
635,BOB STEPHENS & ASSOCIATES,3
636,WESCO AIRCRAFT,3
...,...,...
93384,GOWLING STRATHY & HENDERSON/ATTN 160 ELGIN ST ...,1
93385,"Gowin Mechanical, Inc. dba Mid-South Metal Works",1
93386,GOWAN MOTORS NAVAN ROAD LTD,1
93387,GOWAN MOTORS NAVAN ROAD,1


In [23]:
df_dup_analysis_less_4_list = df_dup_analysis_less_4['#VN'].unique()
df_dup_analysis_less_4_list = df_dup_analysis_less_4_list.tolist()

In [24]:
df_dup_analysis_less_4_list[0]

3

In [25]:
#bucket_df_dup(df_dup_analysis_less_4,df_dup_analysis_less_4_list)  # Crashing

In [26]:
df_dup_analysis_less_4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 92757 entries, 632 to 93388
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Vendor name  92757 non-null  object
 1   #VN          92757 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.1+ MB


In [27]:
#df_dup_analysis_less_4 = df_dup_analysis_less_4.astype({'#VN':str})

In [28]:
#df_dup_analysis_less_4.info()

In [29]:
pd.concat([grp_test.get_group(3)]*2,ignore_index=True)

NameError: ignored

In [None]:
df_dup_analysis_less_4_list[1]

In [30]:
grp_test = df_dup_analysis_less_4.groupby(by = df_dup_analysis_less_4['#VN'])
dup_frame = []
for i in range(len(df_dup_analysis_less_4_list)):
  #print(i)
  n_times = (4-(i+1))
  print(n_times)
  temp_df = pd.concat([grp_test.get_group(i+1)]*n_times,ignore_index=True)
  dup_frame.append(temp_df)

final_dup_df = pd.concat(dup_frame)
final_dup_df.reset_index(drop=True, inplace=True)


3
2
1


In [31]:
final_dup_df

Unnamed: 0,Vendor name,#VN
0,Polymed Ltd,1
1,POLI PMI INDUSTRIA E COMERCIO LTDA,1
2,Pipe Line Contractors Association/of Canada,1
3,PERFUR REI PERFURACOES LTDA ME,1
4,RITA ANDERSON,1
...,...,...
266692,"ROCKFORM CARBIDE MANUFACTURING, INC.",3
266693,GORE FREIGHT COMPANY LLC,3
266694,MOUNTZ INC,3
266695,L&T TECHNOLOGY SERVICES LIMITED,3


In [32]:
#from google.colab import data_table
#data_table.DataTable(df_dup_analysis_less_4)

In [33]:
ds_frame = []
for i in range(final_dup_df.shape[0]):
  ds_frame.append(f'DS_{i}')

vn_frame = []
for i in range(final_dup_df.shape[0]):
  vn_frame.append(f'VN_{i}')

In [34]:
final_dup_df.insert(0,'DataSource',ds_frame)
final_dup_df.insert(1,'Vendor number',vn_frame)


In [35]:
final_dup_df[final_dup_df['#VN'] == 2]

Unnamed: 0,DataSource,Vendor number,Vendor name,#VN
246816,DS_246816,VN_246816,J & M Plating,2
246817,DS_246817,VN_246817,NEOPOST FRANCE,2
246818,DS_246818,VN_246818,PROWIT S.C.,2
246819,DS_246819,VN_246819,J & L TURNING INC/DBA ALLIANCE BROACH & TOOL,2
246820,DS_246820,VN_246820,BLACK BOX NETWORK SERVICES NV,2
...,...,...,...,...
265603,DS_265603,VN_265603,ANTALIS FRANCE,2
265604,DS_265604,VN_265604,Key Equipment Finance,2
265605,DS_265605,VN_265605,SEMA-PRINT SP. Z O.O. SP. K.,2
265606,DS_265606,VN_265606,FABORY CENTRES,2


In [36]:
final_dup_df[final_dup_df['Vendor name'] == 'ANTALIS FRANCE']

Unnamed: 0,DataSource,Vendor number,Vendor name,#VN
256207,DS_256207,VN_256207,ANTALIS FRANCE,2
265603,DS_265603,VN_265603,ANTALIS FRANCE,2


In [37]:
final_dup_df.drop(columns=['#VN'],axis=1,inplace=True)

In [38]:
final_dup_df

Unnamed: 0,DataSource,Vendor number,Vendor name
0,DS_0,VN_0,Polymed Ltd
1,DS_1,VN_1,POLI PMI INDUSTRIA E COMERCIO LTDA
2,DS_2,VN_2,Pipe Line Contractors Association/of Canada
3,DS_3,VN_3,PERFUR REI PERFURACOES LTDA ME
4,DS_4,VN_4,RITA ANDERSON
...,...,...,...
266692,DS_266692,VN_266692,"ROCKFORM CARBIDE MANUFACTURING, INC."
266693,DS_266693,VN_266693,GORE FREIGHT COMPANY LLC
266694,DS_266694,VN_266694,MOUNTZ INC
266695,DS_266695,VN_266695,L&T TECHNOLOGY SERVICES LIMITED


In [39]:
# Now get the New supplier info from the main dataframe by join
final_dup_df = pd.merge(final_dup_df,df[['Vendor name','New Supplier']],how='left',on= 'Vendor name')


In [40]:
df_final = [df,final_dup_df]
df = pd.concat(df_final)
df.reset_index(drop=True, inplace=True)

In [41]:
df['Vendor name'].value_counts().median()  # Before the median was: 1, now after resampling median: 4

4.0

In [42]:
df['DataSource']=df['DataSource'].apply(ds_cleansing)

In [43]:
df.sample(5)

Unnamed: 0,DataSource,Vendor number,Vendor name,New Supplier
27512,Infor LX - CLVD,48803,DTS FLUID POWER LLC,DTS FLUID POWER
150721,DS_42867,VN_42867,CHAMBERS OWNERS ASSOCIATION,CHAMBERS OWNERS ASSOCIATION
179923,DS_72069,VN_72069,FABERTECK AUTOMACAO LTDA,FABRICIO ADRIA ME
157943,DS_50089,VN_50089,ALTIDO LONDON LTD,ALTIDO LONDON LTD
230058,DS_122204,VN_122204,CIREK KONSULT AB,CIREK KONSULT AB


In [44]:
df['DataSource'].unique()

array(['3rd Party Security', 'SAP C11', 'SAP SP1', ..., 'DS_266694',
       'DS_266695', 'DS_266696'], dtype=object)

### Defined set of stopwords:

In [45]:
vendor_harmonized_stopwords = ['me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
                "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
                'yourselves', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers',
                'herself', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', #'it',
                'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these',
                'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',
                'had', 'having', 'do', 'does', 'did', 'doing','an', #'the', 'and',
                'but','if', 'or', 'because', 'until', 'while', 'at', 'by', 'with', #'for','of', 
                'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',
                'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over',
                'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where',
                'why', 'how', 'any', 'both', 'each', 'few', 'more', 'most', 'other',
                'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too',
                'very', 'can', 'just', 'don', "don't", 'should', "should've",
                'now', 'll', 're', 've', 'ain', 'aren', "aren't", 'couldn',
                "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
                "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't",
                'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
                'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't",'dr','use' #'co',
                #'company', 'ltd', 'co ltd', 'company ltd', 'inc', 'llc', 'lp', 'corp', 'corporation',
                ]#'a','d', 's','i','m', 't', 'o', 'y','he','all','will', 'as'



In [46]:
new_old_match_stopwords = ['co', 'company', 'ltd', 'co ltd', 'company ltd', 'inc', 'llc', 'lp', 'corp', 'corporation','ltl','ftl']

In [47]:
supplier_stopwords = ['ZHEJIANG','ZHANGZHOU','ZHONGSHAN','QINGDAO','QIDONG','SUZHOU'] # all words should be in upper case

In [48]:
df['Vendor name'] = df['Vendor name'].apply(rem_first_char_punc)  # try to implement for input data also IF NOT
df['New Supplier'] = df['New Supplier'].apply(rem_first_char_punc)
df.drop_duplicates(inplace=True)
#df['Vendor name'].drop_duplicates(inplace=True)  #

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 374732 entries, 0 to 395518
Data columns (total 4 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   DataSource     374732 non-null  object
 1   Vendor number  374732 non-null  object
 2   Vendor name    374732 non-null  object
 3   New Supplier   374732 non-null  object
dtypes: object(4)
memory usage: 14.3+ MB


In [50]:
sh_df = df.shape[0]
sh_df

374732

In [51]:
# Below code is to verify whether are there any names apart from english lingual
count = 0
for n in df['New Supplier']:
  srch = re.search("([^\x00-\x7F])+",n)==None
  if srch:
    pass
  else:
    count = count + 1
print(f'Foreign name formats in "new supplier" column : {count}')



count = 0
for n in df['Vendor name']:
  srch = re.search("([^\x00-\x7F])+",n)==None
  if srch:
    pass
  else:
    count = count + 1
print(f'Foreign name formats in "Vendor name" column : {count}')


Foreign name formats in "new supplier" column : 7294
Foreign name formats in "Vendor name" column : 13764


In [52]:
df.head(2)

Unnamed: 0,DataSource,Vendor number,Vendor name,New Supplier
0,3rd Party Security,6412,� Leeuwenburgh Mark,LEEUWENBURGH MARK
1,SAP C11,30117052,"ANTIESTATICA DE MEXICO, SA de CV""",ANTIESTATICA DE MEXICO


In [53]:
## This snippet will print the name of the Column IF there are any NULL records in that Column
if (df.isna().sum().unique()[0] != 0):
  for i in df.columns:
    if df[i].isna().sum() !=0:
      print(i + '\n')
    else:
      pass
else:
  pass

In [54]:
clean_df = data_clean(df)
clean_df['Clean_Vendor_Name'] = clean_df['Clean_Vendor_Name'].apply(trim_spaces)
clean_df.replace("", np.nan, inplace=True)
clean_df.dropna(subset = ["Clean_Vendor_Name","Vendor name"],inplace=True)
clean_df = data_clean_initials(clean_df)
clean_df = crisp_vendor_name(clean_df)

clean_df['DataSource'] = clean_df['DataSource'].apply(lambda x: str(x))
clean_df['Vendor number'] = clean_df['Vendor number'].apply(lambda x: str(x))
clean_df['DataSource'] = clean_df['DataSource'].apply(trim_spaces)
clean_df['Vendor number'] = clean_df['Vendor number'].apply(trim_spaces)


clean_df= clean_df[['DataSource', 'Vendor number', 'Vendor name','Clean_Vendor_Name','crisp_vendor_name','New Supplier','New_Supplier','Vendor_name_Initials']]
clean_df.reset_index(inplace=True,drop=True)

In [55]:
clean_df.sample(2)

Unnamed: 0,DataSource,Vendor number,Vendor name,Clean_Vendor_Name,crisp_vendor_name,New Supplier,New_Supplier,Vendor_name_Initials
73467,Ariba,20013851-A10,POWER CONTROL SYSTEMS INC,power control systems,powercontrolsystems,POWER CONTROL SYSTEMS INC,power control systems inc,p
370969,DS_263065,VN_263065,ENEA S.A. - WLE,enea s a wle,eneasawle,SKARB PANSTWA,skarb panstwa,e


In [56]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374647 entries, 0 to 374646
Data columns (total 8 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   DataSource            374647 non-null  object
 1   Vendor number         374647 non-null  object
 2   Vendor name           374647 non-null  object
 3   Clean_Vendor_Name     374647 non-null  object
 4   crisp_vendor_name     374647 non-null  object
 5   New Supplier          374647 non-null  object
 6   New_Supplier          374647 non-null  object
 7   Vendor_name_Initials  374647 non-null  object
dtypes: object(8)
memory usage: 22.9+ MB


In [57]:
sh_clean_df = clean_df.shape[0]

In [58]:
no_of_null_records2 = sh_df - clean_df.shape[0]
no_of_null_records2

85

In [59]:
clean_df.replace("", np.nan, inplace=True)
clean_df.dropna(subset = ["Clean_Vendor_Name"],inplace=True)
clean_df.dropna(subset = ['New_Supplier'],inplace=True)
clean_ini_list = clean_df['Vendor_name_Initials'].apply(lambda x: re.sub("([^\x00-\x7F])+"," ",x)).unique()
clean_ini_list = np.sort(clean_ini_list)
clean_ini_list = list(filter(str.strip, clean_ini_list))

In [60]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 374639 entries, 0 to 374646
Data columns (total 8 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   DataSource            374639 non-null  object
 1   Vendor number         374639 non-null  object
 2   Vendor name           374639 non-null  object
 3   Clean_Vendor_Name     374639 non-null  object
 4   crisp_vendor_name     374639 non-null  object
 5   New Supplier          374639 non-null  object
 6   New_Supplier          374639 non-null  object
 7   Vendor_name_Initials  374639 non-null  object
dtypes: object(8)
memory usage: 25.7+ MB


In [61]:
no_of_null_records3 = sh_clean_df - clean_df.shape[0]
no_of_null_records3

8

In [62]:
# Filtering the Clean data from noisy data with Non-English Vendor Names
clean_df['noise_flag'] = clean_df['Vendor name'].apply(lambda x: True if (re.findall("([^\x00-\x7F])+",x)) else False)
clean_main_df = clean_df[clean_df['noise_flag'] == False]
clean_main_df.reset_index(inplace=True,drop=True)

In [63]:
clean_main_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374639 entries, 0 to 374638
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   DataSource            374639 non-null  object
 1   Vendor number         374639 non-null  object
 2   Vendor name           374639 non-null  object
 3   Clean_Vendor_Name     374639 non-null  object
 4   crisp_vendor_name     374639 non-null  object
 5   New Supplier          374639 non-null  object
 6   New_Supplier          374639 non-null  object
 7   Vendor_name_Initials  374639 non-null  object
 8   noise_flag            374639 non-null  bool  
dtypes: bool(1), object(8)
memory usage: 23.2+ MB


In [64]:
sh_clean_main_df = clean_main_df.shape[0]
#sh_clean_main_df

In [65]:
no_of_noise_flags = clean_df.shape[0] - clean_main_df.shape[0]
no_of_noise_flags

0

In [69]:
clean_main_df[clean_main_df['Vendor name'] == ('AG PRO')]
# clean_main_df[clean_main_df['Vendor number'] == '40002318']

Unnamed: 0,DataSource,Vendor number,Vendor name,Clean_Vendor_Name,crisp_vendor_name,New Supplier,New_Supplier,Vendor_name_Initials,noise_flag
2994,SAP P10,1105398,AG PRO,ag pro,agpro,AG PRO,ag pro,a,False
161548,DS_53738,VN_53738,AG PRO,ag pro,agpro,AG PRO,ag pro,a,False
243798,DS_136010,VN_136010,AG PRO,ag pro,agpro,AG PRO,ag pro,a,False
326048,DS_218282,VN_218282,AG PRO,ag pro,agpro,AG PRO,ag pro,a,False


In [70]:
clean_main_df.tail(10)

Unnamed: 0,DataSource,Vendor number,Vendor name,Clean_Vendor_Name,crisp_vendor_name,New Supplier,New_Supplier,Vendor_name_Initials,noise_flag
374629,DS_266687,VN_266687,CIM TECH CORP,cim tech,cimtech,CIM TECH,cim tech,c,False
374630,DS_266688,VN_266688,VIEW LINE BVBA,view line bvba,viewlinebvba,VIEW LINE,view line,v,False
374631,DS_266689,VN_266689,Workx Advocaten,workx advocaten,workxadvocaten,WORKX ADVOCATEN,workx advocaten,w,False
374632,DS_266690,VN_266690,TOOL & GAGE ASSOCIATES INC,tool gage associates,toolgageassociates,TOOL & GAGE ASSOCIATES,tool & gage associates,t,False
374633,DS_266691,VN_266691,SmartSD BV,smartsd bv,smartsdbv,SMARTSD,smartsd,s,False
374634,DS_266692,VN_266692,"ROCKFORM CARBIDE MANUFACTURING, INC.",rockform carbide manufacturing,rockformcarbidemanufacturing,ROCKFORM CARBIDE,rockform carbide,r,False
374635,DS_266693,VN_266693,GORE FREIGHT COMPANY LLC,gore freight,gorefreight,GORE FREIGHT COMPANY,gore freight company,g,False
374636,DS_266694,VN_266694,MOUNTZ INC,mountz,mountz,MOUNTZ,mountz,m,False
374637,DS_266695,VN_266695,L&T TECHNOLOGY SERVICES LIMITED,lt technology services limited,lttechnologyserviceslimited,L&T TECHNOLOGY SERVICES,l&t technology services,l,False
374638,DS_266696,VN_266696,HERR INDUSTRIAL INC,herr industrial,herrindustrial,HERR INDUSTRIAL,herr industrial,h,False


In [71]:
tot_no_of_clean_records = clean_main_df.shape[0]
#tot_no_of_clean_records

In [72]:
rest_main_df = clean_df[clean_df['noise_flag'] == True]
rest_main_df.reset_index(inplace=True,drop=True)

In [73]:
rest_main_df.shape[0] # Duplicates may be present. So check the count of 'no_of_unique_dirty_records'

0

In [74]:
rest_main_df.head(2)

Unnamed: 0,DataSource,Vendor number,Vendor name,Clean_Vendor_Name,crisp_vendor_name,New Supplier,New_Supplier,Vendor_name_Initials,noise_flag


In [75]:
rest_main_df.drop_duplicates(subset=['Vendor name'], inplace=True)


Unnamed: 0,DataSource,Vendor number,Vendor name,Clean_Vendor_Name,crisp_vendor_name,New Supplier,New_Supplier,Vendor_name_Initials,noise_flag


In [76]:
no_of_unique_dirty_records = rest_main_df.shape[0] # Unique no of dirty records
no_of_unique_dirty_records

0

In [77]:
rest_main_df = rest_main_df[['DataSource', 'Vendor number','Vendor name', 'New Supplier']]

In [78]:
#This percentage is calculated to know how much irregular unique records are present. Need to check this evertime.
# This percentage is good to be in range of 5~8%
per_irr = format((rest_main_df.shape[0]/clean_df.shape[0])*100, ".3f") # show this to Christian.
if float(per_irr)<8:
    print(f'The percentage of irregular records is below the threshold value (5~8%):  {per_irr}'+'%')
else:
    print(f'The percentage of irregular records is above the threshold value (5~8%):  {per_irr}'+'%')

The percentage of irregular records is below the threshold value (5~8%):  0.000%


## Extracting Bad/uncleaned Master Data: 

In [79]:
if rest_main_df.shape[0]>0:
  rest_main_df.to_excel('Uncleaned_MainData.xlsx',index=False,encoding='utf8')
  files.download('Uncleaned_MainData.xlsx')
else:
  print("No Uncleaned data")

No Uncleaned data


## Bucketing the master data/ Group the initials

In [80]:
grp_clean_main_df_list = bucket_df(clean_main_df,clean_ini_list)

In [81]:
#grp_clean_main_df_list[13]

# Input file:

In [82]:
input_uploads = files.upload()

Saving major_df.xlsx to major_df.xlsx


In [84]:
# Input file extension validation

name_ext_frame = []
for fn in input_uploads.keys():
  name, extension = os.path.splitext(fn)
  ext = extension.replace('.','')
  name_ext_frame.append([name,ext])


for i in range(len(name_ext_frame)):
    if str.lower(name_ext_frame[i][1]) != 'xlsx':
      print('Alert Info:: file with other extension apart from "XLSX":', name_ext_frame[i])
    else:
      print('Info::'+'\n'+
            'File extension:: XLSX')




Info::
File extension:: XLSX


In [85]:
tot_files = len(input_uploads.keys())
tot_files

1

In [86]:
#fn = [fn for fn in input_uploads.keys()][0]
#fn

In [87]:
if tot_files >1:
    inp_frame=[]
    #total = 0
    inp_total = 0
    for fn in input_uploads.keys():
        print(f'Filename: {fn}')
        file = pd.read_excel(io.BytesIO(input_uploads[fn]))
        dimen = file.shape
        nr = dimen[0]
        nc = dimen[1]
        inp_total=inp_total+nr
        print(f'\nNo. of Rows: {nr}\nNo. of cols: {nc}\n')
        print('\n')
        
        file.reset_index(inplace=True,drop=True)
        inp_frame.append(file)
    cons_inp_df = pd.concat(inp_frame,sort=False)
    print(f'total: {inp_total}\n___________________________________________________________________________________________')
    input_raw_df = cons_inp_df.copy()
    refined_inp_col_names = (input_raw_df.columns.to_series(index=range(len(input_raw_df.columns))).apply(lambda x : trim_spaces(x))).to_list()
    input_raw_df.columns=refined_inp_col_names
else:
    print('Else- section')
    inp_total = 0
    fn = [fn for fn in input_uploads.keys()][0]
    print(f'Filename: {fn}')
    file = pd.read_excel(io.BytesIO(input_uploads[fn]))
    #file = file.head(2000)  #kept this line for testing purpose, keep the number as per your testing, top N records
    #file = file.sample(2000) #kept this line for testing purpose, keep the number as per your testing, Random records
    file.reset_index(inplace=True,drop=True) #kept this line for testing purpose, keep the number as per your testing, Random records
    dimen = file.shape
    nr = dimen[0]
    nc = dimen[1]
    inp_total=inp_total+nr
    print(f'\nNo. of Rows: {nr}\nNo. of cols: {nc}\n')
    print('\n')
    print(f'total: {inp_total}\n_________________________________________________________________________________________________')
    input_raw_df = file.copy()
    refined_inp_col_names = (input_raw_df.columns.to_series(index=range(len(input_raw_df.columns))).apply(lambda x : trim_spaces(str(x)))).to_list()
    input_raw_df.columns=refined_inp_col_names


Else- section
Filename: major_df.xlsx

No. of Rows: 9507
No. of cols: 7



total: 9507
_________________________________________________________________________________________________


In [88]:
no_of_raw_inp_records = inp_total
no_of_raw_inp_records

9507

In [89]:
# give the input column names that is required and need to drop the rest 
acp_inp_col_names = ['DataSource','Vendor number','Vendor name','Spend (USD)','Segment','Primary Category'] 

In [90]:
input_raw_df.head()

Unnamed: 0,DataSource,Vendor number,New Supplier,Vendor name,Spend (USD),Segment,Primary Category
0,68. 3rd Party Security,34001,4Launch,ACES Direct (4LAUNCH),3281.53803,SECURITY,Finished Goods
1,09. SAP P10,1107212,a &,A & F ELECTRIC LLC,44000.0,SECURITY,General Procurement
2,22. Syteline IES,9992470-DEX,a & a Engineering Civil and Structural Engineers,A & A ENGINEERING CIVIL AND STRUCTURAL ENGINEE...,3450.0,INDUSTRIAL,Undefined
3,09. SAP P10,1072125,a & a Hydraulic Repair CO DIV of Mcgivern,A & A HYDRAULIC REPAIR CO/DIV OF MCGIVERN ENT INC,114.58,GTS,Undefined
4,70. JDE CAM,177662,a & a Machine & Develop,A & A MACHINE & DEVELOP,294.793886,INDUSTRIAL,General Procurement


In [91]:
drop_cols_df(acp_inp_col_names,input_raw_df)

In [92]:
input_raw_df.columns

Index(['DataSource', 'Vendor number', 'Vendor name', 'Spend (USD)', 'Segment',
       'Primary Category'],
      dtype='object')

In [93]:
#input_raw_df[input_raw_df.duplicated()==True].count()#.unique()[0]
input_raw_df.reset_index(inplace=True,drop=True)

In [94]:
count_dup,dup_df = check_dup_count(input_raw_df)
count_dup


Total Duplicates in Dataset:  0


0

In [95]:
dup_df # Here we are getting duplicate records

Unnamed: 0,DataSource,Vendor number,Vendor name,Spend (USD),Segment,Primary Category


In [96]:
input_raw_df.replace(" ", np.nan, inplace=True)
input_raw_df.replace("", np.nan, inplace=True)

In [97]:
input_raw_df.isnull().sum()

DataSource          0
Vendor number       0
Vendor name         0
Spend (USD)         0
Segment             0
Primary Category    0
dtype: int64

In [98]:
input_raw_df_null_records = input_raw_df[input_raw_df['Vendor name'].isnull()]
input_raw_df_null_records_counts = len(input_raw_df_null_records)
input_raw_df_null_records_counts

0

In [99]:
input_raw_df_null_records

Unnamed: 0,DataSource,Vendor number,Vendor name,Spend (USD),Segment,Primary Category


In [100]:
if input_raw_df_null_records_counts>0:
  input_raw_df_null_records.to_excel('input file null records.xlsx',index=False,encoding='utf8')
  files.download('input file null records.xlsx')
else:
  print('No Null records in input file')

No Null records in input file


In [101]:
# Drop Na's if any:
count_na_inp_raw_df = input_raw_df.isnull().sum().max()
print("Raw Dataframe NA Count: ",count_na_inp_raw_df)
if count_na_inp_raw_df>0:
  na_dict  = input_raw_df.isnull().sum().to_dict()#:  #.max()>0
  for k,v in na_dict.items():
    if v>0:
      if (v == input_raw_df.isnull().sum().max()) and (k in ['DataSource',	'Vendor number',	'Vendor name']):
        print("The column with maximum NA counts: ",k)
        print("Info:   Dropping the NAs from the Dataframe:::::::::::")
        input_raw_df.dropna(inplace=True)
        input_raw_df.reset_index(inplace=True,drop=True)


# Drop duplicates if any:
if count_dup>0:
  input_raw_df.drop_duplicates(inplace=True)


# Now reset the index
input_raw_df.reset_index(inplace=True,drop=True)

Raw Dataframe NA Count:  0


In [102]:
#check_dup_count(input_raw_df)

In [103]:
input_raw_df.sort_index(inplace=True)
input_raw_df.reset_index(drop=True, inplace=True)

In [104]:
input_raw_df.shape

(9507, 6)

In [105]:
# Method 1:
#if (input_raw_df_null_records_counts == (inp_total - input_raw_df.shape[0])) and dup_count != input_raw_df_null_records_counts:
#  no_of_null_inp_records1 = input_raw_df_null_records_counts
#else:
#  no_of_null_inp_records1 = inp_total - input_raw_df.shape[0]

# Method 2:
no_of_null_inp_records1 = input_raw_df_null_records_counts
no_of_null_inp_records1

0

In [106]:
foreign_count_raw = count_foreign(input_raw_df,'Vendor name')
print(f'Foreign name/names count: {foreign_count_raw}') # if foreign counts greater than 1 then fetch the records by implementing noise flag


Foreign name/names count: 92


In [107]:
# Comment this section since Christian asked not to remove rather to have in file by translating 
#if foreign_count>0:
#  input_raw_df['noise_flag'] = input_raw_df['Vendor name'].apply(lambda x: True if (re.findall("([^\x00-\x7F])+",x)) else False)
#  foreign_inp_df = input_raw_df[input_raw_df['noise_flag'] == True]
#  foreign_inp_df.to_excel('foreign input data.xlsx',index=False,encoding='utf8')
#  files.download('foreign input data.xlsx')
  
  # Also filter non foreign data from input data
#  input_raw_df = input_raw_df[input_raw_df['noise_flag'] == False]
#  input_raw_df.drop(columns=['noise_flag'],axis = 1, inplace=True)
#  input_raw_df.reset_index(drop=True,inplace=True)


In [108]:
input_raw_df.shape[0]

9507

In [109]:
#input_raw_df.head(2)

In [110]:
#lingual_check(input_raw_df)

In [111]:
inp_match_df = input_raw_df.copy()

In [112]:
#inp_match_df.shape

In [113]:
#lingual_check(inp_match_df)

In [114]:
#inp_match_df.head(2)

In [115]:
# Input data cleaning

input_df = input_df_data_clean(input_raw_df)
input_df = input_df_data_ini(input_df)
input_df = crisp_vendor_name(input_df)

# this DS cleaning is optional
for col in input_df.columns:
  if col == 'DataSource':
    input_df['DataSource']=input_df['DataSource'].apply(ds_cleansing)

In [116]:
foreign_count = count_foreign(input_df,'Vendor name')
if foreign_count==0 and foreign_count_raw>0:  
  print(f'Foreign name/names count Translated: {foreign_count_raw}') # if foreign counts greater than 1 then fetch the records by implementing noise flag
else:
  print(f'Foreign name/names count not Translated: {foreign_count}')


Foreign name/names count Translated: 92


In [117]:
input_df.shape[0]

9507

In [118]:
records_removed_after_data_cleaning = inp_match_df.shape[0] - input_df.shape[0]
records_removed_after_data_cleaning

0

In [119]:
input_df.columns

Index(['DataSource', 'Vendor number', 'Vendor name', 'Spend (USD)', 'Segment',
       'Primary Category', 'Clean_Vendor_Name', 'Vendor_name_Initials',
       'crisp_vendor_name'],
      dtype='object')

In [120]:
for col in input_df.columns:
  if col in ['DataSource','Vendor number']:
    input_df['DataSource'] = input_df['DataSource'].apply(lambda x: str(x))
    input_df['Vendor number'] = input_df['Vendor number'].apply(lambda x: str(x))
    input_df['DataSource'] = input_df['DataSource'].apply(trim_spaces)
    input_df['Vendor number'] = input_df['Vendor number'].apply(trim_spaces)
  else:
    pass


In [121]:
lingual_check(input_df)

no match


In [122]:
input_df.shape[0]

9507

In [123]:
input_df.reset_index(inplace=True,drop=True)

In [124]:
input_df

Unnamed: 0,DataSource,Vendor number,Vendor name,Spend (USD),Segment,Primary Category,Clean_Vendor_Name,Vendor_name_Initials,crisp_vendor_name
0,3rd Party Security,34001,ACES Direct (4LAUNCH),3.281538e+03,SECURITY,Finished Goods,aces direct 4launch,a,acesdirect4launch
1,SAP P10,1107212,A & F ELECTRIC LLC,4.400000e+04,SECURITY,General Procurement,a f electric,a,afelectric
2,Syteline IES,9992470-DEX,A & A ENGINEERING CIVIL AND STRUCTURAL ENGINEE...,3.450000e+03,INDUSTRIAL,Undefined,a a engineering civil and structural engineers,a,aaengineeringcivilandstructuralengineers
3,SAP P10,1072125,A & A HYDRAULIC REPAIR CO/DIV OF MCGIVERN ENT INC,1.145800e+02,GTS,Undefined,a a hydraulic repair div of mcgivern ent,a,aahydraulicrepairdivofmcgivernent
4,JDE CAM,177662,A & A MACHINE & DEVELOP,2.947939e+02,INDUSTRIAL,General Procurement,a a machine develop,a,aamachinedevelop
...,...,...,...,...,...,...,...,...,...
9502,INBUS,VCB0224-KOREA_SB,"All Scale Co., Ltd.",4.409710e+02,INDUSTRIAL,General Procurement,all scale,a,allscale
9503,SAP C11,30113432,"Accenture (China) Co., Ltd. / Accenture (China...",9.799743e+05,Undefined,Undefined,accenture china accenture china,a,accenturechinaaccenturechina
9504,UFIDA - SH,6012,"ANCHOR FASTENERS IND.CO.,LTD",5.138550e+03,INDUSTRIAL,Undefined,anchor fasteners ind,a,anchorfastenersind
9505,SAP E03,180427,"AKASHIDENKI CO., LTD.",8.589643e+01,GTS,Undefined,akashidenki,a,akashidenki


In [125]:
count_col = 0
for col in input_df.columns:
  if col in acp_inp_col_names:
    count_col = count_col +1
print(count_col)
input_df['Vendor_Flag']=np.nan
if count_col==1:
  for i in tqdm(range(len(input_df))):
    vendorFlag_wo_VN(i)
  input_df['DataSource']=np.nan
  input_df['Vendor number']=np.nan
  
else:
  for i in tqdm(range(len(input_df))):
    vendorFlag(i)



6


  0%|          | 0/9507 [00:00<?, ?it/s]

In [126]:
input_df.Vendor_Flag.unique()


array(['New', 'Old'], dtype=object)

In [127]:
input_df.shape[0]

9507

In [128]:
#stop

In [129]:
input_df['Vendor name'].nunique()

8491

In [130]:
input_df.replace(" ", np.nan, inplace=True)

In [131]:
input_df.shape[0]

9507

In [132]:
inp_df_shape1 = input_df.shape[0]
inp_df_shape1

9507

In [133]:
input_df.isna().sum()

DataSource              0
Vendor number           0
Vendor name             0
Spend (USD)             0
Segment                 0
Primary Category        0
Clean_Vendor_Name       0
Vendor_name_Initials    0
crisp_vendor_name       0
Vendor_Flag             0
dtype: int64

In [134]:
if input_df['Vendor_name_Initials'].isna().sum() > 0:
  input_df_null_Vn_initial_records = input_df[input_df['Vendor_name_Initials'].isnull()]
  input_df_null_Vn_initial_records.to_excel('input df null Vn initials.xlsx',index=False,encoding='utf8')
  files.download('input df null Vn initials.xlsx')
  input_df.dropna(subset = ["Vendor_name_Initials"],inplace=True) 
  
input_df.shape[0] # also add a condition if any NULLs are present in the initials. And take the count for the final info
# this code can also be alter where one choose to have the filtered input data 

9507

In [135]:
# dropping NA again IF there is any while creating Vendor_name_Initials else it will be zero
no_of_null_inp_records2 = inp_df_shape1 - input_df.shape[0]
no_of_null_inp_records2

0

In [136]:
input_df.shape

(9507, 10)

In [137]:
input_df['Vendor name'].isna().sum()

0

In [138]:
input_df.reset_index(inplace=True,drop=True)

In [139]:
#input_df[input_df['Clean_Vendor_Name'] == input_df['Clean_Vendor_Name'].iloc[276]]

In [140]:
input_df.shape[0]

9507

In [141]:
n= input_df.Vendor_Flag.nunique()
print(n)

2


In [142]:
new_records_with_dup = input_df[input_df.Vendor_Flag =='New'].shape[0] #Duplicate vendor names
new_records_with_dup

1344

In [143]:
old_records_with_dup = input_df[input_df.Vendor_Flag =='Old'].shape[0] #Duplicate vendor names
old_records_with_dup

8163

In [144]:
inp_new_flag_df = input_df[input_df.Vendor_Flag =='New']
inp_new_flag_df.reset_index(drop=True,inplace=True)
inp_new_flag_df.head(5)

Unnamed: 0,DataSource,Vendor number,Vendor name,Spend (USD),Segment,Primary Category,Clean_Vendor_Name,Vendor_name_Initials,crisp_vendor_name,Vendor_Flag
0,3rd Party Security,34001,ACES Direct (4LAUNCH),3281.53803,SECURITY,Finished Goods,aces direct 4launch,a,acesdirect4launch,New
1,SAP P10,1107212,A & F ELECTRIC LLC,44000.0,SECURITY,General Procurement,a f electric,a,afelectric,New
2,SAP P10,1072125,A & A HYDRAULIC REPAIR CO/DIV OF MCGIVERN ENT INC,114.58,GTS,Undefined,a a hydraulic repair div of mcgivern ent,a,aahydraulicrepairdivofmcgivernent,New
3,Navision Hydraulics,92574-311,A & A SAFETY INC,0.0,INDUSTRIAL,Not Controllable,a a safety,a,aasafety,New
4,JDE CAM,141835,A & S CUT OFF CENTER INC.,7810.42,INDUSTRIAL,Commodities,a s cut off center,a,ascutoffcenter,New


In [145]:
input_df.nunique()

DataSource                52
Vendor number           9422
Vendor name             8491
Spend (USD)             4841
Segment                    5
Primary Category           9
Clean_Vendor_Name       8026
Vendor_name_Initials       1
crisp_vendor_name       7974
Vendor_Flag                2
dtype: int64

In [146]:
new_records = inp_new_flag_df['Vendor name'].count()
new_record_non_dup = inp_new_flag_df['Vendor name'].nunique()
print(f'Count of new records with duplicate vendor names: {new_records}\nCount of new records with unique vendor names: {new_record_non_dup}')

Count of new records with duplicate vendor names: 1344
Count of new records with unique vendor names: 1303


In [147]:
inp_old_flag_df = input_df[input_df.Vendor_Flag == 'Old']
inp_old_flag_df.reset_index(drop=True,inplace=True)
inp_old_flag_df.shape[0]

8163

In [148]:
old_records = inp_old_flag_df['Vendor name'].count()
old_records_non_dup = inp_old_flag_df['Vendor name'].nunique()
print(f'Count of old records with duplicate vendor names: {old_records}\nCount of old records with unique vendor names: {old_records_non_dup}')

# summation of "new_record_non_dup" + "old_records_non_dup" != "tot_no_of_unique_vendors" because of the condtion of NEW and OLD. Since, if the same record would be there in OLD records, irrespective of the condition, then Nunique would have return
# the same result as "tot_no_of_unique_vendors". So the record which should have been in OLD that moved to NEW, so the numbers changed.
# The correct result is "tot_no_of_unique_vendors". SO, "tot_no_of_unique_vendors"<=  "new_record_non_dup" + "old_records_non_dup"

Count of old records with duplicate vendor names: 8163
Count of old records with unique vendor names: 7261


In [149]:
inp_old_flag_df.head()

Unnamed: 0,DataSource,Vendor number,Vendor name,Spend (USD),Segment,Primary Category,Clean_Vendor_Name,Vendor_name_Initials,crisp_vendor_name,Vendor_Flag
0,Syteline IES,9992470-DEX,A & A ENGINEERING CIVIL AND STRUCTURAL ENGINEE...,3450.0,INDUSTRIAL,Undefined,a a engineering civil and structural engineers,a,aaengineeringcivilandstructuralengineers,Old
1,JDE CAM,177662,A & A MACHINE & DEVELOP,294.793886,INDUSTRIAL,General Procurement,a a machine develop,a,aamachinedevelop,Old
2,SAP C11,30101537,A & A TROPHY HOUSE INC/DBA A&A AWARDS & PROMOT...,0.0,GTS,General Procurement,a a trophy house dba aa awards promotionals,a,aatrophyhousedbaaaawardspromotionals,Old
3,Ariba,30101537-A10,A & A TROPHY HOUSE INC/DBA A&A AWARDS & PROMOT...,0.0,GTS,General Procurement,a a trophy house dba aa awards promotionals,a,aatrophyhousedbaaaawardspromotionals,Old
4,Ariba,1099744-A9,A & A TROPHY HOUSE INC/A & A AWARDS & PROMOTIO...,31207.08,CORPORATE,General Procurement,a a trophy house a a awards promotionals,a,aatrophyhouseaaawardspromotionals,Old


In [150]:
merged_inp_old_flag_df = pd.merge(inp_old_flag_df, clean_main_df['Clean_Vendor_Name'], on=['Clean_Vendor_Name'], how='inner')
##merged_inp_old_flag_df.dropna(subset=['Clean_Vendor_Name','Vendor_name_Initials'],inplace=True)
merged_inp_old_flag_df.drop_duplicates(subset=['Vendor name'],inplace=True)
merged_inp_old_flag_df.reset_index(drop=True,inplace=True)

In [151]:
merged_inp_old_flag_df.head()

Unnamed: 0,DataSource,Vendor number,Vendor name,Spend (USD),Segment,Primary Category,Clean_Vendor_Name,Vendor_name_Initials,crisp_vendor_name,Vendor_Flag
0,Syteline IES,9992470-DEX,A & A ENGINEERING CIVIL AND STRUCTURAL ENGINEE...,3450.0,INDUSTRIAL,Undefined,a a engineering civil and structural engineers,a,aaengineeringcivilandstructuralengineers,Old
1,JDE CAM,177662,A & A MACHINE & DEVELOP,294.793886,INDUSTRIAL,General Procurement,a a machine develop,a,aamachinedevelop,Old
2,SAP C11,30101537,A & A TROPHY HOUSE INC/DBA A&A AWARDS & PROMOT...,0.0,GTS,General Procurement,a a trophy house dba aa awards promotionals,a,aatrophyhousedbaaaawardspromotionals,Old
3,Ariba,1099744-A9,A & A TROPHY HOUSE INC/A & A AWARDS & PROMOTIO...,31207.08,CORPORATE,General Procurement,a a trophy house a a awards promotionals,a,aatrophyhouseaaawardspromotionals,Old
4,3rd Party Security,108742,A & B JERNVARE A/S,63.09746,SECURITY,Finished Goods,a b jernvare a s,a,abjernvareas,Old


In [152]:

merged_inp_old_flag_df.reset_index(drop=True,inplace=True)

if merged_inp_old_flag_df[merged_inp_old_flag_df.duplicated(subset='Vendor name')].shape[0]>0:
  merged_inp_old_flag_df[merged_inp_old_flag_df.drop_duplicates(subset='Vendor name')]


merged_inp_old_flag_df_VN_list = merged_inp_old_flag_df['Clean_Vendor_Name'].tolist()

#no_of_fit_old_records
no_of_fit_old_records = merged_inp_old_flag_df.shape[0]

In [153]:
#to take out tthe unfit data
# unfit data tells about those vendor names were the names are not present correctly
# As unfit data is not required to have seperately so COMMENT this as of now
#inp_old_flag_df_unfit= inp_old_flag_df[~inp_old_flag_df['Clean_Vendor_Name'].isin(merged_inp_old_flag_df_VN_list)]
#inp_old_flag_df_unfit[inp_old_flag_df_unfit.duplicated(subset='Vendor name')].shape[0]
#inp_old_flag_df_unfit.to_excel(f'Input_old_unfit_data.xlsx',index=False,encoding='utf8')
#files.download('Input_old_unfit_data.xlsx')
#inp_old_flag_df_unfit.shape[0]

In [154]:
input_df.shape[0]

9507

In [155]:
tot_no_of_null_inp_records = no_of_null_inp_records1 + no_of_null_inp_records2 
tot_no_of_null_inp_records

0

In [156]:
# Since we are not taking out and considering the irregular old matched vendor names EX: Samsung == Sam sung. Since this two are completely different but they are old # Ask whether he wants to have such list or not/ Counts
#no_of_unfit_old_records = inp_old_flag_df.shape[0] - no_of_fit_old_records
#no_of_unfit_old_records


In [157]:
tot_no_of_clean_inp_records = input_df.shape[0]# this contains duplicate Vendon names but combination of all columns are unique
tot_no_of_unique_vendors = input_df['Vendor name'].nunique() 

In [158]:
#input_df = input_df.sample(100)
input_df.reset_index(drop=True,inplace=True)


In [159]:
#stop

In [160]:
#input_df['Vendor_Flag'].unique() == 'Old'

In [161]:

if n==2:
    inp_df_New, inp_df_Old = [x for _, x in input_df.groupby(input_df['Vendor_Flag'] == 'Old')]
    print(f"Unique Vendor counts: {inp_df_Old['Vendor name'].nunique()}")
    print(f"old record counts: {len(inp_df_Old['Vendor name'])}")

    inp_ini_list = np.sort(inp_df_Old.Vendor_name_Initials.unique())
    grp_input_df_new=[]
    
    # Cleaning chamber:
    inp_df_New_output=inp_CleaningChamber(inp_df_New) #added new line after this line # this is one of the Dataframe
    inp_df_New_output.drop(columns=['Vendor_name_Initials','Clean_Vendor_Name'],axis=1,inplace=True)
    inp_df_New_output['Vendor_Name_cleansed'] = inp_df_New_output['Vendor_Name_cleansed'].apply(lambda x: x.upper())
    # write a condition that if there is no values in datasource or vendor number then  replace the same with nan
    
    inp_df_New_output['Manual Harmonized']=np.nan
    inp_df_New_output['Match Ratio']=np.nan
    inp_df_New_output['True Match Flag']=np.nan
    inp_df_New_output['Match_word_len'] = np.nan
    
    inp_df_New_output= inp_df_New_output[['DataSource','Vendor number','Vendor name','Vendor_Name_cleansed','Manual Harmonized','Vendor_Flag','Spend (USD)','Segment','Primary Category','Match Ratio','Match_word_len','True Match Flag']]
    
    inp_df_New_output.columns=['DataSource','Vendor number','Vendor name','Predicted Names/ Model Cleansed','Manual Harmonized','Vendor_Flag','Spend (USD)','Segment','Primary Category','Match Ratio','Match_word_len','True Match Flag']

    inp_df_New_output.reset_index(drop=True,inplace=True)
    
    
    # Model chamber:
    inp_ini_list = inp_ini_list.tolist()
    print(f"Again old record counts: {len(inp_df_Old['Vendor name'])}")
    grp_input_df = bucket_df(inp_df_Old,inp_ini_list)
    
    
    final_frame = []
    for i in tqdm(range(len(inp_ini_list))):
        for j in range(len(clean_ini_list)):
            if inp_ini_list[i]==clean_ini_list[j]:
                # we need to pass index ('j') to model defination. After this you can do global to name the DF's
                print('index value of clean_ini_list to be matched with inp_ini_list : ',j)
                print(inp_ini_list[i])
                print(clean_ini_list[j])                
                time_taken,grp_number,sgd_classifier=sgd_clf(j)
                print(f'[Info] Time taken to train group member "{grp_number}": {time_taken}')
                
                df_list = grp_input_df[i]['Clean_Vendor_Name'].values
                #print(df_list)
                frame=[]
                pred_st_time =  datetime.now()
                for k in tqdm(df_list):
                    predict_val = pd.DataFrame(sgd_classifier.predict(count_vect.transform([str(k)])))
                    frame.append(predict_val)
                inter_frame = pd.concat(frame).reset_index()
                inter_frame.drop('index',axis=1,inplace=True)
                frame_df_list= pd.DataFrame(df_list)
                row_val= pd.concat([frame_df_list, inter_frame], axis = 1) # try global here
                pred_end_time =  datetime.now()
                pred_time_taken = pred_end_time - pred_st_time
                pred_time_taken = time_util(pred_time_taken)
                print(f'[Info] Time taken to predict {grp_number}: {pred_time_taken}')
                print('-----------------------------------------------------------------------------------------------')
                final_frame.append(row_val)  
    final_output = pd.concat(final_frame)
    final_output.reset_index(inplace=True)
    final_output.drop('index',axis=1,inplace=True)
    final_output.columns=['Vendor_names','New_Supplier_Pred']
    final_output['New_Supplier_Pred'] = final_output['New_Supplier_Pred'].apply(lambda x: x.upper())
    final_output['New_Supplier_Pred'] = final_output['New_Supplier_Pred'].apply(lambda x: x.upper())
    clean_main_df.sort_values(by=['Clean_Vendor_Name'])
    final_output.drop_duplicates(inplace=True)
     
    merged_final = pd.merge(final_output,inp_df_Old,left_on='Vendor_names',right_on='Clean_Vendor_Name',how='left')   
        ##merged_final.drop_duplicates(subset=['Vendor name'],inplace=True)
    merged_final.drop(columns=['Vendor_names','Vendor_name_Initials'],axis=1,inplace=True)
    merged_final = merged_final[['DataSource','Vendor number','Vendor name','New_Supplier_Pred','Clean_Vendor_Name','crisp_vendor_name','Vendor_Flag', 'Spend (USD)','Segment','Primary Category']]
    #clean_main_df_test.drop_duplicates(inplace=True) #uncomment this and drop duplicates from clean_main_df if needed    
    merged_final =  pd.merge(merged_final,clean_main_df[['DataSource',	'Vendor number','crisp_vendor_name','New Supplier']],left_on=['DataSource','Vendor number','crisp_vendor_name'],right_on=['DataSource','Vendor number','crisp_vendor_name'],how='left') # previously used 'Clean_Vendor_Name' and removed 'Clean_Vendor_Name' from clean_main_df col list
    merged_final.drop_duplicates(inplace=True) #subset=['Vendor name'],

        
    ##merged_final.drop(columns=['Clean_Vendor_Name','crisp_vendor_name'],axis=1,inplace=True)
    merged_final = merged_final[['DataSource','Vendor number','Vendor name','New_Supplier_Pred','New Supplier','Vendor_Flag','Spend (USD)','Segment','Primary Category']] 
    merged_final.columns=['DataSource','Vendor number','Vendor name','Model_pred_Supplier','Prev_Predicted_Supplier','Vendor_Flag','Spend (USD)','Segment','Primary Category']
    merged_final['Prev_Predicted_Supplier'] = merged_final['Prev_Predicted_Supplier'].apply(lambda x: str(x).upper())
    merged_final['Match_ratio']= merged_final.apply(lambda x: fuzz.ratio(str(x['Model_pred_Supplier']),str(x['Prev_Predicted_Supplier'])),axis=1)
    merged_final['Match_word_len'] = np.nan
    merged_final['True Match Flag']= ''
    merged_final.reset_index(drop=True,inplace=True)

    for i in tqdm(range(len(merged_final))):
        Match_wordLength(i)

    for i in tqdm(range(len(merged_final))):
      if (str(merged_final['Match_ratio'][i])=='100') & (merged_final['Match_word_len'][i] == 'Same Length'):
          merged_final['True Match Flag'].loc[i]= 'Yes'
      else:
          merged_final['True Match Flag'].loc[i]= 'No'

    merged_final.columns=['DataSource','Vendor number','Vendor name','Predicted Names/ Model Cleansed','Manual Harmonized','Vendor_Flag','Spend (USD)','Segment','Primary Category','Match Ratio','Match_word_len','True Match Flag']
    merged_final['DataSource']=merged_final['DataSource'].apply(ds_cleansing)

    merged_final= final_output_sup_name_Cleaning(merged_final,'Predicted Names/ Model Cleansed')
    merged_final= final_output_sup_name_Cleaning(merged_final,'Manual Harmonized')


elif(n==1 and (input_df['Vendor_Flag'].unique() == 'New')):
  # put logic when the output is needed for only OLD records and n==1 
  # write a elif condition that if n==1 and only New records are there then
    print('Elif section is running for NEW...')
    print(input_df['Vendor name'].nunique())

    inp_ini_list = np.sort(input_df.Vendor_name_Initials.unique())
        
    # Cleaning chamber:
    inp_df_New_output=inp_CleaningChamber(input_df) #added new line after this line # this is one of the Dataframe
    inp_df_New_output.drop(columns=['Vendor_name_Initials','Clean_Vendor_Name'],axis=1,inplace=True)
    inp_df_New_output['Vendor_Name_cleansed'] = inp_df_New_output['Vendor_Name_cleansed'].apply(lambda x: x.upper())
    # write a condition that if there is no values in datasource or vendor number then  replace the same with nan
    
    inp_df_New_output['Manual Harmonized']=np.nan
    inp_df_New_output['Match Ratio']=np.nan
    inp_df_New_output['True Match Flag']=np.nan
    inp_df_New_output['Match_word_len'] = np.nan
    
    inp_df_New_output= inp_df_New_output[['DataSource','Vendor number','Vendor name','Vendor_Name_cleansed','Manual Harmonized','Vendor_Flag','Spend (USD)','Segment','Primary Category','Match Ratio','Match_word_len','True Match Flag']]
    
    inp_df_New_output.columns=['DataSource','Vendor number','Vendor name','Predicted Names/ Model Cleansed','Manual Harmonized','Vendor_Flag','Spend (USD)','Segment','Primary Category','Match Ratio','Match_word_len','True Match Flag']

    inp_df_New_output.reset_index(drop=True,inplace=True)
    
elif(n==1 and (input_df['Vendor_Flag'].unique() == 'Old')):
  # put logic when the output is needed for only OLD records and n==1 
  # write a elif condition that if n==1 and only New records are there then
  print('Elif section is running  for Old records...')
  print(input_df['Vendor name'].nunique())

  inp_ini_list = np.sort(input_df.Vendor_name_Initials.unique())  
  # Model chamber:
  inp_ini_list = inp_ini_list.tolist()
  grp_input_df = bucket_df(input_df,inp_ini_list)
  
  final_frame = []
  for i in tqdm(range(len(inp_ini_list))):
      for j in range(len(clean_ini_list)):
          if inp_ini_list[i]==clean_ini_list[j]:
              # we need to pass index ('j') to model defination. After this you can do global to name the DF's
              print('index value of clean_ini_list to be matched with inp_ini_list : ',j)
              print(inp_ini_list[i])
              print(clean_ini_list[j])                
              time_taken,grp_number,sgd_classifier=sgd_clf(j)
              print(f'[Info] Time taken to train group member "{grp_number}": {time_taken}')
              
              df_list = grp_input_df[i]['Clean_Vendor_Name'].values
              #print(df_list)
              frame=[]
              pred_st_time =  datetime.now()
              for k in tqdm(df_list):
                  predict_val = pd.DataFrame(sgd_classifier.predict(count_vect.transform([str(k)])))
                  frame.append(predict_val)
              inter_frame = pd.concat(frame).reset_index()
              inter_frame.drop('index',axis=1,inplace=True)
              frame_df_list= pd.DataFrame(df_list)
              row_val= pd.concat([frame_df_list, inter_frame], axis = 1) # try global here
              pred_end_time =  datetime.now()
              pred_time_taken = pred_end_time - pred_st_time
              pred_time_taken = time_util(pred_time_taken)
              print(f'[Info] Time taken to predict {grp_number}: {pred_time_taken}')
              print('-----------------------------------------------------------------------------------------------')
              final_frame.append(row_val)  
  final_output = pd.concat(final_frame)
  final_output.reset_index(inplace=True)
  final_output.drop('index',axis=1,inplace=True)
  final_output.columns=['Vendor_names','New_Supplier_Pred']
  final_output['New_Supplier_Pred'] = final_output['New_Supplier_Pred'].apply(lambda x: x.upper())
  final_output['New_Supplier_Pred'] = final_output['New_Supplier_Pred'].apply(lambda x: x.upper())
  clean_main_df.sort_values(by=['Clean_Vendor_Name'])
  final_output.drop_duplicates(inplace=True)
    
  merged_final = pd.merge(final_output,inp_df_Old,left_on='Vendor_names',right_on='Clean_Vendor_Name',how='left')   
      ##merged_final.drop_duplicates(subset=['Vendor name'],inplace=True)
  merged_final.drop(columns=['Vendor_names','Vendor_name_Initials'],axis=1,inplace=True)
  merged_final = merged_final[['DataSource','Vendor number','Vendor name','New_Supplier_Pred','Clean_Vendor_Name','crisp_vendor_name','Vendor_Flag', 'Spend (USD)','Segment','Primary Category']]
  #clean_main_df_test.drop_duplicates(inplace=True) #uncomment this and drop duplicates from clean_main_df if needed    
  merged_final =  pd.merge(merged_final,clean_main_df[['DataSource',	'Vendor number','crisp_vendor_name','New Supplier']],left_on=['DataSource','Vendor number','crisp_vendor_name'],right_on=['DataSource','Vendor number','crisp_vendor_name'],how='left') # previously used 'Clean_Vendor_Name' and removed 'Clean_Vendor_Name' from clean_main_df col list
  merged_final.drop_duplicates(inplace=True) #subset=['Vendor name'],

      
  ##merged_final.drop(columns=['Clean_Vendor_Name','crisp_vendor_name'],axis=1,inplace=True)
  merged_final = merged_final[['DataSource','Vendor number','Vendor name','New_Supplier_Pred','New Supplier','Vendor_Flag','Spend (USD)','Segment','Primary Category']] 
  merged_final.columns=['DataSource','Vendor number','Vendor name','Model_pred_Supplier','Prev_Predicted_Supplier','Vendor_Flag','Spend (USD)','Segment','Primary Category']
  merged_final['Prev_Predicted_Supplier'] = merged_final['Prev_Predicted_Supplier'].apply(lambda x: str(x).upper())
  merged_final['Match_ratio']= merged_final.apply(lambda x: fuzz.ratio(str(x['Model_pred_Supplier']),str(x['Prev_Predicted_Supplier'])),axis=1)
  merged_final['Match_word_len'] = np.nan
  merged_final['True Match Flag']= ''
  merged_final.reset_index(drop=True,inplace=True)

  for i in tqdm(range(len(merged_final))):
      Match_wordLength(i)

  for i in tqdm(range(len(merged_final))):
    if (str(merged_final['Match_ratio'][i])=='100') & (merged_final['Match_word_len'][i] == 'Same Length'):
        merged_final['True Match Flag'].loc[i]= 'Yes'
    else:
        merged_final['True Match Flag'].loc[i]= 'No'

  merged_final.columns=['DataSource','Vendor number','Vendor name','Predicted Names/ Model Cleansed','Manual Harmonized','Vendor_Flag','Spend (USD)','Segment','Primary Category','Match Ratio','Match_word_len','True Match Flag']
  merged_final['DataSource']=merged_final['DataSource'].apply(ds_cleansing)

  merged_final= final_output_sup_name_Cleaning(merged_final,'Predicted Names/ Model Cleansed')
  merged_final= final_output_sup_name_Cleaning(merged_final,'Manual Harmonized')

  
  
  
  

Unique Vendor counts: 7261
old record counts: 8163
Again old record counts: 8163


  0%|          | 0/1 [00:00<?, ?it/s]

index value of clean_ini_list to be matched with inp_ini_list :  10
a
a
[Info] Time taken to train group member "a": 2 mins & 5 secs


  0%|          | 0/8163 [00:00<?, ?it/s]

[Info] Time taken to predict a: 107 mins & 22 secs
-----------------------------------------------------------------------------------------------


  0%|          | 0/8163 [00:00<?, ?it/s]

  0%|          | 0/8163 [00:00<?, ?it/s]

In [162]:
#merged_final[merged_final.duplicated()==True]
#merged_final1.drop_duplicates(inplace=True)

In [163]:
#merged_final

In [164]:
#clean_main_df_test = clean_main_df[clean_main_df.crisp_vendor_name.str.contains('48forty')]
#clean_main_df_test.drop_duplicates(inplace=True)
#clean_main_df_test[clean_main_df_test.crisp_vendor_name.str.contains('48forty')]

In [165]:
#merged_final[merged_final.crisp_vendor_name.str.contains('48forty')]

In [166]:
merged_final

Unnamed: 0,DataSource,Vendor number,Vendor name,Predicted Names/ Model Cleansed,Manual Harmonized,Vendor_Flag,Spend (USD),Segment,Primary Category,Match Ratio,Match_word_len,True Match Flag
0,Syteline IES,9992470-DEX,A & A ENGINEERING CIVIL AND STRUCTURAL ENGINEE...,A & A ENGINEERING CIVIL AND STRUCTURAL ENGINEE...,A & A ENGINEERING CIVIL AND STRUCTURAL ENGINEE...,Old,3450.000000,INDUSTRIAL,Undefined,100,Same Length,Yes
1,JDE CAM,177662,A & A MACHINE & DEVELOP,A & A MACHINE & DEVELOP,A & A MACHINE & DEVELOP,Old,294.793886,INDUSTRIAL,General Procurement,100,Same Length,Yes
2,SAP C11,30101537,A & A TROPHY HOUSE INC/DBA A&A AWARDS & PROMOT...,A&A AWARDS & PROMOTIONALS,A&A AWARDS & PROMOTIONALS,Old,0.000000,GTS,General Procurement,100,Same Length,Yes
3,Ariba,30101537-A10,A & A TROPHY HOUSE INC/DBA A&A AWARDS & PROMOT...,A&A AWARDS & PROMOTIONALS,A&A AWARDS & PROMOTIONALS,Old,0.000000,GTS,General Procurement,100,Same Length,Yes
4,Ariba,1099744-A9,A & A TROPHY HOUSE INC/A & A AWARDS & PROMOTIO...,A&A AWARDS & PROMOTIONALS,A&A AWARDS & PROMOTIONALS,Old,31207.080000,CORPORATE,General Procurement,100,Same Length,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...
8158,3rd Party Security,601127,Agam-Flame Service Center Ltd,IMAGE,YOUR IMAGE,Old,372.161686,GTS,General Procurement,67,Different Length,No
8159,3rd Party Security,601034,Agam Shivuk Net Ltd.,AGAM SHIVUK NET LTD.,AGAM SHIVUK NET LTD.,Old,5360.041475,GTS,General Procurement,100,Same Length,Yes
8160,SAP E03,175269,ANTIGON OF PARISIAN CO. EU,ANTIGON OF PARISIAN CO. EU,ANTIGON OF PARISIAN CO. EU,Old,0.000000,GTS,Undefined,100,Same Length,Yes
8161,SAP E03,173799,ARISTEIDIS BALOMENOS AND SIA OE,ARISTEIDIS BALOMENOS AND SIA OE,ARISTEIDIS BALOMENOS AND SIA OE,Old,0.000000,GTS,General Procurement,100,Same Length,Yes


In [202]:
merged_final[merged_final['Vendor name'] == 'ALL FREIGHT LOGISTICS']

Unnamed: 0,DataSource,Vendor number,Vendor name,Predicted Names/ Model Cleansed,Manual Harmonized,Vendor_Flag,Spend (USD),Segment,Primary Category,Match Ratio,Match_word_len,True Match Flag
7735,SAP E03,400374,ALL FREIGHT LOGISTICS,ALL FREIGHT LOGISTICS,ALL FREIGHT LOGISTICS,Old,0.0,GTS,Transportation,100,Same Length,Yes


In [168]:
no_of_dup_merged_records = merged_final[merged_final.duplicated()==True].shape[0] # this will give duplicates if any gets generated # subset=['Vendor name']
non_dup_merged_records = merged_final.shape[0] - no_of_dup_merged_records
print(f"No of duplicate records: {no_of_dup_merged_records} \nNo of Non duplicate records: {non_dup_merged_records}") #it is good to have "No of duplicate records:" as "0"


No of duplicate records: 0 
No of Non duplicate records: 8163


In [169]:
if no_of_dup_merged_records>0:
  merged_final.drop_duplicates(inplace=True) # Put if condition when no_of_dup_merged_records> 0, then do the drop #subset=['Vendor name'],

In [170]:
predict_df_len = merged_final.shape[0]
predict_df_len

8163

In [171]:
merged_final.isnull().sum().max()

0

In [172]:
# drop Nan from vendor names
count_na_merged_final = merged_final.isnull().sum().max()
print("Predicted Dataframe NA Count: ",count_na_merged_final)
if count_na_merged_final>0:
  na_dict  = merged_final.isnull().sum().to_dict()#:  #.max()>0
  for k,v in na_dict.items():
    if v>0:
      if (v == merged_final.isnull().sum().max()) and (k in ['DataSource',	'Vendor number',	'Vendor name']):
        print("The column with maximum NA counts: ",k)
        print("Info:   Dropping the NAs from the Dataframe:::::::::::")
        merged_final.dropna(inplace=True)
        merged_final.reset_index(inplace=True,drop=True)

#merged_final.dropna(subset = ["Vendor name"],inplace=True)  # here also put condition for NA then do Na and take the count as well

merged_final.sort_values("Spend (USD)", ascending=False,inplace=True)
merged_final.reset_index(drop=True,inplace=True)
validation_file = merged_final.copy()
validation_file.sort_values("Predicted Names/ Model Cleansed", ascending=True,inplace=True)
validation_file = validation_file[['Predicted Names/ Model Cleansed', 'Manual Harmonized', 'Vendor_Flag','DataSource', 'Vendor number', 'Vendor name','Spend (USD)','Segment','Primary Category', 'Match Ratio','Match_word_len','True Match Flag']]
validation_file.reset_index(drop=True,inplace=True)

Predicted Dataframe NA Count:  0


In [173]:
#inp_df_New_output.columns, validation_file.columns
#match_ratio_filter = 78

In [174]:
inp_df_New_output = inp_df_New_output[['Predicted Names/ Model Cleansed', 'Manual Harmonized', 'Vendor_Flag','DataSource', 'Vendor number', 'Vendor name','Spend (USD)','Segment','Primary Category', 'Match Ratio','Match_word_len','True Match Flag']]
merged_validation_file  = pd.concat([validation_file, inp_df_New_output])
merged_validation_file.sort_values("Spend (USD)", ascending=False,inplace=True)
merged_validation_file.reset_index(inplace=True,drop=True)
non_match_final_validation = merged_validation_file.copy()
non_match_final_validation = non_match_final_validation[(non_match_final_validation['Match Ratio']<00.00000000000)]#(non_match_final_validation['True Match Flag'] == 'No')] # include less than 90%
count_non_match_final_validation = non_match_final_validation.shape[0]

final_validation_file = merged_validation_file.copy()
final_validation_file = final_validation_file[(final_validation_file['True Match Flag'] != 'No') | (final_validation_file['Match Ratio']>=00.000000000000000000000) | (final_validation_file['True Match Flag'] == 'Yes')] # 90% include
print(f"Count of records in Validation file with 'Match Ration' more than 90%: {final_validation_file.shape[0]}\n"\
        f"Count of records with 'Match Ration' less than 90%: {count_non_match_final_validation}")

Count of records in Validation file with 'Match Ration' more than 90%: 9507
Count of records with 'Match Ration' less than 90%: 0


In [175]:
merged_validation_file['Spend (USD)']=merged_validation_file['Spend (USD)'].astype(float)
merged_validation_file = merged_validation_file[(merged_validation_file['True Match Flag'] != 'No') | (merged_validation_file['Match Ratio']>=00.000000000000)] # 90% include
merged_validation_file.reset_index(inplace=True,drop=True)
merged_validation_file['Spend (USD)']=merged_validation_file['Spend (USD)'].astype(float)
merged_validation_file = merged_validation_file.assign(totalSpend = merged_validation_file['Spend (USD)']).groupby('Predicted Names/ Model Cleansed').agg({'totalSpend':sum})
#merged_validation_file = merged_validation_file.assign(totalSpend = np.where(merged_validation_file['True Match Flag']!='No',merged_validation_file['Spend (USD)'],0)).groupby('Predicted Names/ Model Cleansed').agg({'totalSpend':sum})
merged_validation_file.sort_values("totalSpend", ascending=False,inplace=True)
merged_validation_file.reset_index(inplace=True)
merged_validation_file['totalSpend']=merged_validation_file['totalSpend'].astype(float)
merged_validation_file['Spend (USD)_cumulative'] = merged_validation_file['totalSpend'].cumsum()
merged_validation_file['Spend (USD)_cumulative']=merged_validation_file['Spend (USD)_cumulative'].astype(float)
merged_validation_file['totalSpend']=merged_validation_file['totalSpend'].astype(float)
merged_validation_file['Cumulative Rolling %'] = (merged_validation_file['Spend (USD)_cumulative']  / merged_validation_file['totalSpend'].sum())*100
merged_validation_file['Supplier %'] = (merged_validation_file['totalSpend']  / merged_validation_file['totalSpend'].sum())*100

merged_validation_file['Spend (USD)_cumulative'] = merged_validation_file['Spend (USD)_cumulative'].apply(lambda x: f'{x:.2f}')

merged_validation_file['Cumulative Rolling %'] = pd.to_numeric(merged_validation_file['Cumulative Rolling %'], downcast="float")
merged_validation_file['Cumulative Rolling %'] = merged_validation_file['Cumulative Rolling %'].apply(lambda x: f'{x:.2f}')

merged_validation_file['Supplier %'] = pd.to_numeric(merged_validation_file['Supplier %'], downcast="float")
merged_validation_file['Supplier %'] = merged_validation_file['Supplier %'].apply(lambda x: f'{x:.2f}')
merged_validation_file.reset_index(drop=True,inplace=True)


merged_validation_file['Group'] = ''
merged_validation_file['Cumulative Rolling %'] = pd.to_numeric(merged_validation_file['Cumulative Rolling %'], downcast="float")
merged_validation_file.loc[merged_validation_file['Cumulative Rolling %']<=80.0, ['Group']] = 1
merged_validation_file.loc[merged_validation_file['Cumulative Rolling %']>80.0, ['Group']] = 2
merged_validation_file.loc[merged_validation_file['Cumulative Rolling %']>90, ['Group']] = 3
merged_validation_file.loc[merged_validation_file['Cumulative Rolling %']>95, ['Group']] = 4
merged_validation_file.loc[merged_validation_file['Cumulative Rolling %']>98, ['Group']] = 5

merged_validation_file['Cumulative Rolling %'] = merged_validation_file['Cumulative Rolling %'].apply(lambda x: f'{float(x):.2f}')
merged_validation_file['totalSpend'] = merged_validation_file['totalSpend'].apply(lambda x: f'{float(x):.2f}')


In [176]:
merged_validation_file

Unnamed: 0,Predicted Names/ Model Cleansed,totalSpend,Spend (USD)_cumulative,Cumulative Rolling %,Supplier %,Group
0,ADI,98174717.43,98174717.43,10.27,10.27,1
1,ACCENTURE,83303109.68,181477827.11,18.98,8.71,1
2,AKF INTERNATIONAL,41529533.65,223007360.76,23.32,4.34,1
3,ARCELORMITTAL,35957979.98,258965340.74,27.08,3.76,1
4,APEX MANUFACTURING,26010552.93,284975893.66,29.80,2.72,1
...,...,...,...,...,...,...
6754,AUTOMATION R & D,-8965.00,956635805.54,100.05,-0.00,5
6755,ASSOCIATION OF NORTH AMERICAN TOOL/SUPPLIERS I...,-20137.15,956615668.39,100.05,-0.00,5
6756,AXIS COMMUNICATIONS,-21235.35,956594433.04,100.05,-0.00,5
6757,AVDEL FRANCE SAS,-29284.80,956565148.25,100.04,-0.00,5


In [177]:
final_validation_file =  pd.merge(final_validation_file,merged_validation_file[['Predicted Names/ Model Cleansed','totalSpend','Supplier %','Spend (USD)_cumulative','Cumulative Rolling %','Group']],left_on='Predicted Names/ Model Cleansed',right_on='Predicted Names/ Model Cleansed',how='left')
final_validation_file.sort_values("Predicted Names/ Model Cleansed", ascending=True,inplace=True)
final_validation_file.drop(columns=['Match Ratio', 'True Match Flag',	'Match_word_len'],inplace=True)
final_validation_file['Spend (USD)'] = final_validation_file['Spend (USD)'].apply(lambda x: f'{x:.2f}')
final_validation_file['Suggested Relationship Owner']=np.nan
final_validation_file['Correct/Incorrect']=np.nan
final_validation_file['Correct Revision'] = np.nan

final_validation_file = final_validation_file[['Predicted Names/ Model Cleansed', 'Manual Harmonized', 'Vendor_Flag',\
                                        'DataSource', 'Vendor number', 'Vendor name', 'Spend (USD)',\
                                        'totalSpend','Supplier %','Spend (USD)_cumulative', 'Cumulative Rolling %',\
                                        'Group','Segment','Primary Category','Suggested Relationship Owner','Correct/Incorrect','Correct Revision']]
final_validation_file.reset_index(drop=True,inplace=True)


In [178]:
#final_validation_file.tail(5)
#final_validation_file['Vendor name'].nunique()

In [179]:
final_validation_records = final_validation_file.shape[0]
final_validation_records

9507

In [180]:
#((326911619.44+85615877.26+206579069.43)/1806555966.77)*100

In [181]:
model_chamber_records = merged_final.shape[0]
model_chamber_records


8163

In [182]:
cleansed_records = inp_df_New_output.shape[0]
cleansed_records

1344

# Accuracy:

In [183]:
true_matched_records = merged_final[merged_final['True Match Flag'] == 'Yes'].shape[0]
true_matched_records

7648

In [184]:
model_accuracy = (true_matched_records/model_chamber_records)*100
model_accuracy

93.69104495896117

In [185]:
date = datetime.now().strftime("%Y_%b_%d")

In [186]:
final_validation_file.to_excel('Validation.xlsx',index=False,encoding='utf8')
files.download('Validation.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [187]:
# Send the ouput prredicted file to ouput folder
#merged_final.to_excel(f'C:/Users/PXC0909/OneDrive - Stanley Black & Decker/Desktop/SBD/Vendor Harmonization/VH_Data/Predicted/merged_final_{date}.xlsx',index=False,encoding='utf_8_sig')
merged_final.to_excel('Prediction.xlsx',index=False,encoding='utf8')
files.download('Prediction.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [188]:
# Send the new/ Ambiguous vendor output file to conformation folder
#inp_df_New_output.to_excel(f'C:/Users/PXC0909/OneDrive - Stanley Black & Decker/Desktop/SBD/Vendor Harmonization/VH_Data/Cleansed/Cleansed_Vendors_{date}.xlsx',index=False,encoding='utf_8_sig')
inp_df_New_output.to_excel(f'Cleansed_Vendors.xlsx',index=False,encoding='utf8')
files.download('Cleansed_Vendors.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [189]:
#inp_df_New_output.head()

In [190]:
#merged_final.shape

In [191]:
process_end_time =  datetime.now()

In [192]:
process_time_taken = process_end_time - process_start_time
process_time_taken = time_util(process_time_taken)

In [193]:
print(f'Time taken to execute complete script: {process_time_taken}')

Time taken to execute complete script: 129 mins & 25 secs


In [194]:
#mytext = 'The program is executed' #+str(process_time_taken)
#language = 'en'
#myobj = gTTS(text=mytext, lang=language, slow=False)
#myobj.save("time_taken.mp3")
#os.system("time_taken.mp3")

In [195]:
# Code to get the full information on data

# Information of Raw master data:::
# No of master files used:
print('No of master files/ file used: ',no_of_files)

# Total no of records in master data (including all type of records)
print("Total no of records: ", length)

# Total no of null records (repeatative cleaning)
tot_no_of_null_records = no_of_null_records1 + no_of_null_records2 + no_of_null_records3 
print("Total no of Null records: ",tot_no_of_null_records)

# No of bad/uncleaned master data:
print(f"No of bad/filtered/uncleaned master data: ", no_of_noise_flags)

# Total no of dropped records
no_of_dropped_records = tot_no_of_null_records + no_of_noise_flags
print("Total no of records removed: ",no_of_dropped_records)

# Total no of clean records:
no_of_clean_records =  length - no_of_dropped_records
print("No of Clean records:", no_of_clean_records)

# Total no of unique bad/uncleaned:
tot_no_of_unique_dirty_records = no_of_unique_dirty_records
print("No of (non-duplicate) records to clean: ", tot_no_of_unique_dirty_records)

# Total no of master records in use
print("No of records used as Master data: ", tot_no_of_clean_records)

# Percentage of unique bad/uncleaned records
print("Percentage of Foreign/ Dirty records: ", per_irr)


No of master files/ file used:  1
Total no of records:  228994
Total no of Null records:  121233
No of bad/filtered/uncleaned master data:  0
Total no of records removed:  121233
No of Clean records: 107761
No of (non-duplicate) records to clean:  0
No of records used as Master data:  374639
Percentage of Foreign/ Dirty records:  0.000


In [196]:
# Information of Input data:::
# No of Input files used
print('No of input files used: ', tot_files)
tot_files

# Total no of records in input data (including all type of records)
print('No of records in input data: ', no_of_raw_inp_records)

# Unique Flags:
print("Unique Flags are: ",unique_vendor_flag(n))

# Duplicate records:
print("No of duplicate records: ",count_dup)

# No of Null records
tot_no_of_null_inp_records = no_of_null_inp_records1 + no_of_null_inp_records2 
print('Total no of null records to drop: ', tot_no_of_null_inp_records)

# Foreign counts:
if (foreign_count == 0) and (foreign_count_raw>0):
  print(f'Count of Foreign name/names Translated: {foreign_count_raw}')

# No of new records:
no_of_new_records = new_records
print('No of "New" records: ',no_of_new_records)


# No of old records:
no_of_old_records = old_records
print('No of "Old" records: ',no_of_old_records)

# No of fit old records:
no_of_fit_old_records
#print('No of fit old records: ', no_of_fit_old_records)

# No of unfit old records:
#no_of_unfit_old_records
#print('No of unfit old records: ', no_of_unfit_old_records)

# total no of clean input records
print('Total no of clean inp records: ',tot_no_of_clean_inp_records)

# total no of Unique vendors
print('Total no of Unique vendors: ',tot_no_of_unique_vendors)


# After Run:
# Total no of records as output from Model chamber:
print("No of records predicted: ", model_chamber_records)

# Total no of records as output from cleansing chamber:
print("No of records cleansed: ", cleansed_records)

# For Validaton file:
print("No of records in Validation file having 'Match Ratio' more than 0%: ", final_validation_records)
print("No of records not in Validation file having 'Match Ratio' less than 0%: ", count_non_match_final_validation)

# Model Accuracy:
print("Model accuracy: ",model_accuracy)

No of input files used:  1
No of records in input data:  9507
Unique Flags are:  ('New', 'Old')
No of duplicate records:  0
Total no of null records to drop:  0
Count of Foreign name/names Translated: 92
No of "New" records:  1344
No of "Old" records:  8163
Total no of clean inp records:  9507
Total no of Unique vendors:  8491
No of records predicted:  8163
No of records cleansed:  1344
No of records in Validation file having 'Match Ratio' more than 0%:  9507
No of records not in Validation file having 'Match Ratio' less than 0%:  0
Model accuracy:  93.69104495896117


# End of Script

# Below snippets are for testing purpose only

In [None]:
"""
import unicodedata
vn = "ł"
def norm_vendorNames1(vname):
  normalized = unicodedata.normalize('NFD', vname)
  norm_vname = u"".join([c for c in normalized if not unicodedata.combining(c)])
  norm_vname = trans_name(norm_vname)
  return(norm_vname)

print(norm_vendorNames1(vn))
danish_text_raw = 'ø' # here you would load your text
print(danish_text_raw) # returns string
danish_text = danish_text_raw.encode('utf-8').decode() 
print(danish_text)
def to_unicode(in_string):
  if isinstance(in_string,str):
    out_string = in_string.encode().decode('utf-8')
  elif isinstance(in_string,unicode):
    out_string = in_string
  else:
    raise TypeError('not stringy')
  return out_string
to_unicode(danish_text_raw)
import unicodedata as ud

def rmdiacritics(name):
    '''
    Return the base character of char, by "removing" any
    diacritics like accents or curls and strokes and the like.
    '''
    char_list = []
    for char in name:
      desc = ud.name(char)
      #print(desc)
      cutoff = desc.find(' WITH ')
      #print(cutoff)
      if cutoff == -1:
        char_list.append(char)
      elif cutoff != -1:
        desc = desc[:cutoff]
        #print(desc)
        try:
          char = ud.lookup(desc)
          #print(char)
          char_list.append(char)
        except KeyError:
            pass  # removing "WITH ..." produced an invalid name
    
    return ''.join(char_list)

rmdiacritics('BYMILJØETATEN')
!pip install unidecode
from unidecode import unidecode
from unidecode import unidecode
text = "E & Y K÷NYVVIZSGßLO KFT."
print(unidecode(text))
def decode_vnames(vname):
  return(unidecode(vname))
#decode_vnames(s)
decode_vnames('永恒力合力工业车辆租赁（上海）有限公司')
'THEO’S PRESTIGE PAINT & PANE'
s = 'Andr\xE9'
s.encode().decode('utf-8', 'replace')
u = s.encode('latin-1','replace').decode('ascii','replace')#.encode('utf-8', 'replace')#.decode('ascii','replace')
f= u.encode(encoding='utf-8', errors='strict')
u
d = u.encode('utf-8','replace').decode('utf-8','replace')
d
g = bytes(s, 'utf-8')
g= g.decode('ascii','replace')
g
type(f),f
h = f.decode('utf-8','replace').encode().decode('utf-8','replace')
h.encode().decode('utf-8', 'replace')
"""

'\nimport unicodedata\nvn = "ł"\ndef norm_vendorNames1(vname):\n  normalized = unicodedata.normalize(\'NFD\', vname)\n  norm_vname = u"".join([c for c in normalized if not unicodedata.combining(c)])\n  norm_vname = trans_name(norm_vname)\n  return(norm_vname)\n\nprint(norm_vendorNames1(vn))\ndanish_text_raw = \'ø\' # here you would load your text\nprint(danish_text_raw) # returns string\ndanish_text = danish_text_raw.encode(\'utf-8\').decode() \nprint(danish_text)\ndef to_unicode(in_string):\n  if isinstance(in_string,str):\n    out_string = in_string.encode().decode(\'utf-8\')\n  elif isinstance(in_string,unicode):\n    out_string = in_string\n  else:\n    raise TypeError(\'not stringy\')\n  return out_string\nto_unicode(danish_text_raw)\nimport unicodedata as ud\n\ndef rmdiacritics(name):\n    \'\'\'\n    Return the base character of char, by "removing" any\n    diacritics like accents or curls and strokes and the like.\n    \'\'\'\n    char_list = []\n    for char in name:\n      de