# Table of contents
1. Function for initial EDA (initial_eda(df))
2. Function to add target column (add_target_column(df))
3. Function for train/test split (train_test_split(df))
4. Function to remove stopwords (remove_stopwords(s))
5. Function for lemmatization (perform_lemmatization(s))
6. Function to remove single character words (remove_single_char_words(s))
7. Function for RF model (Random_forest_CV(X_train, Y_train))
8. Function for LR model (Logistic_Regression_CV(X_train,Y_train))

9. Function to add fuzzy score
10. Function for LR predict
11. Function for RF predict

In [19]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_style("whitegrid")

In [20]:
# df= pd.read_csv('/data/DS_INTERN/data/RAW_DATA/train_test_data.csv', low_memory=False)

# df1 = add_target_column(df)

# train_df, test_df = train_test_split(df1)

In [21]:
def initial_eda(df):
    """Print a quick structural summary of a DataFrame.

    Prints the dimensions, the total number of missing values and one line
    per column with its name, dtype, distinct-value count and NA count.
    When ``df`` is not a ``pandas.DataFrame`` a short message is printed
    instead.

    Args:
        df: Object to summarise; expected to be a ``pandas.DataFrame``.

    Returns:
        None. Everything is written to stdout.
    """
    if not isinstance(df, pd.DataFrame):
        print("Expect a DataFrame but got a %15s" % (type(df)))
        return

    na_per_column = df.isna().sum()
    print("Dimensions : %d rows, %d columns" % (df.shape[0], df.shape[1]))
    print("Total NA Values : %d " % (na_per_column.sum()))
    print("%38s %10s     %10s %10s" % ("Column Name", "Data Type", "#Distinct", "NA Values"))

    # Walk the per-column summaries positionally. Indexing these Series with
    # a bare integer is label-based whenever the column labels are integers,
    # which selects the wrong entry or raises KeyError.
    for name, dtype, n_distinct, n_missing in zip(
        df.columns, df.dtypes, df.nunique(), na_per_column
    ):
        print("%38s %10s   %10s %10s" % (name, dtype, n_distinct, n_missing))

In [22]:
def add_target_column(df):
    """Add a ``TARGET`` column derived from the revised category and variety.

    The column is written onto ``df`` itself (the input is modified in place)
    and the same DataFrame is returned. Rows where both ``REVISED_CATEGORY``
    and ``REVISED_VARIETY`` are present get ``"<category>_<variety>"``.

    Args:
        df: DataFrame containing ``REVISED_CATEGORY`` and ``REVISED_VARIETY``.

    Returns:
        The same ``df`` object with the ``TARGET`` column added.
    """
    # Masks of rows where the category / the variety is missing.
    m1 = df['REVISED_CATEGORY'].isna()
    m2 = df['REVISED_VARIETY'].isna()

    # NOTE(review): np.select takes the value of the FIRST condition that is
    # true, so the third condition (m1 & m2) can never be chosen because m1
    # already matches those rows. In addition, where m1 is true the selected
    # value is the REVISED_CATEGORY entry (the missing one), and where only
    # m2 is true it is the REVISED_VARIETY entry (also the missing one), so
    # any row with either field missing ends up with a missing TARGET.
    # The choices look swapped relative to the conditions - confirm intent
    # before changing, since later steps split TARGET on '_'.
    df['TARGET'] = np.select([m1, m2, m1 & m2], 
                            [df['REVISED_CATEGORY'], df['REVISED_VARIETY'], np.nan], 
                            default=df['REVISED_CATEGORY'] + '_' + df['REVISED_VARIETY'])
    
    return df

In [2]:
def train_test_split(df):
    """Split ``df`` into labelled and unlabelled rows using ``TARGET``.

    Note that this name is the same as scikit-learn's ``train_test_split``;
    this function does not sample, it only separates rows by whether
    ``TARGET`` is present.

    Args:
        df: DataFrame with a ``TARGET`` column.

    Returns:
        A ``(train_df, test_df)`` tuple: rows with a ``TARGET`` value, and
        rows whose ``TARGET`` is missing.
    """
    target_missing = df['TARGET'].isna()
    labelled_rows = df.dropna(subset=['TARGET'])
    unlabelled_rows = df[target_missing]
    return labelled_rows, unlabelled_rows

In [2]:
def get_train_data(df):
    """Return the rows of ``df`` that have a ``TARGET`` value.

    Args:
        df: DataFrame with a ``TARGET`` column.

    Returns:
        A new DataFrame without the rows whose ``TARGET`` is missing.
    """
    return df.dropna(subset=['TARGET'])

In [1]:
def get_test_data(df):
    """Return the rows of ``df`` whose ``TARGET`` value is missing.

    Args:
        df: DataFrame with a ``TARGET`` column.

    Returns:
        The subset of ``df`` where ``TARGET`` is NA.
    """
    target_missing = df['TARGET'].isna()
    return df[target_missing]

In [4]:
def perform_preprocessing(df):
    """Run the full text-cleaning chain over ``df['FEAT1']``.

    The column is overwritten after every step, so ``df`` is modified in
    place; the same DataFrame is also returned.

    Args:
        df: DataFrame with a text column named ``FEAT1``.

    Returns:
        ``df`` with ``FEAT1`` replaced by its cleaned version.
    """
    # Applied strictly in this order; each step receives the previous output.
    cleaning_steps = (
        remove_special_characters,
        perform_spell_correction_manual,
        perform_spell_correction_walmart,
        remove_stopwords,
        remove_numbers,
        perform_lemmatization,
        remove_two_and_single_char_words,
        remove_noise_words,
    )

    for step in cleaning_steps:
        df['FEAT1'] = pd.Series(step(df['FEAT1']), index=df.index)

    return df

In [3]:
def perform_preprocessing_series(s):
    """Run the full text-cleaning chain over a Series of strings.

    Args:
        s: Series of text values.

    Returns:
        The Series produced by applying every cleaning step in order.
    """
    # Applied strictly in this order; each step receives the previous output.
    cleaning_steps = (
        remove_special_characters,
        perform_spell_correction_manual,
        perform_spell_correction_walmart,
        remove_stopwords,
        remove_numbers,
        perform_lemmatization,
        remove_two_and_single_char_words,
        remove_noise_words,
    )

    cleaned = s
    for step in cleaning_steps:
        cleaned = step(cleaned)

    return cleaned

In [2]:
def remove_special_characters(s):
    """Replace every run of non-alphanumeric characters with one space.

    Args:
        s: Series of strings.

    Returns:
        Series where each run of characters outside ``a-z``, ``A-Z`` and
        ``0-9`` has been collapsed into a single space.
    """
    import re

    non_alphanumeric_run = re.compile(r"[^a-zA-Z0-9]+")

    def _blank_out(text):
        return non_alphanumeric_run.sub(' ', text)

    return s.apply(_blank_out)

In [25]:
def remove_stopwords(s):
    '''Drop English stopwords from every string in the series.

    Each value is lower-cased and split on whitespace, tokens found in
    NLTK's English stopword list are discarded, and the remaining tokens
    are upper-cased and joined with single spaces.

    The NLTK download call is not made here, so the 'stopwords' corpus
    presumably has to be available already.
    '''
    from nltk.corpus import stopwords

    # A set gives constant-time membership checks for every token.
    english_stopwords = set(stopwords.words('english'))

    def _strip_stopwords(text):
        kept = [token.upper() for token in text.lower().split()
                if token not in english_stopwords]
        return ' '.join(kept)

    return s.apply(_strip_stopwords)

In [26]:
def remove_numbers(s):
    """Delete every ASCII digit from each string in the series.

    Args:
        s: Series of strings.

    Returns:
        Series with the characters ``0``-``9`` removed; all other
        characters, including surrounding whitespace, are left untouched.
    """
    import re

    digit = re.compile('[0-9]')

    def _strip_digits(text):
        return digit.sub("", text)

    return s.apply(_strip_digits)

In [27]:
def remove_noise_words(s):
    """Remove known noise tokens from every string in the series.

    Each value is split on whitespace and tokens that exactly match an
    entry of the noise vocabulary (case-sensitive) are dropped; the rest
    are joined back with single spaces.

    Args:
        s: Series of strings.

    Returns:
        Series with the noise tokens removed.
    """
    # NOTE(review): entries containing a space (e.g. 'DO NOT USE') can never
    # equal a single whitespace-separated token, so they currently have no
    # effect. They are kept verbatim; confirm whether phrase removal was
    # intended.
    noise_vocabulary = frozenset([
        'UNASSIGNED', 'DOTCOM ONLY', 'DOTCOM', 'DOTS COM', 'DSV',
        'ONLINE ONLY', 'ONLINE', 'DO NOT USE', 'STATE FEES',
        'STATE FEE', 'REDUCED PROGRAM', 'REDUCED',
        'ITEM', 'NON TAXABLE', 'TAXABLE',
        'DELETE', 'PROMO', 'EMPTY FINELINE',
        'EMPTY FINELINES', 'TEMP', 'XTEMP',
        'CVP', 'CUSTOMER VALUE PROGRAM',
        'FINELINE', 'UNKNOWN', 'DESCRIPTION',
        'WALMART', 'NULL', 'NA', 'DOT COM',
        'PR', 'HI', 'AK', 'OPEN', 'BLANK',
        'PUERTO', 'RICO', 'HAWAII', 'ALASKA',
        'AND', 'THE', 'IC', 'MERCHANDISE',
        'TO BE DELETED1', 'TO BE DELETED', 'DELETED',
        'DISCOUNT', 'COUPON', 'FULL WM', 'REVENUE',
        'DEPT', 'UNTRANSLATABLE', 'ONLY',
        '1ST', '2ND', 'MD', 'MARKDOWN', 'UNBRANDED',
    ])

    def _drop_noise(text):
        kept = [token for token in text.split() if token not in noise_vocabulary]
        return ' '.join(kept)

    return s.apply(_drop_noise)


In [28]:
def perform_spell_correction_manual(s):
    """Apply the hand-maintained abbreviation/spelling fixes to each string.

    Every value is split on whitespace; tokens that exactly match a key of
    the correction table (case-sensitive) are replaced by the mapped word
    and the tokens are joined back with single spaces.

    Args:
        s: Series of strings.

    Returns:
        Series with the known misspellings replaced.
    """
    corrections = {
        'ALMD': 'ALMOND', 'ARTICHOKES': 'ARTICHOKE', 'BACN': 'BACON', 'BLEU': 'BLUE',
        'CARTN': 'CARTON', 'CCANDY': 'CANDY', 'CARNS': 'CARN',
        'CHICKN': 'CHICKEN', 'CHILIES': 'CHILI', 'CHNKS': 'CHUNK', 'CHNKY': 'CHUNKY',
        'CHOC': 'CHOCOLATE', 'CHOPPD': 'CHOPPED',
        'CLEMS': 'CLEM', 'COBB': 'COB', 'COKCATAIL': 'COCKTAIL', 'CONCORD': 'CONCORDE',
        'CORROGATED': 'CORRUGATED',
        'CRSP': 'CRISP', 'ENG': 'ENGLISH', 'ELEVTE': 'ELEVATE', 'FLVRD': 'FLAVORED',
        'FLAVRD': 'FLAVORED', 'JUICI': 'JUICE', 'JUICING': 'JUICE', 'MINNEOLAS': 'MINNEOLA',
        'PINKLDY': 'PINKLADY',
        'SUNFLWER': 'SUNFLOWER', 'SUPERSEEDZ': 'SUPERSEEDS', 'TOVS': 'TOV',
        'TROPICALS': 'TROPICAL',
    }

    def _correct(text):
        fixed_tokens = []
        for token in text.split():
            fixed_tokens.append(corrections.get(token, token))
        return ' '.join(fixed_tokens)

    return s.apply(_correct)


In [29]:
def perform_spell_correction_walmart(s, corrections_path='all_corrections_new.csv',
                                     sbu='FOOD_PRODUCE & FLOWERS'):
    """Apply the corrections listed in a CSV file to each string.

    The CSV is expected to contain the columns ``SBU_SP``, ``key`` and
    ``value``. Only rows whose ``SBU_SP`` equals ``sbu`` are used. Every
    input value is split on whitespace, tokens equal to a ``key`` are
    replaced by the matching ``value`` and the tokens are joined back with
    single spaces.

    Args:
        s: Series of strings.
        corrections_path: Location of the corrections CSV. Defaults to the
            file name that was previously hard-coded.
        sbu: ``SBU_SP`` value selecting which corrections apply. Defaults to
            the value that was previously hard-coded.

    Returns:
        Series with the listed corrections applied.
    """
    corrections = pd.read_csv(corrections_path, low_memory=False)
    corrections = corrections.loc[corrections['SBU_SP'] == sbu, ['key', 'value']]

    # A row with a missing key or replacement would put NaN (a float) in the
    # lookup and make ' '.join fail for any text containing that key.
    corrections = corrections.dropna(subset=['key', 'value'])
    lookup = dict(zip(corrections['key'], corrections['value']))

    def _correct(text):
        return ' '.join(lookup.get(token, token) for token in text.split())

    return s.apply(_correct)

In [30]:
def remove_single_char_words(s):
    '''Delete stand-alone single letters (C, N, K, ...) from each string.

    Only ASCII letters bounded by word boundaries are removed, so single
    digits such as 2, 3 or 4 are kept. The whitespace around a removed
    letter is left in place.
    '''
    import re

    lone_letter = re.compile(r"\b[a-zA-Z]\b")

    def _drop_lone_letters(text):
        return lone_letter.sub("", text)

    return s.apply(_drop_lone_letters)

In [31]:
def remove_two_and_single_char_words(s):
    """Delete every word of one or two word-characters from each string.

    Word characters are whatever ``\\w`` matches (letters, digits and
    underscore), so short numbers are removed as well. The whitespace around
    a removed word is left in place.

    Args:
        s: Series of strings.

    Returns:
        Series with the short words removed.
    """
    import re

    short_word = re.compile(r'\b\w{1,2}\b')

    def _drop_short_words(text):
        return short_word.sub('', text)

    return s.apply(_drop_short_words)

In [32]:
def lem_greater_than_3(word):
    """Lemmatize ``word`` only when it is longer than three characters.

    Args:
        word: A single token.

    Returns:
        The WordNet lemma of ``word`` if it has more than three characters,
        otherwise ``word`` unchanged.
    """
    from nltk.stem.wordnet import WordNetLemmatizer

    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word) if len(word) > 3 else word

In [33]:
def perform_lemmatization(s):
    """Lemmatize the words longer than three characters in each string.

    Every value is split on whitespace and lower-cased; tokens with more
    than three characters are replaced by their WordNet lemma, shorter
    tokens are kept as they are. The tokens are then upper-cased and joined
    with single spaces.

    Args:
        s: Series of strings.

    Returns:
        Series of upper-cased, lemmatized strings.
    """
    from nltk.stem.wordnet import WordNetLemmatizer

    # One lemmatizer for the whole series; previously an instance was
    # created here but never used, while a new one was built for every word.
    lemmatizer = WordNetLemmatizer()

    def _lemmatize_text(text):
        lemmas = []
        for token in (word.lower() for word in text.split()):
            lemma = lemmatizer.lemmatize(token) if len(token) > 3 else token
            lemmas.append(lemma.upper())
        return ' '.join(lemmas)

    return s.apply(_lemmatize_text)

In [34]:
def perform_lemmatization_all(s):
    """Lemmatize every word of each string, regardless of length.

    Every value is split on whitespace and lower-cased, each token is
    replaced by its WordNet lemma, and the result is upper-cased and joined
    with single spaces.

    Args:
        s: Series of strings.

    Returns:
        Series of upper-cased, lemmatized strings.
    """
    from nltk.stem.wordnet import WordNetLemmatizer

    lemmatizer = WordNetLemmatizer()

    def _lemmatize_text(text):
        lowered = [word.lower() for word in text.split()]
        lemmas = [lemmatizer.lemmatize(token) for token in lowered]
        return ' '.join(lemma.upper() for lemma in lemmas)

    return s.apply(_lemmatize_text)

In [35]:
def Random_forest_CV(X_train, Y_train):
    """Cross-validate a TF-IDF + random-forest pipeline and print the scores.

    Runs 5-fold cross validation and prints the mean accuracy, micro
    precision and micro recall together with the elapsed wall-clock time.

    Args:
        X_train: Text samples passed to the TF-IDF vectorizer.
        Y_train: Labels for ``X_train``.

    Returns:
        None. Results are printed.
    """
    import time

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.model_selection import cross_validate
    from sklearn.pipeline import Pipeline

    started_at = time.time()

    model = Pipeline([
        ('vect', TfidfVectorizer()),
        ('clf', RandomForestClassifier(class_weight='balanced')),
    ])
    cv_results = cross_validate(
        model, X_train, Y_train,
        scoring=('accuracy', 'precision_micro', 'recall_micro'),
        cv=5,
    )

    mean_accuracy = cv_results['test_accuracy'].mean()
    mean_precision = cv_results['test_precision_micro'].mean()
    mean_recall = cv_results['test_recall_micro'].mean()

    print('Random Forest Classifier performance on cross validation')
    print("Accuracy : {:0.5f}".format(mean_accuracy))
    print("Precision_micro : {:0.5f}".format(mean_precision))
    print("Recall_micro : {:0.5f}".format(mean_recall))
    print('time', time.time() - started_at, '\n\n')

In [36]:
def Logistic_Regression_CV(X_train, Y_train):
    """Cross-validate a TF-IDF + logistic-regression pipeline and print the scores.

    Runs 5-fold cross validation and prints the mean accuracy, micro
    precision and micro recall together with the elapsed wall-clock time.

    Args:
        X_train: Text samples passed to the TF-IDF vectorizer.
        Y_train: Labels for ``X_train``.

    Returns:
        None. Results are printed.
    """
    import time

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_validate
    from sklearn.pipeline import Pipeline

    started_at = time.time()

    model = Pipeline([
        ('vect', TfidfVectorizer()),
        ('clf', LogisticRegression(class_weight='balanced')),
    ])
    cv_results = cross_validate(
        model, X_train, Y_train,
        scoring=('accuracy', 'precision_micro', 'recall_micro'),
        cv=5,
    )

    mean_accuracy = cv_results['test_accuracy'].mean()
    mean_precision = cv_results['test_precision_micro'].mean()
    mean_recall = cv_results['test_recall_micro'].mean()

    print('Logistic Regression Classifier performance on cross validation')
    print("Accuracy : {:0.5f}".format(mean_accuracy))
    print("Precision_micro : {:0.5f}".format(mean_precision))
    print("Recall_micro : {:0.5f}".format(mean_recall))
    print('time', time.time() - started_at, '\n\n')

In [37]:
def pipeline_LR():
    """Build an unfitted TF-IDF + logistic-regression pipeline.

    Returns:
        A scikit-learn ``Pipeline`` with the steps ``'vect'``
        (``TfidfVectorizer``) and ``'clf'`` (``LogisticRegression`` with
        balanced class weights).
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline

    vectorizer = TfidfVectorizer()
    classifier = LogisticRegression(class_weight='balanced')

    return Pipeline([('vect', vectorizer), ('clf', classifier)])

In [38]:
def pipeline_RF():
    """Build an unfitted TF-IDF + random-forest pipeline.

    Returns:
        A scikit-learn ``Pipeline`` with the steps ``'vect'``
        (``TfidfVectorizer``) and ``'clf'`` (``RandomForestClassifier`` with
        balanced class weights).
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import Pipeline

    vectorizer = TfidfVectorizer()
    classifier = RandomForestClassifier(class_weight='balanced')

    return Pipeline([('vect', vectorizer), ('clf', classifier)])

In [3]:
def pipeline_LR_HP(max_iter):
    """Build the tuned TF-IDF + logistic-regression pipeline (unfitted).

    The vectorizer ignores terms seen in fewer than 10 documents; the
    classifier uses an L1 penalty with the ``saga`` solver, ``C=250`` and
    balanced class weights.

    Args:
        max_iter: Maximum number of solver iterations.

    Returns:
        A scikit-learn ``Pipeline`` with the steps ``'vect'`` and ``'clf'``.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline

    vectorizer = TfidfVectorizer(min_df=10)
    classifier = LogisticRegression(
        C=250,
        penalty='l1',
        solver='saga',
        class_weight='balanced',
        max_iter=max_iter,
    )

    return Pipeline([('vect', vectorizer), ('clf', classifier)])

In [40]:
def pipeline_RF_HP():
    """Build the tuned TF-IDF + random-forest pipeline (unfitted).

    The vectorizer ignores terms seen in fewer than 10 documents; the forest
    uses 220 trees and balanced class weights.

    Returns:
        A scikit-learn ``Pipeline`` with the steps ``'vect'`` and ``'clf'``.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import Pipeline

    vectorizer = TfidfVectorizer(min_df=10)
    classifier = RandomForestClassifier(n_estimators=220, class_weight='balanced')

    return Pipeline([('vect', vectorizer), ('clf', classifier)])

In [41]:
def analysis_LR(pipe, train_df):
    """Report the strongest feature for each class of a fitted linear model.

    For every row of the classifier's coefficient matrix the feature with
    the largest absolute coefficient is looked up (the first one on ties),
    and the result is joined with the number of training rows per class.

    Args:
        pipe: Fitted pipeline-like object where ``pipe['vect']`` is the
            fitted vectorizer and ``pipe['clf']`` exposes ``classes_`` and
            ``coef_``. ``coef_`` must have one row per entry of
            ``classes_`` (the multi-class layout).
        train_df: Training DataFrame with a ``TARGET`` column.

    Returns:
        DataFrame with the columns ``classes``, ``most_imp_feat``, ``coeff``
        and ``No. of data``.
    """
    import pandas as pd

    class_counts = (
        train_df['TARGET']
        .value_counts()
        .rename_axis('classes')
        .reset_index(name='No. of data')
    )

    classifier = pipe['clf']
    vectorizer = pipe['vect']

    # Fetch the vocabulary once. Newer scikit-learn versions only provide
    # get_feature_names_out; older ones only get_feature_names.
    get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    feature_names = get_names()

    # Use the actual number of coefficient rows rather than a fixed class
    # count, so the function works for any number of classes.
    coefficients = np.asarray(classifier.coef_)
    strongest = np.abs(coefficients).argmax(axis=1)

    tt = pd.DataFrame(classifier.classes_, columns=['classes'])
    tt['most_imp_feat'] = [feature_names[position] for position in strongest]
    tt['coeff'] = coefficients[np.arange(coefficients.shape[0]), strongest]

    tt = pd.merge(tt, class_counts, how='left', on=['classes'])

    return tt

In [3]:
def pipeline_voting_hard(solver, penalty, random_state, max_iter):
    """Build an unfitted majority-vote ensemble of RF and LR text pipelines.

    Both members get their own TF-IDF vectorizer (``min_df=10``). The
    ``VotingClassifier`` is created with its default voting mode.

    Args:
        solver: Solver passed to ``LogisticRegression``.
        penalty: Penalty passed to ``LogisticRegression``.
        random_state: Seed passed to both classifiers.
        max_iter: Maximum iterations for ``LogisticRegression``.

    Returns:
        An unfitted ``VotingClassifier`` with the members ``'RF'`` and ``'LR'``.
    """
    from sklearn.ensemble import RandomForestClassifier, VotingClassifier
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline

    forest = RandomForestClassifier(
        n_estimators=220,
        class_weight='balanced',
        random_state=random_state,
    )
    logistic = LogisticRegression(
        C=250,
        penalty=penalty,
        solver=solver,
        class_weight='balanced',
        random_state=random_state,
        max_iter=max_iter,
    )

    # Member pipelines, each with an independent vectorizer.
    rf_member = Pipeline([('vect', TfidfVectorizer(min_df=10)), ('clf', forest)])
    lr_member = Pipeline([('vect', TfidfVectorizer(min_df=10)), ('clf', logistic)])

    # The ensemble is only assembled here; fitting is left to the caller.
    return VotingClassifier(estimators=[('RF', rf_member), ('LR', lr_member)])


In [2]:
def pipeline_voting_hard_3(solver, random_state, max_iter,kernel):
    """Build an unfitted majority-vote ensemble of RF, LR and SVM pipelines.

    Every member gets its own TF-IDF vectorizer (``min_df=10``). The
    ``VotingClassifier`` is created with its default voting mode.

    Args:
        solver: Solver passed to ``LogisticRegression`` (L1 penalty).
        random_state: Seed passed to all three classifiers.
        max_iter: Maximum iterations for ``LogisticRegression``.
        kernel: Kernel passed to ``SVC``.

    Returns:
        An unfitted ``VotingClassifier`` with the members ``'RF'``, ``'LR'``
        and ``'SVM'``.
    """
    from sklearn.ensemble import RandomForestClassifier, VotingClassifier
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC

    forest = RandomForestClassifier(
        n_estimators=220,
        class_weight='balanced',
        random_state=random_state,
    )
    logistic = LogisticRegression(
        C=250,
        penalty='l1',
        solver=solver,
        class_weight='balanced',
        max_iter=max_iter,
        random_state=random_state,
    )
    svm = SVC(class_weight='balanced', kernel=kernel, random_state=random_state)

    # Member pipelines, each with an independent vectorizer.
    rf_member = Pipeline([('vect', TfidfVectorizer(min_df=10)), ('clf', forest)])
    lr_member = Pipeline([('vect', TfidfVectorizer(min_df=10)), ('clf', logistic)])
    svm_member = Pipeline([('vect', TfidfVectorizer(min_df=10)), ('clf', svm)])

    # The ensemble is only assembled here; fitting is left to the caller.
    members = [('RF', rf_member), ('LR', lr_member), ('SVM', svm_member)]
    return VotingClassifier(estimators=members)


In [3]:
def pipeline_voting_soft(max_iter):
    """Build an unfitted soft-voting ensemble of RF and LR text pipelines.

    Both members get their own TF-IDF vectorizer (``min_df=10``); the
    ensemble is created with ``voting='soft'``.

    Args:
        max_iter: Maximum iterations for ``LogisticRegression``.

    Returns:
        An unfitted ``VotingClassifier`` with the members ``'RF'`` and ``'LR'``.
    """
    from sklearn.ensemble import RandomForestClassifier, VotingClassifier
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline

    logistic = LogisticRegression(
        C=250,
        penalty='l1',
        solver='saga',
        class_weight='balanced',
        max_iter=max_iter,
    )
    forest = RandomForestClassifier(n_estimators=220, class_weight='balanced')

    # Member pipelines, each with an independent vectorizer.
    lr_member = Pipeline([('vect', TfidfVectorizer(min_df=10)), ('clf', logistic)])
    rf_member = Pipeline([('vect', TfidfVectorizer(min_df=10)), ('clf', forest)])

    # The ensemble is only assembled here; fitting is left to the caller.
    members = [('RF', rf_member), ('LR', lr_member)]
    return VotingClassifier(estimators=members, voting='soft')

In [4]:
def pipeline_voting_soft_3(max_iter, kernel):
    """Build an unfitted soft-voting ensemble of RF, LR and SVM pipelines.

    Every member gets its own TF-IDF vectorizer (``min_df=10``); the
    ensemble is created with ``voting='soft'``.

    Args:
        max_iter: Maximum iterations for ``LogisticRegression``.
        kernel: Kernel passed to ``SVC``.

    Returns:
        An unfitted ``VotingClassifier`` with the members ``'RF'``, ``'LR'``
        and ``'SVM'``.
    """
    from sklearn.ensemble import VotingClassifier, RandomForestClassifier
    from sklearn.svm import SVC
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline

    # Build the individual models
    lr_member = Pipeline([('vect', TfidfVectorizer(min_df=10)),
                          ('clf', LogisticRegression(C=250, penalty='l1', solver='saga',
                                                     class_weight='balanced', max_iter=max_iter))])

    rf_member = Pipeline([('vect', TfidfVectorizer(min_df=10)),
                          ('clf', RandomForestClassifier(n_estimators=220, class_weight='balanced'))])

    # Soft voting averages predicted class probabilities, so every member
    # must offer predict_proba. SVC only does so when probability=True is
    # set before fitting; without it the ensemble cannot predict.
    svm_member = Pipeline([('vect', TfidfVectorizer(min_df=10)),
                           ('clf', SVC(class_weight='balanced', kernel=kernel, probability=True))])

    # List of (string, estimator) tuples
    estimators = [('RF', rf_member), ('LR', lr_member), ('SVM', svm_member)]

    # The ensemble is only assembled here; fitting is left to the caller.
    pipe = VotingClassifier(estimators=estimators, voting='soft')

    return pipe

In [44]:
def perform_stemming(s):
    """Stem every word of each string with the English Snowball stemmer.

    Every value is split on whitespace and lower-cased, each token is
    stemmed, and the stems are upper-cased and joined with single spaces.

    Args:
        s: Series of strings.

    Returns:
        Series of upper-cased, stemmed strings.
    """
    import nltk
    from nltk.stem.snowball import SnowballStemmer

    # The stemmer requires a language parameter.
    stemmer = SnowballStemmer(language='english')

    def _stem_text(text):
        lowered = [word.lower() for word in text.split()]
        stems = [stemmer.stem(token) for token in lowered]
        return ' '.join(stem.upper() for stem in stems)

    return s.apply(_stem_text)

In [45]:
def perform_spellcheck(s):
    """Spell-correct every word of each string with ``pyspellchecker``.

    Every value is split on whitespace and lower-cased, each token is
    replaced by the checker's suggested correction, and the result is
    upper-cased and joined with single spaces.

    Args:
        s: Series of strings.

    Returns:
        Series of upper-cased, spell-corrected strings.
    """
    from spellchecker import SpellChecker

    spell = SpellChecker()

    def _correct_text(text):
        corrected = []
        for token in (word.lower() for word in text.split()):
            suggestion = spell.correction(token)
            # correction() can return None when it has no candidate for the
            # token (recent pyspellchecker releases); keep the token as it
            # is in that case, since None has no .upper().
            if suggestion is None:
                suggestion = token
            corrected.append(suggestion.upper())
        return ' '.join(corrected)

    return s.apply(_correct_text)

In [46]:
def perform_textblob(s):
    """Spell-correct every word of each string with TextBlob.

    Every value is split on whitespace and lower-cased, each token is
    replaced by the first suggestion returned by ``Word.spellcheck()``, and
    the result is upper-cased and joined with single spaces.

    Args:
        s: Series of strings.

    Returns:
        Series of upper-cased, spell-corrected strings.
    """
    from textblob import Word

    def _best_suggestion(token):
        # spellcheck() yields (word, confidence) pairs; take the first word.
        return Word(token).spellcheck()[0][0]

    def _correct_text(text):
        lowered = [word.lower() for word in text.split()]
        suggestions = [_best_suggestion(token) for token in lowered]
        return ' '.join(suggestion.upper() for suggestion in suggestions)

    return s.apply(_correct_text)

In [1]:
def trust_factor(test_df):
    """Flag predictions whose category or variety also appears in the text.

    ``Predicted_target`` is split at its first underscore into ``category``
    and ``variety``. Both parts go through the manual and CSV-based spell
    corrections and lemmatization, and are then compared with ``FEAT1``
    using ``fuzz.partial_token_set_ratio``. A score of 80 or more sets the
    matching flag, and ``TF`` is 1 when either flag is set.

    ``test_df`` is modified in place; the columns ``category``, ``variety``,
    ``fuzzy_category``, ``fuzzy_variety``, ``TF_cat``, ``TF_var`` and ``TF``
    are added (in that order).

    Args:
        test_df: DataFrame with the columns ``FEAT1`` and
            ``Predicted_target``.

    Returns:
        The same ``test_df`` with the extra columns.
    """
    from fuzzywuzzy import fuzz

    # partition() always yields exactly (before, separator, after), so the
    # two-column assignment cannot fail when a label holds no underscore or
    # more than one; text after the first underscore all goes to `variety`.
    label_parts = test_df['Predicted_target'].str.partition('_')
    test_df['category'] = label_parts[0]
    test_df['variety'] = label_parts[2]

    # Same normalisation for both parts; the columns are independent.
    for part in ('category', 'variety'):
        normalised = perform_spell_correction_manual(test_df[part])
        normalised = perform_spell_correction_walmart(normalised)
        test_df[part] = perform_lemmatization(normalised)

    test_df['fuzzy_category'] = [
        fuzz.partial_token_set_ratio(text, label)
        for text, label in zip(test_df['FEAT1'], test_df['category'])
    ]
    test_df['fuzzy_variety'] = [
        fuzz.partial_token_set_ratio(text, label)
        for text, label in zip(test_df['FEAT1'], test_df['variety'])
    ]

    # Vectorised threshold instead of a row-wise apply.
    test_df['TF_cat'] = (test_df['fuzzy_category'] >= 80).astype(int)
    test_df['TF_var'] = (test_df['fuzzy_variety'] >= 80).astype(int)

    test_df['TF'] = np.where((test_df['TF_cat'] == 1) | (test_df['TF_var'] == 1), 1, 0)

    return test_df

In [48]:
def data_augmentation_simple_wordnet(train_df, n_per_original):
    """Augment ``train_df`` by swapping one random word for a WordNet synonym.

    For each generated copy, one word of ``FEAT1`` is picked at random and
    replaced by a synonym from WordNet; the resulting text is upper-cased.
    Rows for which no replacement is possible are skipped by returning None
    from the transformation function.

    Args:
        train_df: DataFrame with a text column named ``FEAT1``.
        n_per_original: Number of augmented copies requested per original
            row (passed to snorkel's ``ApplyOnePolicy``).

    Returns:
        The augmented DataFrame produced by snorkel's ``PandasTFApplier``;
        the policy is created with ``keep_original=True``.
    """
    import random
    import nltk
    from nltk.corpus import wordnet as wn

    from snorkel.augmentation import ApplyOnePolicy, PandasTFApplier
    from snorkel.augmentation import transformation_function

    nltk.download("wordnet", quiet=True)

    def get_synonyms(word):
        """Return the WordNet synonyms of ``word`` (excluding itself), sorted."""
        lemmas = set().union(*[s.lemmas() for s in wn.synsets(word)])
        names = set(l.name().lower().replace("_", " ") for l in lemmas) - {word}
        # Sorted so that the synonym picked below does not depend on set
        # iteration order, which can differ between interpreter runs.
        return sorted(names)

    @transformation_function()
    def tf_replace_word_with_synonym(x):
        """Try to replace a random word with a synonym."""
        words = [w.lower() for w in x['FEAT1'].split()]
        if not words:
            # Empty or whitespace-only text: there is nothing to pick from.
            return None

        idx = random.randrange(len(words))
        synonyms = get_synonyms(words[idx])
        if not synonyms:
            return None

        replaced = " ".join(words[:idx] + [synonyms[0]] + words[idx + 1:])
        x['FEAT1'] = ' '.join([w.upper() for w in replaced.split()])
        return x

    tf_policy = ApplyOnePolicy(n_per_original=n_per_original, keep_original=True)
    tf_applier = PandasTFApplier([tf_replace_word_with_synonym], tf_policy)

    train_df_augmented = tf_applier.apply(train_df)

    return train_df_augmented

In [1]:
def data_augmentation_simple_walmart(train_df, n_per_original):
    """Augment ``train_df`` using the synonym list in ``synonyms_v1.csv``.

    The CSV is expected to contain a ``WORD`` column and a ``SYNONYMS``
    column holding comma-separated alternatives. For each generated copy,
    one word of ``FEAT1`` is picked at random and replaced by a randomly
    chosen synonym from the first CSV row listing that word. Rows for which
    no replacement is possible are skipped by returning None from the
    transformation function.

    Args:
        train_df: DataFrame with a text column named ``FEAT1``.
        n_per_original: Number of augmented copies requested per original
            row (passed to snorkel's ``ApplyOnePolicy``).

    Returns:
        The augmented DataFrame produced by snorkel's ``PandasTFApplier``;
        the policy is created with ``keep_original=True``.
    """
    import random

    from snorkel.augmentation import ApplyOnePolicy, PandasTFApplier
    from snorkel.augmentation import transformation_function

    # csv file from Walmart
    syn_df = pd.read_csv('synonyms_v1.csv', sep=',')

    # Build the word -> synonyms lookup once instead of filtering the whole
    # frame for every word. Rows without a word or without synonyms are
    # dropped (a missing SYNONYMS cell is a float and has no .split), and
    # the first remaining row for each word wins.
    usable_rows = syn_df.dropna(subset=['WORD', 'SYNONYMS']).drop_duplicates(subset='WORD')
    synonym_lookup = dict(zip(usable_rows['WORD'], usable_rows['SYNONYMS']))

    def get_synonyms(word):
        """Return one randomly chosen synonym of ``word``, or [] if none is listed."""
        listed = synonym_lookup.get(word)
        if listed is None:
            return []
        return random.choice(listed.split(','))

    @transformation_function()
    def tf_replace_word_with_synonym(x):
        """Try to replace a random word with a synonym."""
        words = x['FEAT1'].split()
        if not words:
            # Empty or whitespace-only text: there is nothing to pick from.
            return None

        idx = random.randrange(len(words))
        synonym = get_synonyms(words[idx])
        if len(synonym) > 0:
            x['FEAT1'] = " ".join(words[:idx] + [synonym] + words[idx + 1:])
            return x
        return None

    tf_policy = ApplyOnePolicy(n_per_original=n_per_original, keep_original=True)
    tf_applier = PandasTFApplier([tf_replace_word_with_synonym], tf_policy)
    train_df_augmented = tf_applier.apply(train_df)

    return train_df_augmented

In [2]:
def data_augmentation_complex_walmart(train_df, n_per_original,sequence_length, verb, noun, adjective):
    """Augment ``train_df`` by replacing verbs, nouns or adjectives with synonyms.

    ``FEAT1`` is parsed with snorkel's ``SpacyPreprocessor``. Three
    transformation functions each pick a random token with the part-of-speech
    tag ``VERB``, ``NOUN`` or ``ADJ`` and replace it with a randomly chosen,
    upper-cased synonym from ``synonyms_v1.csv`` (columns ``WORD`` and
    comma-separated ``SYNONYMS``). The functions are combined with a
    ``MeanFieldPolicy``.

    Args:
        train_df: DataFrame with a text column named ``FEAT1``.
        n_per_original: Number of augmented copies requested per original row.
        sequence_length: Number of transformation functions applied in
            sequence for each copy.
        verb: Sampling probability for the verb replacement.
        noun: Sampling probability for the noun replacement.
        adjective: Sampling probability for the adjective replacement.

    Returns:
        The augmented DataFrame produced by snorkel's ``PandasTFApplier``;
        the policy is created with ``keep_original=True``.
    """
    # `random` is needed by get_synonym below; it was used without ever being
    # imported, which raised NameError as soon as a synonym was found.
    import random

    import nltk
    # NOTE(review): WordNet is not read in this function; the download is
    # kept in case other code relies on the corpus being fetched here.
    nltk.download("wordnet")
    from snorkel.augmentation import MeanFieldPolicy, PandasTFApplier
    from snorkel.augmentation import transformation_function
    from snorkel.preprocess.nlp import SpacyPreprocessor
    spacy = SpacyPreprocessor(text_field="FEAT1", doc_field="doc", memoize=False)

    # csv from Walmart
    syn_df = pd.read_csv('synonyms_v1.csv', sep=',')

    # Build the word -> synonyms lookup once instead of filtering the whole
    # frame for every token. Rows without a word or without synonyms are
    # dropped (a missing SYNONYMS cell is a float and has no .split), and
    # the first remaining row for each word wins.
    usable_rows = syn_df.dropna(subset=['WORD', 'SYNONYMS']).drop_duplicates(subset='WORD')
    synonym_lookup = dict(zip(usable_rows['WORD'], usable_rows['SYNONYMS']))

    def get_synonym(word, pos=None):
        """Return one randomly chosen synonym of ``word``, or [] if none is listed.

        ``pos`` is accepted for call compatibility but is not used.
        """
        listed = synonym_lookup.get(word)
        if listed is None:
            return []
        return random.choice(listed.split(','))

    def replace_token(spacy_doc, idx, replacement):
        """Replace token in position idx with replacement."""
        pieces = [spacy_doc[:idx].text, replacement, spacy_doc[1 + idx:].text]
        # Skip empty pieces so replacing the first or last token does not
        # leave a stray leading or trailing space.
        return " ".join(piece for piece in pieces if piece)

    def _replace_pos_with_synonym(x, pos_tag, pos_code):
        """Replace one random token tagged ``pos_tag``; return None if impossible."""
        candidate_idxs = [i for i, token in enumerate(x.doc) if token.pos_ == pos_tag]
        if not candidate_idxs:
            return None

        # Pick a random matching token to replace.
        idx = np.random.choice(candidate_idxs)
        synonym = get_synonym(x.doc[idx].text, pos=pos_code)

        # Without a usable synonym the transformation yields no new row.
        if not synonym:
            return None

        x.FEAT1 = replace_token(x.doc, idx, synonym.upper())
        return x

    @transformation_function(pre=[spacy])
    def replace_verb_with_synonym(x):
        return _replace_pos_with_synonym(x, "VERB", "v")

    @transformation_function(pre=[spacy])
    def replace_noun_with_synonym(x):
        return _replace_pos_with_synonym(x, "NOUN", "n")

    @transformation_function(pre=[spacy])
    def replace_adjective_with_synonym(x):
        return _replace_pos_with_synonym(x, "ADJ", "a")

    tfs = [
        replace_verb_with_synonym,
        replace_noun_with_synonym,
        replace_adjective_with_synonym,
    ]

    mean_field_policy = MeanFieldPolicy(
        len(tfs),
        sequence_length=sequence_length,
        n_per_original=n_per_original,
        keep_original=True,
        p=[verb, noun, adjective]
    )

    tf_applier = PandasTFApplier(tfs, mean_field_policy)
    train_df_augmented = tf_applier.apply(train_df)

    return train_df_augmented

In [2]:
def item1_desc_mapping(train_df, test_df):
    """Overwrite predictions for items whose training label is unambiguous.

    An ``ITEM1_DESC`` value is considered unambiguous when all of its
    training rows share exactly one distinct ``TARGET``. For every test row
    with such an ``ITEM1_DESC``, ``Predicted_target`` is set to that label.

    ``test_df`` is modified in place.

    Args:
        train_df: DataFrame with the columns ``ITEM1_DESC`` and ``TARGET``.
        test_df: DataFrame with the columns ``ITEM1_DESC`` and
            ``Predicted_target``.

    Returns:
        None.
    """
    # Distinct-label count and first label per item, in a single pass.
    per_item = train_df.groupby('ITEM1_DESC', sort=False)['TARGET'].agg(['nunique', 'first'])
    unambiguous_label = per_item.loc[per_item['nunique'] == 1, 'first']

    known_item = test_df['ITEM1_DESC'].isin(unambiguous_label.index)
    if not known_item.any():
        return

    # Assign with a single .loc call. The earlier form,
    # test_df.loc[row]['Predicted_target'] = ..., wrote to a temporary row
    # copy, so test_df itself was never updated.
    test_df.loc[known_item, 'Predicted_target'] = (
        test_df.loc[known_item, 'ITEM1_DESC'].map(unambiguous_label)
    )

    return

In [5]:
import pandas as pd
import numpy as np

from fuzzywuzzy import fuzz, process
# Import module for iteration
import itertools
# Import module for function development
from typing import Union, List, Tuple
# Import module for TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
# Import module for cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
# Import module for KNN
from sklearn.neighbors import NearestNeighbors

# String matching - TF-IDF
def build_vectorizer(
    clean: pd.Series,
    analyzer: str = 'char', 
    ngram_range: Tuple[int, int] = (1, 4), 
    n_neighbors: int = 1, 
    **kwargs
    ) -> Tuple:
    """Fit a TF-IDF vectorizer and a cosine nearest-neighbour index on ``clean``.

    Args:
        clean: Reference strings; the values are cast to unicode before
            fitting.
        analyzer: ``analyzer`` argument for ``TfidfVectorizer``.
        ngram_range: ``ngram_range`` argument for ``TfidfVectorizer``.
        n_neighbors: Default neighbour count for the ``NearestNeighbors``
            index.
        **kwargs: Extra keyword arguments forwarded to ``TfidfVectorizer``.

    Returns:
        A ``(vectorizer, nbrs)`` tuple: the fitted vectorizer and the fitted
        nearest-neighbour index.
    """
    tfidf = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, **kwargs)
    reference_texts = clean.values.astype('U')
    reference_matrix = tfidf.fit_transform(reference_texts)

    # Index the reference vectors for cosine-distance lookups.
    neighbour_index = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
    neighbour_index = neighbour_index.fit(reference_matrix)

    return tfidf, neighbour_index

# String matching - KNN
def tfidf_nn(
    messy, 
    clean, 
    n_neighbors = 1, 
    **kwargs
    ):
    """Find the nearest reference strings for each query string.

    A TF-IDF vectorizer and nearest-neighbour index are fitted on ``clean``
    through ``build_vectorizer``; ``messy`` is then vectorized and looked up.

    Args:
        messy: Query strings.
        clean: Reference strings.
        n_neighbors: Neighbours requested per query. It is capped at
            ``len(clean)``, because ``kneighbors`` rejects a request for
            more neighbours than there are fitted samples.
        **kwargs: Extra keyword arguments forwarded to ``build_vectorizer``.

    Returns:
        A ``(nearest_values, distances)`` tuple of arrays with one row per
        query and one column per neighbour.
    """
    n_neighbors = min(n_neighbors, len(clean))

    # Fit clean data and transform messy data
    vectorizer, nbrs = build_vectorizer(clean, n_neighbors = n_neighbors, **kwargs)

    # Cast the queries to unicode the same way the reference strings are
    # cast before fitting, so missing or non-string values are handled
    # identically on both sides.
    input_vec = vectorizer.transform(np.asarray(messy).astype('U'))

    # Determine best possible matches
    distances, indices = nbrs.kneighbors(input_vec, n_neighbors = n_neighbors)
    nearest_values = np.array(clean)[indices]
    return nearest_values, distances

def fn_get_ratio(ratio_type, partial=False):
    """Return the fuzzywuzzy scorer for a ratio type / partial combination.

    Args:
        ratio_type: ``'set'`` or ``'sort'``.
        partial: ``True`` for the partial variant of the scorer, ``False``
            for the full one.

    Returns:
        The matching ``fuzz`` scoring function.

    Raises:
        ValueError: If the combination is not one of the four supported
            ones. Previously the error text was returned as if it were a
            scorer, which only failed later when the "scorer" was called.
    """
    scorer_names = {
        ('set', False): 'token_set_ratio',
        ('set', True): 'partial_token_set_ratio',
        ('sort', False): 'token_sort_ratio',
        ('sort', True): 'partial_token_sort_ratio',
    }

    try:
        scorer_name = scorer_names[(ratio_type, partial)]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments.
        raise ValueError(
            "Please provide a valid combination of ratio_type and partial parameter"
        ) from None

    return getattr(fuzz, scorer_name)
    
# String matching - match fuzzy
def find_matches_fuzzy(
    row, 
    match_candidates,
    partial, 
    ratio_type,
    limit = 5
    
    ):
    """Score ``row`` against the candidates and keep the best matches.

    Args:
        row: Query string.
        match_candidates: Candidate strings to compare against.
        partial: Passed to ``fn_get_ratio`` to choose the scorer.
        ratio_type: Passed to ``fn_get_ratio`` to choose the scorer.
        limit: Maximum number of matches to keep.

    Returns:
        List of ``(row, candidate, score)`` tuples as ordered by
        ``process.extract``.
    """
    scorer = fn_get_ratio(partial = partial, ratio_type = ratio_type)

    # Candidates are passed as a position -> string mapping.
    indexed_candidates = dict(enumerate(match_candidates))
    extracted = process.extract(
        row,
        indexed_candidates,
        scorer = scorer,
        limit = limit,
    )

    matches = []
    for hit in extracted:
        matches.append((row, hit[0], hit[1]))
    return matches

def fuzzy_nn_match1(
    messy,
    clean,
    column,
    col,partial, ratio_type,
    n_neighbors = 100,
    limit = 1,  **kwargs):
    """Match each query string against its TF-IDF nearest neighbours.

    The ``n_neighbors`` closest reference strings are retrieved with
    ``tfidf_nn`` and then re-scored with ``find_matches_fuzzy``.

    Args:
        messy: Query strings.
        clean: Reference strings.
        column: Not used by this function.
        col: Not used by this function.
        partial: Scorer option forwarded to ``find_matches_fuzzy``.
        ratio_type: Scorer option forwarded to ``find_matches_fuzzy``.
        n_neighbors: Number of TF-IDF neighbours retrieved per query.
        limit: Number of fuzzy matches kept per query.
        **kwargs: Extra keyword arguments forwarded to ``tfidf_nn``.

    Returns:
        A list with one entry per query; each entry is the list returned by
        ``find_matches_fuzzy`` for that query.
    """
    neighbour_texts, _ = tfidf_nn(messy, clean, n_neighbors, **kwargs)

    all_matches = []
    for position, query in enumerate(messy):
        all_matches.append(
            find_matches_fuzzy(query, neighbour_texts[position], partial, ratio_type, limit)
        )

    return all_matches

def fuzzy_tfidf_feat1(train_df, test_df, partial, ratio_type):
    """Build a table of the best fuzzy ``FEAT1`` matches between test and train.

    Args:
        train_df: DataFrame whose ``FEAT1`` column holds the reference text.
        test_df: DataFrame whose ``FEAT1`` column holds the query text.
        partial: Scorer option forwarded to ``fuzzy_nn_match1``.
        ratio_type: Scorer option forwarded to ``fuzzy_nn_match1``.

    Returns:
        DataFrame with the columns ``FEAT1_test``, ``FEAT1_train`` and
        ``Ratio`` and a fresh 0..n-1 index.
    """
    per_query_matches = fuzzy_nn_match1(
        test_df['FEAT1'], train_df['FEAT1'], 'FEAT1', 'Result',
        partial=partial, ratio_type=ratio_type,
    )

    output_columns = ['FEAT1_test', 'FEAT1_train', 'Ratio']
    frames = [pd.DataFrame(matches, columns=output_columns)
              for matches in per_query_matches]

    # NOTE: writing the matched training TARGET back into test_df for rows
    # above a ratio threshold was left disabled; only the match table is
    # returned.
    return pd.concat(frames).reset_index(drop=True)