In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
# import seaborn as sns

from h import *


%matplotlib inline
# NOTE(review): `import seaborn as sns` is commented out in the import block above, so
# `sns` must be supplied by `from h import *`; otherwise this line raises NameError — confirm.
sns.set_style("whitegrid")

# Load the raw train/test CSV from an absolute path on the data volume.
df= pd.read_csv('/data/DS_INTERN/data/RAW_DATA/train_test_data.csv', low_memory=False)

# add_target_column, train_test_split and perform_preprocessing are neither defined nor
# imported by name in this notebook; presumably they come from `from h import *` — confirm.
df1 = add_target_column(df)

# Split the frame (with its target column) into a training and a test frame.
train_df , test_df = train_test_split(df1)

# The same preprocessing routine is applied to each split separately.
train_df = perform_preprocessing(train_df)
test_df = perform_preprocessing(test_df)

In [2]:
import random
import re

import nltk
from nltk.corpus import wordnet as wn

from snorkel.augmentation import transformation_function

# Download the WordNet corpus quietly.
# NOTE(review): get_synonyms below reads `syn_df`, not `wn`, so this download (and the
# wordnet import) look unused in the visible cells — confirm before removing.
nltk.download("wordnet", quiet=True)

# Synonym lookup table: ';'-separated file with no header row; the two columns are
# named 'word' and 'synonyms' here. The path is relative to the kernel's working directory.
syn_df = pd.read_csv('synonyms_sample.csv', sep=';',header=None, names=['word','synonyms'])


def get_synonyms(word):
    """Return one randomly chosen synonym of ``word`` from the ``syn_df`` table.

    The lookup uses the module-level ``syn_df`` frame (columns ``word`` and
    ``synonyms``), not WordNet. The ``synonyms`` cell of the first row whose
    ``word`` equals the argument is tokenised on non-word characters and one
    token is picked with ``random.choice``.

    Parameters
    ----------
    word : str
        Token to look up; compared for exact equality with ``syn_df['word']``.

    Returns
    -------
    str or list
        A single synonym token, or an empty list ``[]`` when no synonym is
        available (no matching row, a missing/non-string synonyms cell, or a
        cell without any word characters). Callers can therefore always test
        the result with ``len(...)`` or truthiness.
    """
    matches = syn_df.loc[syn_df['word'] == word, 'synonyms']
    if matches.empty:
        return []

    raw_synonyms = matches.iloc[0]
    # A blank synonyms field is not a string (pandas typically yields NaN);
    # re.sub would raise TypeError on it, so treat it as "no synonyms".
    if not isinstance(raw_synonyms, str):
        return []

    # Raw string avoids the invalid-escape warning that "[^\w]" triggers.
    candidates = re.sub(r"[^\w]", " ", raw_synonyms).split()
    if not candidates:
        # Previously fell through and returned None, which broke len() in callers.
        return []

    return random.choice(candidates)


@transformation_function()
def tf_replace_word_with_synonym(x):
    """Try to replace one randomly chosen word of ``x['FEAT1']`` with a synonym.

    Parameters
    ----------
    x : row-like
        Data point exposing a whitespace-separated text field under ``'FEAT1'``.

    Returns
    -------
    The same ``x`` with ``'FEAT1'`` rewritten when a synonym was found for the
    chosen word; ``None`` when no replacement was made (empty or non-string
    text, or no synonym available).
    NOTE(review): returning ``None`` is presumably how Snorkel is told to emit
    no augmented row for this call — confirm against the Snorkel TF docs.
    """
    text = x['FEAT1']
    words = text.split() if isinstance(text, str) else []
    if not words:
        # random.choice on an empty range raises IndexError; nothing to replace.
        return None

    idx = random.choice(range(len(words)))
    synonym = get_synonyms(words[idx])

    # Covers every "no synonym" result the helper may produce ([], None or "").
    if not synonym:
        return None

    x['FEAT1'] = " ".join(words[:idx] + [synonym] + words[idx + 1:])
    return x

In [3]:
from snorkel.augmentation import ApplyOnePolicy, PandasTFApplier


# Augmentation policy: request 2 transformed samples per original row and keep the
# original rows in the output (parameter meanings per the Snorkel API — confirm).
tf_policy = ApplyOnePolicy(n_per_original=2, keep_original=True)
# The only transformation function in play is the synonym replacement defined above.
tf_applier = PandasTFApplier([tf_replace_word_with_synonym], tf_policy)



# Runs the TF over every training row; the recorded run took roughly two minutes.
# NOTE(review): the recorded row counts (48407 augmented vs 48197 original) show that
# only 210 rows were added, i.e. most TF calls produced no replacement.
train_df_augmented = tf_applier.apply(train_df)

100%|██████████| 48197/48197 [01:51<00:00, 431.14it/s]


In [4]:
# Row counts after vs. before augmentation.
len(train_df_augmented), len(train_df)

(48407, 48197)

In [5]:
# Preview the first rows of the augmented training frame.
train_df_augmented.head()

Unnamed: 0,FEAT1,TARGET
8,BHF VANILLA CHAI LATTE VANILLA CHAI VANILLA CH...,BEVERAGES_BOLTHOUSE JUICE
9,ENERGY ACAI BERRY JUICE SAM ENERGY JUICE PREMI...,BEVERAGES_BOLTHOUSE JUICE
10,GENESIS TODAY POM BERRY POM BERRY GENESIS TODA...,BEVERAGES_BOLTHOUSE JUICE
11,PEPPERMINT MOCHA BHF PEPMINT MOCHA PEPPERMINT ...,BEVERAGES_BOLTHOUSE JUICE
12,SAMBAZON ACAI BERRY PACK SAM CAL ACAI JUICE PR...,BEVERAGES_BOLTHOUSE JUICE


In [6]:
# Training text and labels are taken from the augmented frame.
X_train = train_df_augmented['FEAT1']
Y_train = train_df_augmented['TARGET']

# Test text comes from the preprocessed (non-augmented) test split; no Y_test is
# extracted in this cell.
X_test = test_df['FEAT1']

In [7]:
# Build the ensemble via pipeline_voting_soft_3 (not defined in this notebook;
# presumably from `from h import *`). The meaning of the positional 100 is not
# visible here — TODO document it at the call site.
pipe = pipeline_voting_soft_3(100, kernel='linear')

In [8]:
# Display the estimator repr to inspect the ensemble's configuration.
pipe

VotingClassifier(estimators=[('RF',
                              Pipeline(steps=[('vect',
                                               TfidfVectorizer(min_df=10)),
                                              ('clf',
                                               RandomForestClassifier(class_weight='balanced',
                                                                      n_estimators=220))])),
                             ('LR',
                              Pipeline(steps=[('vect',
                                               TfidfVectorizer(min_df=10)),
                                              ('clf',
                                               LogisticRegression(C=250,
                                                                  class_weight='balanced',
                                                                  penalty='l1',
                                                                  solver='saga'))])),
                             ('SVM',
   

In [9]:
import time
# Wall-clock timing of fitting the ensemble on the augmented training data.
start = time.time()

pipe.fit(X_train, Y_train)

# Elapsed seconds; the recorded run took about 602 s.
print('time', time.time() - start, '\n\n')



time 602.0166957378387 




In [14]:
# NOTE(review): the recorded output of this cell is
# "AttributeError: predict_proba is not available when  probability=False", so Y_pred
# was not produced in the saved run. Presumably a member of the soft-voting ensemble
# (likely the SVM) is built without probability=True — confirm in pipeline_voting_soft_3.
Y_pred = pipe.predict(X_test)

AttributeError: predict_proba is not available when  probability=False

In [12]:
import xgboost as xgb

OSError: /lib64/libm.so.6: version `GLIBC_2.23' not found (required by /base_env/py3-anaconda-base/lib/python3.6/site-packages/xgboost/libxgboost.so)

In [11]:
# NOTE(review): depends on `import xgboost as xgb`, which failed in the saved run
# (OSError: GLIBC_2.23 not found), hence the recorded NameError. `clf` is not used
# in any other visible cell.
clf = xgb.XGBClassifier()

NameError: name 'xgb' is not defined

In [None]:
# trust_factor is not defined in this notebook (presumably from `from h import *`).
# NOTE(review): it needs Y_pred, whose producing cell recorded an AttributeError, and
# this cell has no execution count. It also rebinds test_df, so re-running the cell
# feeds the previous result back in — verify that trust_factor tolerates that.
test_df = trust_factor(Y_pred, test_df)
# Summary statistics of the numeric columns of the returned frame.
test_df.describe()

In [14]:
# Displays the whole frame; the recorded output already includes the fuzzy_* and TF_*
# columns. NOTE(review): consider test_df.head() to keep the saved output small.
test_df

Unnamed: 0,FEAT1,TARGET,category,variety,fuzzy_category,fuzzy_variety,TF_cat,TF_var,TF
0,TIE ESCAROLE NHM UPC FASTENER TWISTTIE RUBBERB...,,LETTUCE,ESCAROLE,43,100,0,1,1
1,ODWALLA BERRY MEGA JUICE BRY OMEGA ODW BERRY G...,,BEVERAGE,OTHER BEVERAGE,100,100,1,1,1
2,ODWALLA MONSTER NECTAR ODW MONSTER ODWALLA NEC...,,BEVERAGE,OTHER BEVERAGE,100,100,1,1,1
3,ODWALLA ORANGE JUICE JUICE ODW JUICE PREMIUM B...,,BEVERAGE,OTHER BEVERAGE,100,100,1,1,1
4,RAAW JUICE RASPBERRY LEMONGRASS RAAW RASP LEMN...,,BEVERAGE,OTHER BEVERAGE,100,100,1,1,1
5,BERRY WHEATGRASS JUICE JUICE VBW JUICE PREMIUM...,,BEVERAGE,OTHER BEVERAGE,100,100,1,1,1
6,ODWALLA CARROT JUICE CARROT ODW CARROT ODW CAR...,,BEVERAGE,BOLTHOUSE JUICE,100,100,1,1,1
7,MANGO GINGER CARROT OZMANGOGINGERCARR JUICE MA...,,BEVERAGE,OTHER BEVERAGE,100,100,1,1,1
14,EVOLUTION FRESH ORGANIC STRWBRY LEMONADE ORG J...,,BEVERAGE,BOLTHOUSE JUICE,100,100,1,1,1
15,JUICE ENERGY ODWALLA ODW JUICE ENERGY JUICE PR...,,BEVERAGE,BOLTHOUSE JUICE,100,100,1,1,1


In [15]:
# Hard-voting variant of the ensemble (pipeline_voting_hard is not defined in this
# notebook; presumably from `from h import *`). The meaning of 100 is not visible here.
pipe1 = pipeline_voting_hard(100)
# Fit on the augmented training data.
pipe1.fit(X_train,Y_train)

# Predicted labels for the test features.
Y_pred_1 = pipe1.predict(X_test)



In [16]:
# Attach the hard-voting predictions to the test frame, aligned on its index.
test_df['Predicted_target'] = pd.Series(Y_pred_1, index=test_df.index)

# Split the predicted label into its two parts on the FIRST underscore only (n=1).
# Without the limit, a label containing a second underscore would expand to more than
# two columns and the two-column assignment would raise a ValueError. Labels with
# exactly one underscore (e.g. 'BEVERAGES_BOLTHOUSE JUICE') split exactly as before.
test_df[['category1', 'variety1']] = test_df['Predicted_target'].str.split('_', n=1, expand=True)

In [19]:
# Clean up the predicted label columns. The helpers run in the same order as before:
# manual spell correction, then the `_walmart` spell correction, then lemmatization —
# each applied to 'category1' first and 'variety1' second.
for normalise in (
    perform_spell_correction_manual,
    perform_spell_correction_walmart,
    perform_lemmatization,
):
    for label_col in ('category1', 'variety1'):
        # Re-wrap the helper's result in a Series aligned on test_df's index.
        test_df[label_col] = pd.Series(normalise(test_df[label_col]), index=test_df.index)

In [22]:
# Summary statistics for the rows whose 'variety' differs from the predicted,
# cleaned 'variety1' (row filter and column selection done in a single .loc call).
test_df.loc[
    test_df['variety'] != test_df['variety1'],
    ['variety', 'variety1', 'TF_var'],
].describe()

Unnamed: 0,TF_var
count,108828.0
mean,0.554076
std,0.497069
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [229]:
# Rebinds `pipe` (previously the soft-voting ensemble) to a new hard-voting ensemble,
# presumably unfitted. NOTE(review): this cell's execution count (229) is far out of
# sequence with its neighbours — re-check under Restart & Run All.
pipe = pipeline_voting_hard(100)

In [None]:
import time
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score, cross_validate

# Time a 5-fold cross-validation of `pipe` and report the mean of each metric.
# NOTE(review): X_train comes from the augmented frame, so an augmented copy of a row
# may land in a different fold than its original — scores may be optimistic; confirm.
start = time.time()

scores = cross_validate(
    pipe,
    X_train,
    Y_train,
    scoring=('accuracy', 'precision_micro', 'recall_micro'),
    cv=5,
)

# One line per metric, printed in the original order with the original templates.
for score_key, template in (
    ('test_accuracy', "Accuracy : {:0.5f}"),
    ('test_precision_micro', "Precision_micro : {:0.5f}"),
    ('test_recall_micro', "Recall_micro : {:0.5f}"),
):
    print(template.format(scores[score_key].mean()))


print('time', time.time() - start, '\n\n')

