In [40]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
# import seaborn as sns

from e import *


%matplotlib inline
# NOTE(review): `import seaborn as sns` is commented out above, so `sns` has
# to be supplied by `from e import *` for this line to run -- TODO confirm, or
# restore the explicit import.
sns.set_style("whitegrid")

# Load the raw data set.
# NOTE(review): absolute, machine-specific path; consider a configurable
# data directory.
df= pd.read_csv('/data/DS_INTERN/data/RAW_DATA/train_test_data.csv', low_memory=False)

# add_target_column, train_test_split and perform_preprocessing are neither
# defined nor explicitly imported in the visible cells; presumably they come
# from `from e import *` -- verify against module `e`.
df1 = add_target_column(df)

train_df , test_df = train_test_split(df1)

# The same preprocessing helper is applied to each split separately.
train_df = perform_preprocessing(train_df)
test_df = perform_preprocessing(test_df)

In [41]:
import nltk
import random
import re
from nltk.corpus import wordnet as wn

# Fetch the WordNet corpus.
# NOTE(review): in the visible cells `wn` is only referenced from
# commented-out code, and get_synonym reads `syn_df` instead -- confirm the
# download is still needed.
nltk.download("wordnet")

from snorkel.augmentation import transformation_function

from snorkel.preprocess.nlp import SpacyPreprocessor
# Preprocessor configured to read the "FEAT1" field and write its result to
# the "doc" field; passed as `pre=[spacy]` to the transformation functions
# below.
spacy = SpacyPreprocessor(text_field="FEAT1", doc_field="doc", memoize=False)


# Synonym lookup table: ';'-separated file without a header row, loaded with
# the column names 'word' and 'synonyms'.
syn_df = pd.read_csv('synonyms_sample.csv', sep=';',header=None, names=['word','synonyms'])


def get_synonym(word, pos=None):
    """Return one randomly chosen synonym of ``word`` from the ``syn_df`` table.

    The lookup uses the module-level ``syn_df`` frame (columns ``word`` and
    ``synonyms``), not WordNet. Only the first row whose ``word`` equals the
    argument is considered; its ``synonyms`` cell is split on every
    non-word character.

    Args:
        word: Token text to look up; compared for exact equality with
            ``syn_df['word']``.
        pos: Unused. Kept so existing callers that pass a part-of-speech
            tag keep working.

    Returns:
        A synonym string picked with ``random.choice``, or an empty list
        when the word is unknown, its ``synonyms`` cell is not a string
        (e.g. NaN from a blank CSV field) or the cell contains no words.
        Every "no synonym" outcome is falsy, so ``if synonym:`` checks in
        callers behave uniformly.
    """
    matches = syn_df.loc[syn_df['word'] == word, 'synonyms']
    if matches.empty:
        return []

    raw = matches.iloc[0]
    # A blank cell is read by pandas as NaN (a float); re.sub would raise
    # TypeError on it, so treat it as "no synonym available".
    if not isinstance(raw, str):
        return []

    # Raw string avoids the invalid-escape warning for "\w".
    candidates = re.sub(r"[^\w]", " ", raw).split()
    if not candidates:
        return []
    return random.choice(candidates)

        
def replace_token(spacy_doc, idx, replacement):
    """Return the document text with the token at ``idx`` replaced.

    Args:
        spacy_doc: Sliceable document whose slices expose a ``.text``
            attribute.
        idx: Position of the token to replace.
        replacement: Text inserted in place of that token.

    Returns:
        The text before the token, ``replacement`` and the text after the
        token, joined by single spaces. Empty pieces are skipped, so
        replacing the first or last token no longer leaves a stray leading
        or trailing space.
    """
    before = spacy_doc[:idx].text
    after = spacy_doc[idx + 1 :].text
    return " ".join(piece for piece in (before, replacement, after) if piece)


@transformation_function(pre=[spacy])
def replace_verb_with_synonym(x):
    """Swap one randomly chosen verb of ``x.doc`` for an upper-cased synonym.

    Returns ``x`` with ``FEAT1`` rewritten, or None when the document has no
    token tagged "VERB" or no synonym is found for the chosen token.
    """
    # Positions of every token tagged as a verb.
    positions = [pos for pos, tok in enumerate(x.doc) if tok.pos_ == "VERB"]
    if not positions:
        return None

    # Draw one verb position at random and look up a synonym for its text.
    chosen = np.random.choice(positions)
    candidate = get_synonym(x.doc[chosen].text, pos="v")
    if not candidate:
        return None

    x.FEAT1 = replace_token(x.doc, chosen, candidate.upper())
    return x


@transformation_function(pre=[spacy])
def replace_noun_with_synonym(x):
    """Swap one randomly chosen noun of ``x.doc`` for an upper-cased synonym.

    Returns ``x`` with ``FEAT1`` rewritten, or None when the document has no
    token tagged "NOUN" or no synonym is found for the chosen token.
    """
    # Positions of every token tagged as a noun.
    positions = [pos for pos, tok in enumerate(x.doc) if tok.pos_ == "NOUN"]
    if not positions:
        return None

    # Draw one noun position at random and look up a synonym for its text.
    chosen = np.random.choice(positions)
    candidate = get_synonym(x.doc[chosen].text, pos="n")
    if not candidate:
        return None

    x.FEAT1 = replace_token(x.doc, chosen, candidate.upper())
    return x


@transformation_function(pre=[spacy])
def replace_adjective_with_synonym(x):
    """Swap one randomly chosen adjective of ``x.doc`` for an upper-cased synonym.

    Returns ``x`` with ``FEAT1`` rewritten, or None when the document has no
    token tagged "ADJ" or no synonym is found for the chosen token.
    """
    # Positions of every token tagged as an adjective.
    positions = [pos for pos, tok in enumerate(x.doc) if tok.pos_ == "ADJ"]
    if not positions:
        return None

    # Draw one adjective position at random and look up a synonym for its text.
    chosen = np.random.choice(positions)
    candidate = get_synonym(x.doc[chosen].text, pos="a")
    if not candidate:
        return None

    x.FEAT1 = replace_token(x.doc, chosen, candidate.upper())
    return x

[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [42]:

# syn = wn.synsets('beverage')
# words = [lemma.name() for lemma in syn[0].lemmas()]
# words

In [43]:
from snorkel.augmentation import PandasTFApplier, MeanFieldPolicy

# Transformation functions offered to the policy; their order matches the
# positions of the weights in `p` below.
tfs = [
    replace_verb_with_synonym,
    replace_noun_with_synonym,
    replace_adjective_with_synonym,
]

# NOTE(review): p=[0, 0, 1] appears to put all sampling weight on the third
# entry of `tfs` (replace_adjective_with_synonym), so the verb and noun
# functions would never be drawn -- confirm against the MeanFieldPolicy docs
# and confirm this is intended.
mean_field_policy = MeanFieldPolicy(
    len(tfs),
    sequence_length=2,
    n_per_original=2,
    keep_original=True,
    p=[0, 0, 1],
)


tf_applier = PandasTFApplier(tfs, mean_field_policy)

# NOTE(review): the recorded run took about 22 minutes, and the following cell
# reports the same row count for train_df_augmented and train_df (48197),
# i.e. no augmented rows were added in that run -- worth investigating
# (POS tags on this text, coverage of the synonym table).
train_df_augmented = tf_applier.apply(train_df)

100%|██████████| 48197/48197 [22:11<00:00, 36.20it/s]


In [44]:
# Row counts after vs. before augmentation; equal counts mean no rows were added.
len(train_df_augmented), len(train_df)

(48197, 48197)

In [45]:
# Preview the first rows of the augmented training frame.
train_df_augmented.head()

Unnamed: 0,FEAT1,TARGET
8,BHF VANILLA CHAI LATTE VANILLA CHAI VANILLA CH...,BEVERAGES_BOLTHOUSE JUICE
9,ENERGY ACAI BERRY JUICE SAM ENERGY JUICE PREMI...,BEVERAGES_BOLTHOUSE JUICE
10,GENESIS TODAY POM BERRY POM BERRY GENESIS TODA...,BEVERAGES_BOLTHOUSE JUICE
11,PEPPERMINT MOCHA BHF PEPMINT MOCHA PEPPERMINT ...,BEVERAGES_BOLTHOUSE JUICE
12,SAMBAZON ACAI BERRY PACK SAM CAL ACAI JUICE PR...,BEVERAGES_BOLTHOUSE JUICE


In [46]:
# Training features and labels are taken from the augmented frame; the
# evaluation features are taken from the preprocessed test frame.
X_train, Y_train = train_df_augmented['FEAT1'], train_df_augmented['TARGET']
X_test = test_df['FEAT1']

In [47]:
# NOTE(review): pipeline_voting_hard is not defined in the visible cells
# (presumably provided by `from e import *`), and the meaning of the argument
# 100 is not visible here -- TODO document or name the constant.
pipe = pipeline_voting_hard(100)

In [48]:
import time

# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() follows the wall clock, which can be adjusted while the fit runs.
start = time.perf_counter()

pipe.fit(X_train, Y_train)

# Elapsed fit time in seconds.
print('time', time.perf_counter() - start, '\n\n')

time 482.179808139801 






In [49]:
# Predict labels for the test features with the pipeline fitted above.
Y_pred = pipe.predict(X_test)

In [50]:
# trust_factor is not defined in the visible cells (presumably from
# `from e import *`).
# NOTE(review): test_df is rebound to the helper's result, so re-running this
# cell feeds the already-processed frame back in -- confirm trust_factor
# tolerates that, or assign to a new name.
test_df = trust_factor(Y_pred, test_df)
# Summary statistics of the returned frame.
test_df.describe()

Unnamed: 0,fuzzy_category,fuzzy_variety,TF_cat,TF_var,TF
count,248208.0,248208.0,248208.0,248208.0,248208.0
mean,82.71422,88.905575,0.705622,0.799809,0.915204
std,27.466785,22.960907,0.455764,0.400144,0.278578
min,0.0,0.0,0.0,0.0,0.0
25%,50.0,100.0,0.0,1.0,1.0
50%,100.0,100.0,1.0,1.0,1.0
75%,100.0,100.0,1.0,1.0,1.0
max,100.0,100.0,1.0,1.0,1.0


In [304]:
# NOTE(review): rebinds `pipe` with the same call used earlier in the notebook,
# replacing the pipeline that was fitted above. The execution count
# (In [304]) is out of sequence with the surrounding cells -- confirm this
# cell is still needed and that the notebook survives Restart & Run All.
pipe = pipeline_voting_hard(100)

In [None]:
import time
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score, cross_validate

# 5-fold cross-validation of `pipe` on the training features and labels.
# NOTE(review): this cell has no execution count, so no results are recorded.
# NOTE(review): if TARGET is single-label multiclass (as the head() preview
# suggests), micro-averaged precision and recall equal accuracy, so the three
# numbers would be identical -- consider macro/weighted averages instead.

# perf_counter() is the clock intended for measuring elapsed intervals.
start = time.perf_counter()

scores = cross_validate(pipe, X_train, Y_train, scoring= ('accuracy','precision_micro','recall_micro'), cv=5)

# One line per metric: the mean over the folds, formatted to 5 decimals.
for label, key in (
    ("Accuracy", "test_accuracy"),
    ("Precision_micro", "test_precision_micro"),
    ("Recall_micro", "test_recall_micro"),
):
    print("{} : {:0.5f}".format(label, scores[key].mean()))


print('time', time.perf_counter() - start, '\n\n')



# Neural Networks

In [11]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds

ModuleNotFoundError: No module named 'tensorflow_hub'

In [10]:
# Load BERT and the preprocessing model from TF Hub.
preprocess = hub.load('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')
encoder = hub.load('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4')


NameError: name 'hub' is not defined

In [39]:

# NOTE(review): the recorded output shows this import failing
# (ModuleNotFoundError) with the installed TensorFlow, reported as 1.3.0
# elsewhere in this notebook.
from tensorflow.keras import layers

# NOTE(review): `keras` is never bound in this cell (only `layers` is
# imported), so this line would raise NameError even if the import succeeded
# -- import `keras` explicitly once the environment is fixed.
# Three 512-unit ReLU layers followed by a single-unit output layer; the
# first layer expects 11 input features.
model = keras.Sequential([
    layers.Dense(512, activation='relu', input_shape=[11]),
    layers.Dense(512, activation='relu'),
    layers.Dense(512, activation='relu'),
    layers.Dense(1),
])

ModuleNotFoundError: No module named 'tensorflow.keras'

In [15]:
import tensorflow as tf
# Show the installed TensorFlow version (the recorded output is '1.3.0').
tf.__version__

'1.3.0'

In [17]:
from tensorflow import keras

ImportError: cannot import name 'keras'