In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
# import seaborn as sns

from c import *


# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): `import seaborn as sns` is commented out above, so `sns` is
# presumably provided by `from c import *` — TODO confirm, otherwise this line
# raises NameError on a fresh kernel.
sns.set_style("whitegrid")

# Load the raw CSV; low_memory=False makes pandas infer column dtypes from the
# whole file instead of chunk by chunk.
df = pd.read_csv(
    '/data/DS_INTERN/data/RAW_DATA/train_test_data.csv',
    low_memory=False,
)

# Add the target column, split into train/test, then run the same
# preprocessing on each split (train first, then test).
df1 = add_target_column(df)
train_df, test_df = map(perform_preprocessing, train_test_split(df1))

In [2]:
import random
import snorkel
import nltk
from nltk.corpus import wordnet as wn

from snorkel.augmentation import transformation_function

# Fetch the WordNet corpus that get_synonyms queries through `wn`;
# quiet=True suppresses the download log.
nltk.download("wordnet", quiet=True)


def get_synonyms(word):
    """Get the synonyms of word from WordNet.

    Args:
        word: The word to look up.

    Returns:
        A list of distinct, lower-cased synonyms with underscores replaced by
        spaces. Entries appear in the order WordNet yields their lemmas
        (first occurrence wins), so the result does not depend on set
        iteration order. ``word`` itself is excluded regardless of its
        casing. The list is empty when WordNet has no synsets for ``word``.
    """
    target = word.lower()
    # Dict keys keep first-seen order and drop duplicates; callers that take
    # element 0 therefore get the same synonym on every run.
    ordered = {}
    for synset in wn.synsets(word):
        for lemma in synset.lemmas():
            candidate = lemma.name().lower().replace("_", " ")
            if candidate != target:
                ordered[candidate] = None
    return list(ordered)


@transformation_function()
def tf_replace_word_with_synonym(x):
    """Try to replace one randomly chosen word of ``x['FEAT1']`` with a synonym.

    The text is split on whitespace and lower-cased for the synonym lookup;
    the rewritten text is stored upper-cased with single spaces between tokens.

    Args:
        x: A data point that supports reading and assigning ``x['FEAT1']``,
            where ``FEAT1`` holds the text to augment.

    Returns:
        ``x`` with ``FEAT1`` rewritten when the chosen word has at least one
        synonym. ``None`` when the text contains no words or the chosen word
        has no synonym (presumably treated by snorkel as "no transformation"
        — confirm against the snorkel TF documentation).
    """
    words = [w.lower() for w in x['FEAT1'].split()]
    if not words:
        # Empty or whitespace-only text: picking a random index from an empty
        # range would raise IndexError, so report "nothing to transform".
        return None

    idx = random.randrange(len(words))
    synonyms = get_synonyms(words[idx])
    if not synonyms:
        return None

    # A synonym may consist of several words; split it so every token is
    # upper-cased and joined with single spaces like the rest of the text.
    tokens = words[:idx] + synonyms[0].split() + words[idx + 1:]
    x['FEAT1'] = ' '.join(t.upper() for t in tokens)
    return x

In [3]:
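# Manual spot-check (disabled). The TF indexes x['FEAT1'], so pass a whole row
# rather than the FEAT1 string:
# s = train_df.iloc[0]
# tf_replace_word_with_synonym(s)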

In [4]:
from snorkel.augmentation import ApplyOnePolicy, PandasTFApplier


# Seed Python's RNG right before augmenting so the random word picks made by
# the transformation function are the same each time this cell is run.
AUGMENTATION_SEED = 42
random.seed(AUGMENTATION_SEED)

# Request 2 transformed copies per original row and keep the original row too.
tf_policy = ApplyOnePolicy(n_per_original=2, keep_original=True)
tf_applier = PandasTFApplier([tf_replace_word_with_synonym], tf_policy)

train_df_augmented = tf_applier.apply(train_df)

100%|██████████| 48197/48197 [00:56<00:00, 847.74it/s]


In [6]:
# Row counts before and after augmentation.
len(train_df),len(train_df_augmented)

(48197, 129527)

In [7]:
# Preview the first rows of the augmented training frame.
train_df_augmented.head()

Unnamed: 0,FEAT1,TARGET
8,BHF VANILLA CHAI LATTE VANILLA CHAI VANILLA CH...,BEVERAGES_BOLTHOUSE JUICE
8,BHF VANILLA EXTRACT CHAI LATTE VANILLA CHAI VA...,BEVERAGES_BOLTHOUSE JUICE
9,ENERGY ACAI BERRY JUICE SAM ENERGY JUICE PREMI...,BEVERAGES_BOLTHOUSE JUICE
9,ENERGY ACAI BERRY JUICE SAM DEPARTMENT OF ENER...,BEVERAGES_BOLTHOUSE JUICE
9,ENERGY ACAI BERRY JUICE SAM ENERGY JUICE PREMI...,BEVERAGES_BOLTHOUSE JUICE


In [8]:
# Training text and labels are taken from the augmented frame.
X_train = train_df_augmented['FEAT1']
Y_train = train_df_augmented['TARGET']

# Test text is taken from the test split, which was not passed through the augmenter.
X_test = test_df['FEAT1']

In [9]:
# Build the model pipeline with pipeline_voting_hard (not defined in this
# notebook; presumably provided by `from c import *`).
# NOTE(review): the meaning of the argument 100 is not visible here — TODO
# document it or give it a named constant.
pipe = pipeline_voting_hard(100)

In [10]:
import time
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score, cross_validate

# Time a 5-fold cross-validation of the pipeline on the augmented training data.
# NOTE(review): X_train contains augmented near-copies of original rows; if
# copies of one row land in different folds the scores are optimistic —
# TODO confirm how the folds are formed.
start = time.time()

scores = cross_validate(
    pipe,
    X_train,
    Y_train,
    scoring=('accuracy', 'precision_micro', 'recall_micro'),
    cv=5,
)

# Report the mean of each metric over the folds.
# NOTE(review): with one label per row, micro-averaged precision and recall
# equal accuracy, so these three figures are expected to coincide.
for _template, _score_key in (
    ("Accuracy : {:0.5f}", 'test_accuracy'),
    ("Precision_micro : {:0.5f}", 'test_precision_micro'),
    ("Recall_micro : {:0.5f}", 'test_recall_micro'),
):
    print(_template.format(scores[_score_key].mean()))


print('time', time.time() - start, '\n\n')



Accuracy : 0.98362
Precision_micro : 0.98362
Recall_micro : 0.98362
time 6142.268483877182 




In [33]:
import time
# Time the fit of the pipeline on the full augmented training set.
start = time.time()

pipe.fit(X_train, Y_train)

# Elapsed wall-clock seconds for the fit.
print('time', time.time() - start, '\n\n')

time 1597.6241295337677 






In [34]:
# Predict labels for the test features with the fitted pipeline.
Y_pred = pipe.predict(X_test)

In [35]:
# Pass the predictions and the test frame to trust_factor (not defined in this
# notebook; presumably from `c`) and store the result back in test_df.
# NOTE(review): test_df is overwritten with a function of itself, so re-running
# this cell feeds the already-processed frame back in — consider a new name.
test_df = trust_factor(Y_pred, test_df)
# Summary statistics of the numeric columns of the result.
test_df.describe()

Unnamed: 0,fuzzy_category,fuzzy_variety,TF_cat,TF_var,TF
count,248208.0,248208.0,248208.0,248208.0,248208.0
mean,77.949389,85.373449,0.627309,0.7422,0.840839
std,29.519827,25.699583,0.483522,0.437424,0.365827
min,0.0,0.0,0.0,0.0,0.0
25%,43.0,78.0,0.0,0.0,1.0
50%,100.0,100.0,1.0,1.0,1.0
75%,100.0,100.0,1.0,1.0,1.0
max,100.0,100.0,1.0,1.0,1.0
