**Drive conncection**

In [0]:
# Install the PyDrive wrapper & import libraries.
# This only needs to be done once per notebook.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from google.colab import files


# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Download a file based on its file ID
test_file_id = "1vUOhCmRnvy4gWx0qCGeHsh-WRg2VLt_Z"
test_file_name = "Food data.csv"
downloaded = drive.CreateFile({'id': test_file_id})
downloaded.GetContentFile(test_file_name)


# Download a file based on its file ID
testing_file_id = "15OMQOj6R_aJwktIOGHm0K_0U_ZWQ0lCP"
testing_file_name = "testdata.csv"
downloaded = drive.CreateFile({'id': testing_file_id})
downloaded.GetContentFile(testing_file_name)



# Ignore IPythonNoteook Warnings
import warnings
warnings.filterwarnings("ignore")

**Import Library**

In [0]:
import    pandas     as pd
import    numpy      as np
import    matplotlib as mp

import    tensorflow as tf
import    tensorflow_hub as hub


In [0]:
dataset = pd.read_csv(test_file_name, na_values='na', names = ["0", "1"])
dataset = dataset.dropna()

data = pd.read_csv(testing_file_name, na_values='na', names = ["0", "1"])
data = dataset.dropna()

cleanup_nums = { "1": {
                      "5": 1, 
                      "4": 1,
                      "3": 1,
                      "2": 0,
                      "1": 0,
                      "6": 1,
                      "5 star": 1,
                      "4 star": 1,
                      "3 star": 1,
                      "2 star": 0,
                      "1 star": 0,
                  }
               }

# replaceing accordin to the cleanup nums params
dataset.replace ( cleanup_nums, inplace=True )
data.replace ( cleanup_nums, inplace=True )


In [0]:
dataset["1"].value_counts()


reviews = dataset['0'].values
sentiments = dataset['1'].values

train_reviews = reviews[300:4500]
train_sentiments = sentiments[300:4500]



val_reviews = reviews[4500:]
val_sentiments = sentiments[4500:]


test_reviews = reviews[:300]
test_sentiments = sentiments[:300]

data_review = reviews[:300]
data_sentiment = reviews[:300]


train_reviews.shape, val_reviews.shape, test_reviews.shape, data_review.shape



((4200,), (947,), (300,), (300,))

**Preprocessing**

In [0]:
!pip install contractions
!pip install beautifulsoup4



In [0]:

import contractions
from bs4 import BeautifulSoup
import unicodedata
import re

def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    [s.extract() for s in soup(['iframe', 'script'])]
    stripped_text = soup.get_text()
    stripped_text = re.sub(r'[\r|\n|\r\n]+', '\n', stripped_text)
    return stripped_text

def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

def expand_contractions(text):
    return contractions.fix(text)

def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text

def pre_process_document(document):
    # strip HTML
    document = strip_html_tags(document)
    # lower case
    document = document.lower()
    # remove extra newlines (often might be present in really noisy text)
    document = document.translate(document.maketrans("\n\t\r", "   "))
    # remove accented characters
    document = remove_accented_chars(document)
    # expand contractions    
    document = expand_contractions(document)  
    # remove special characters and\or digits    
    # insert spaces between special characters to isolate them    
    special_char_pattern = re.compile(r'([{.(-)!}])')
    document = special_char_pattern.sub(" \\1 ", document)
    document = remove_special_characters(document, remove_digits=True)  
    # remove extra whitespace
    document = re.sub(' +', ' ', document)
    document = document.strip()
    
    return document


pre_process_corpus = np.vectorize(pre_process_document)

In [0]:
train_reviews = pre_process_corpus(train_reviews)
val_reviews = pre_process_corpus(val_reviews)
test_reviews = pre_process_corpus(test_reviews)

data_review = pre_process_corpus(data_review)


In [0]:
# Training input on the whole training set with no limit on training epochs.
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    {'sentence': train_reviews}, train_sentiments, 
    batch_size=256, num_epochs=None, shuffle=True)
    
# Prediction on the whole training set.
predict_train_input_fn = tf.estimator.inputs.numpy_input_fn(
    {'sentence': train_reviews}, train_sentiments, shuffle=False)
    
# Prediction on the whole validation set.
predict_val_input_fn = tf.estimator.inputs.numpy_input_fn(
    {'sentence': val_reviews}, val_sentiments, shuffle=False)
    
# Prediction on the test set.
predict_test_input_fn = tf.estimator.inputs.numpy_input_fn(
    {'sentence': test_reviews}, test_sentiments, shuffle=False)  


predict_test = tf.estimator.inputs.numpy_input_fn(
    {'sentence': data_review}, data_sentiment, shuffle=False) 

In [0]:
embedding_feature = hub.text_embedding_column(
    key='sentence', 
    module_spec="https://tfhub.dev/google/universal-sentence-encoder/2",
    trainable=False)

INFO:tensorflow:Using /tmp/tfhub_modules to cache modules.
INFO:tensorflow:Downloading TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/2'.
INFO:tensorflow:Downloading https://tfhub.dev/google/universal-sentence-encoder/2: 728.00MB
INFO:tensorflow:Downloaded https://tfhub.dev/google/universal-sentence-encoder/2, Total size: 993.27MB
INFO:tensorflow:Downloaded TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/2'.


In [0]:
dnn = tf.estimator.DNNClassifier(
          hidden_units=[64, 32],
          feature_columns=[embedding_feature],
          n_classes=2,
          activation_fn=tf.nn.relu,
          dropout=0.1,
          optimizer=tf.train.AdagradOptimizer(learning_rate=0.005))
          


In [0]:
tf.logging.set_verbosity(tf.logging.ERROR)
import time

TOTAL_STEPS = 1500
STEP_SIZE = 100
for step in range(0, TOTAL_STEPS+1, STEP_SIZE):
    print()
    print('-'*100)
    print('Training for step =', step)
    start_time = time.time()
    dnn.train(input_fn=train_input_fn, steps=STEP_SIZE)
    elapsed_time = time.time() - start_time
    print('Train Time (s):', elapsed_time)
    print('Eval Metrics (Train):', dnn.evaluate(input_fn=predict_train_input_fn))
    print('Eval Metrics (Validation):', dnn.evaluate(input_fn=predict_val_input_fn))

In [0]:
print('Eval Metrics (test):', dnn.evaluate(input_fn=predict_test_input_fn))

Eval Metrics (Validation): {'accuracy': 0.8791946, 'accuracy_baseline': 0.75838923, 'auc': 0.9433383, 'auc_precision_recall': 0.9813519, 'average_loss': 0.25799492, 'label/mean': 0.75838923, 'loss': 28.830933, 'precision': 0.9059829, 'prediction/mean': 0.7616225, 'recall': 0.9380531, 'global_step': 1600}


# Function create to test multiple embeddings
Google universal
NNN-128


In [0]:
import time

TOTAL_STEPS = 1500
STEP_SIZE = 500

my_checkpointing_config = tf.estimator.RunConfig(
    keep_checkpoint_max = 2,       # Retain the 2 most recent checkpoints.
)

def train_and_evaluate_with_sentence_encoder(hub_module, train_module=False, path=''):
   
    embedding_feature = hub.text_embedding_column(
        key='sentence', module_spec=hub_module, trainable=train_module)
  
    print()
    print('='*100)
    print('Training with', hub_module)
    print('Trainable is:', train_module)
    print('='*100)
  
    dnn = tf.estimator.DNNClassifier(
            hidden_units=[64, 32],
            feature_columns=[embedding_feature],
            n_classes=2,
            activation_fn=tf.nn.relu,
            dropout=0.1,
            optimizer=tf.train.AdagradOptimizer(learning_rate=0.005),
            config=my_checkpointing_config)
    
    for step in range(0, TOTAL_STEPS+1, STEP_SIZE):
        print('-'*100)
        print('Training for step =', step)
        start_time = time.time()
        dnn.train(input_fn=train_input_fn, steps=STEP_SIZE)
        elapsed_time = time.time() - start_time
        print('Train Time (s):', elapsed_time)
        print('Eval Metrics (Train):', dnn.evaluate(input_fn=predict_train_input_fn))
        print('Eval Metrics (Validation):', dnn.evaluate(input_fn=predict_val_input_fn))

    train_eval_result = dnn.evaluate(input_fn=predict_train_input_fn)
    test_eval_result = dnn.evaluate(input_fn=predict_test_input_fn)

    return {
      "Model Dir": dnn.model_dir,
      "Training Accuracy": train_eval_result["accuracy"],
      "Test Accuracy": test_eval_result["accuracy"],
      "Training AUC": train_eval_result["auc"],
      "Test AUC": test_eval_result["auc"],
      "Training Precision": train_eval_result["precision"],
      "Test Precision": test_eval_result["precision"],
      "Training Recall": train_eval_result["recall"],
      "Test Recall": test_eval_result["recall"]
    }

In [0]:
tf.logging.set_verbosity(tf.logging.ERROR)

results = {}

results["nnlm-en-dim128"] = train_and_evaluate_with_sentence_encoder(
    "https://tfhub.dev/google/nnlm-en-dim128/1", path='/storage/models/nnlm-en-dim128_f/')

# results["nnlm-en-dim128-with-training"] = train_and_evaluate_with_sentence_encoder(
#     "https://tfhub.dev/google/nnlm-en-dim128/1", train_module=True, path='/storage/models/nnlm-en-dim128_t/')




Training with https://tfhub.dev/google/nnlm-en-dim128/1
Trainable is: False
----------------------------------------------------------------------------------------------------
Training for step = 0
Train Time (s): 12.616347789764404
Eval Metrics (Train): {'accuracy': 0.8852381, 'accuracy_baseline': 0.75357145, 'auc': 0.9432105, 'auc_precision_recall': 0.98066676, 'average_loss': 0.2587264, 'label/mean': 0.75357145, 'loss': 32.928814, 'precision': 0.90737927, 'prediction/mean': 0.75315577, 'recall': 0.9440758, 'global_step': 500}
Eval Metrics (Validation): {'accuracy': 0.875396, 'accuracy_baseline': 0.77613515, 'auc': 0.91785073, 'auc_precision_recall': 0.97124016, 'average_loss': 0.29406333, 'label/mean': 0.77613515, 'loss': 34.809746, 'precision': 0.90538764, 'prediction/mean': 0.7625944, 'recall': 0.93741494, 'global_step': 500}
----------------------------------------------------------------------------------------------------
Training for step = 500
Train Time (s): 11.70775985717

In [0]:
results["use-512"] = train_and_evaluate_with_sentence_encoder(
    "https://tfhub.dev/google/universal-sentence-encoder/2", path='/storage/models/use-512_f/')

# results["use-512-with-training"] = train_and_evaluate_with_sentence_encoder(
#     "https://tfhub.dev/google/universal-sentence-encoder/2", train_module=True, path='/storage/models/use-512_t/')



Training with https://tfhub.dev/google/universal-sentence-encoder/2
Trainable is: False
----------------------------------------------------------------------------------------------------
Training for step = 0
Train Time (s): 75.90551781654358
Eval Metrics (Train): {'accuracy': 0.89428574, 'accuracy_baseline': 0.75357145, 'auc': 0.9552694, 'auc_precision_recall': 0.98513526, 'average_loss': 0.22907642, 'label/mean': 0.75357145, 'loss': 29.155182, 'precision': 0.91189826, 'prediction/mean': 0.75840956, 'recall': 0.9516588, 'global_step': 500}
Eval Metrics (Validation): {'accuracy': 0.87328404, 'accuracy_baseline': 0.77613515, 'auc': 0.9278238, 'auc_precision_recall': 0.9779785, 'average_loss': 0.2722735, 'label/mean': 0.77613515, 'loss': 32.230377, 'precision': 0.8967742, 'prediction/mean': 0.7777679, 'recall': 0.9455782, 'global_step': 500}
----------------------------------------------------------------------------------------------------
Training for step = 500
Train Time (s): 66.4

In [0]:
results

{'nnlm-en-dim128': {'Model Dir': '/tmp/tmpr1enc05w',
  'Test AUC': 0.9460052,
  'Test Accuracy': 0.86333334,
  'Test Precision': 0.8826087,
  'Test Recall': 0.9354839,
  'Training AUC': 0.9616366,
  'Training Accuracy': 0.9104762,
  'Training Precision': 0.92450535,
  'Training Recall': 0.95955765},
 'use-512': {'Model Dir': '/tmp/tmpg_0m3nsv',
  'Test AUC': 0.9491144,
  'Test Accuracy': 0.87666667,
  'Test Precision': 0.90178573,
  'Test Recall': 0.9308756,
  'Training AUC': 0.9764583,
  'Training Accuracy': 0.9285714,
  'Training Precision': 0.93607306,
  'Training Recall': 0.971564}}

**ULMFIT implementation**

In [0]:
!pip install torch_nightly -f https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html
!pip install fastai
!pip install numpy==1.15.0

Looking in links: https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html


In [0]:
# import libraries
import fastai
from fastai import *
from fastai.text import * 
import pandas as pd
import numpy as np
from functools import partial
import io
import os

In [0]:
documents = dataset
df = pd.DataFrame({'label':dataset["1"], 'text':dataset["0"]})
df.shape


(5447, 2)

In [0]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords 
stop_words = stopwords.words('english')

# tokenization 
tokenized_doc = df['text'].apply(lambda x: x.split())

# remove stop-words 
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])

# de-tokenization 
detokenized_doc = [] 
for i in range(len(df)): 
    t = ' '.join(tokenized_doc[i]) 
    detokenized_doc.append(t) 

df['text'] = detokenized_doc

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


KeyError: ignored

In [0]:
from sklearn.model_selection import train_test_split

# split data into training and validation set
df_trn, df_val = train_test_split(df, stratify = df['label'], test_size = 0.4, random_state = 12)

In [0]:
df_trn.shape, df_val.shape


((3268, 2), (2179, 2))

In [0]:
# Language model data
data_lm = TextLMDataBunch.from_df(train_df = df_trn, valid_df = df_val, path = "")

# Classifier model data
data_clas = TextClasDataBunch.from_df(path = "", train_df = df_trn, valid_df = df_val, vocab=data_lm.train_ds.vocab, bs=32)

In [0]:
learn = language_model_learner(data_lm, AWD_LSTM , drop_mult=0.1)


learn.save_encoder('ft_enc')

learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.1)
learn.load_encoder('ft_enc')

learn.fit_one_cycle(1, 5e-2)


epoch,train_loss,valid_loss,accuracy
1,0.455331,0.394635,0.826985


In [0]:
# get predictions
preds, targets = learn.get_preds()

predictions = np.argmax(preds, axis = 1)
pd.crosstab(predictions, targets)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,1
1,531,1646
