In [None]:
# Code for BERT-Joint Learning Model
# to classify tweets as corresponding ad related
# accuracy = >86% 
# data : superbowl

In [None]:
#Confirm that GPU is detected
import tensorflow as tf
# Get the GPU device name.
device_name = tf.test.gpu_device_name()
# Only the exact name of the first GPU is accepted; any other value aborts.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
import os

# Export the flag to the process environment so CUDA libraries loaded by this
# kernel can see it. Environment values must be strings, hence str().
CUDA_LAUNCH_BLOCKING = 1
os.environ['CUDA_LAUNCH_BLOCKING'] = str(CUDA_LAUNCH_BLOCKING)

In order for torch to use the GPU, we need to identify and specify the GPU as the device. Later, in our training loop, we will load data onto the device.

In [None]:
import torch
# If there's a GPU available...
# `isGPUavailable` is reused later to decide whether to move the model to CUDA.
isGPUavailable = torch.cuda.is_available()
if not isGPUavailable:
    # Fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Tell PyTorch to use the GPU and report what was found.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ae/05/c8c55b600308dc04e95100dc8ad8a244dd800fe75dfafcf1d6348c6f6209/transformers-3.1.0-py3-none-any.whl (884kB)
[K     |▍                               | 10kB 17.9MB/s eta 0:00:01[K     |▊                               | 20kB 3.6MB/s eta 0:00:01[K     |█▏                              | 30kB 3.9MB/s eta 0:00:01[K     |█▌                              | 40kB 4.4MB/s eta 0:00:01[K     |█▉                              | 51kB 4.7MB/s eta 0:00:01[K     |██▎                             | 61kB 4.9MB/s eta 0:00:01[K     |██▋                             | 71kB 5.2MB/s eta 0:00:01[K     |███                             | 81kB 5.3MB/s eta 0:00:01[K     |███▍                            | 92kB 5.3MB/s eta 0:00:01[K     |███▊                            | 102kB 5.3MB/s eta 0:00:01[K     |████                            | 112kB 5.3MB/s eta 0:00:01[K     |████▌                           | 122kB 5.3M

In [None]:
import pandas as pd

# Load the three manually annotated batches. Batches 2 and 3 contain rows
# without tweet text, which are dropped.
df_2 = pd.read_csv("./2_man_ann_sb.csv", index_col=None)
df_2 = df_2.dropna(subset=['tweet_text'])

df_1 = pd.read_csv("./mann_ann_sb.csv", index_col=None)

df_3 = pd.read_csv("./3_man_ann_sb_full_1.csv", index_col=None)
df_3 = df_3.dropna(subset=['tweet_text'])

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the batches in
# the same order (1, 2, 3). Each batch keeps its own index, so index labels
# may repeat in df_raw.
df_raw = pd.concat([df_1, df_2, df_3])

print(df_raw.shape)


(7500, 19)


Data Cleaning

In [None]:
# Ads to drop: tweets mention them, but they are not available in the ad
# annotations file.
ads_remove = ['commercials', 'joe biden', 'pizzahut', 'joe bieden', 'michael bloomberg', 'mike bloomberg', 
              'scientology','papa johns',  'bakari',  'secret', 'dashlane', 'bernie the peoples perfume',
              'ram trucks', 'golden gronks', "bush's best", 'ragged old flag', 'patience', 'guitar hero',
              'disney mulan']

df_raw['ad_manual_adjusted'] = df_raw['ad_manual_adjusted'].apply(lambda x: x.lower())

# Ad names that were misspelled / inconsistently spaced during manual
# annotation, mapped to the canonical name. Halftime-show teasers 2 and 3 are
# folded into teaser_1.
ad_renames = {
    "discover card  no we don‚äôt charge annual fees": "discover card  no we don’t charge annual fees",
    "doritos the cool ranch": "doritos  the cool ranch",
    "discover card yes we're accepted": "discover card  yes we’re accepted",
    "discover card yes we’re accepted": "discover card  yes we’re accepted",
    "discover card  yes we're accepted": "discover card  yes we’re accepted",
    "budweiser typical american": "budweiser  typical american",
    'fox  halftime show  teaser_3': "fox  halftime show  teaser_1",
    'fox  halftime show  teaser_2': "fox  halftime show  teaser_1",
}
for wrong_name, fixed_name in ad_renames.items():
  df_raw.loc[df_raw.ad_manual_adjusted == wrong_name, "ad_manual_adjusted"] = fixed_name

print(df_raw.shape)

# Split with one boolean mask instead of appending row by row: the loop was
# quadratic, relied on DataFrame.append (removed in pandas 2.0) and lost the
# original column dtypes. `.copy()` makes both results independent frames so
# later column assignments do not raise SettingWithCopyWarning.
is_removed = df_raw['ad_manual_adjusted'].isin(ads_remove)
df = df_raw[~is_removed].copy()
removed_Data = df_raw[is_removed].copy()
print(df.shape)


(7500, 19)
(7394, 19)


In [None]:
# Task variant used by the cells below. The exact strings compared against are:
#   'binary'        -> ad-related vs. not ad-related (2 labels)
#   'sent_exploded' -> tweet paired with an ad's keywords, match vs. no match (2 labels)
#   any other value -> multi-class classification over ad labels
classType = 'sent_exploded' # one of: 'binary', 'sent_exploded', or anything else for multi-class

In [None]:
from nltk.corpus import stopwords
import re
import nltk
nltk.download('stopwords')

stop = stopwords.words('english')

# For every task except the binary one, also drop the generic advertising
# words, since almost all ads contain them.
# `!=` compares string values; the previous `is not 'binary'` compared object
# identity, which only works by accident of string interning and raises a
# SyntaxWarning on Python 3.8+.
if classType != 'binary':
  stop.extend(['commercial', 'ad', 'commercials', 'ads'])
print(len(stop))

def removeMentions(text):
    """Detach the first '@' mention from the surrounding text.

    The text is split at the first '@'. The '@' itself is replaced by a single
    space and every ':' in the part after it is removed (so "@user: hi"
    becomes " user hi"). Later '@' characters are left untouched, and text
    before the first '@' is returned unchanged. When there is no '@', the
    input is returned with one trailing space appended.

    Args:
        text: the tweet text.

    Returns:
        The text with the first '@' replaced by a space and colons stripped
        from everything after it.
    """
    # One partition call yields both halves; the separator is discarded.
    before_mention, _, after_mention = text.partition("@")
    after_mention = after_mention.replace(":", "")
    return before_mention + " " + after_mention

def cleanTweet(strinp):
    """Normalise a tweet (or a keyword string) to lower-case alphabetic words.

    Steps, in order: drop the retweet marker "RT", lower-case, remove stop
    words, blank out URLs, detach the first @mention, blank out non-ASCII
    characters, delete punctuation (joining the pieces, e.g. "james_bond" ->
    "jamesbond"), replace every remaining non-letter with a space, and remove
    stop words again.

    Args:
        strinp: raw text.

    Returns:
        The cleaned words joined by single spaces.
    """
    # Set membership is O(1) per word; `stop` itself is a list.
    stop_words = set(stop)

    # Match "RT" only as a standalone token. The unanchored pattern also
    # deleted the letters from upper-case words ("POP-TARTS" -> "POP-TAS",
    # "MORTGAGE" -> "MOGAGE").
    strinp = re.sub(r'\bRT\b', "", strinp)
    strinp = strinp.lower()

    stop_removed = ' '.join(word for word in strinp.split() if word not in stop_words)
    text = re.sub('https?://[A-Za-z0-9./]+', ' ', stop_removed) # Remove URLs
    text = removeMentions(text)
    text = re.sub('[^\x00-\x7F]+', ' ', text) # Remove non-ASCII chars.

    # These characters are deleted outright (no space inserted), including
    # '-' and '_', so hyphenated / underscored tokens are fused into one word.
    punctuation = ['(', ')', '[',']','?', ':', ':', ',', '.', '!', '/', '"', "'", '@', '#', '&', '-', '_']
    text = "".join(char for char in text if char not in punctuation)
    text = re.sub('[^a-zA-Z]', ' ', text) # everything else that is not a letter becomes a space

    # Second pass: punctuation removal can expose new stop words.
    return ' '.join(word for word in text.split() if word not in stop_words)

# Smoke test on a sample retweet with a mention, a number and an underscore.
print(cleanTweet("RT @cadillacabc: Joinrt the 31K james_bond") )

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
183
cadillacabc joinrt k jamesbond


In [None]:
# Cleaned tweet text used as model input.
df['text_clean'] = df['tweet_text'].apply(cleanTweet)
df['ad_manual_adjusted'] = df['ad_manual_adjusted'].str.lower()
# Binary target: 1 when the tweet is annotated with any ad, 0 for 'none'.
df['ad_related'] = (df['ad_manual_adjusted'] != 'none').astype(int)

# Drop tweets whose annotation contains a comma.
# `.copy()` keeps the filtered frame independent so later column assignments
# do not trigger SettingWithCopyWarning.
comma_filter = ~df['ad_manual_adjusted'].str.contains(',', regex=False)
df = df[comma_filter].copy()

In [None]:
# Keep the pre-deduplication frame around for reference.
df_with_dupes = df
# Tweets whose cleaned text is identical are collapsed to their first
# occurrence. `.copy()` makes `df` an independent frame: later cells add
# columns to it, which otherwise emits SettingWithCopyWarning.
df_unique = df.drop_duplicates(subset=['text_clean']).copy()
df = df_unique

print(df_with_dupes.shape)
print(df.shape)

(7332, 21)
(5824, 21)


In [None]:
#tweets that are not manually annotated
remaining_tweets = pd.read_csv("./remaining_tweets_non_man.csv")
print(remaining_tweets.columns)

# Apply the same cleaning as for the annotated tweets.
remaining_tweets['text_clean'] = remaining_tweets['tweet_text'].map(cleanTweet)
print(remaining_tweets.shape)

# Collapse tweets whose cleaned text is identical.
remaining_tweets = remaining_tweets.drop_duplicates(subset=['text_clean'])
print(remaining_tweets.shape)

(108235, 10)
(52085, 10)


In [None]:
ad_product_df = (
    pd.read_csv('./SB_ad_annotations_product_category_modified.csv')
    .rename(columns={'Ad Name': 'Ad_Name'})  # remove the space from the column name
    .dropna()                                # the file has trailing empty rows
)
print(ad_product_df.shape)
# Drop the 'FOX  Halftime Show  Teaser_2' ad because its keywords duplicate
# another halftime-show teaser (presumably teaser_1, which the teaser_2/3
# annotations were relabelled to - TODO confirm).
# `.copy()` keeps the result an independent frame for the assignments below.
ad_product_df = ad_product_df[ad_product_df['Ad_Name'] != 'FOX  Halftime Show  Teaser_2'].copy()
print(ad_product_df.shape)

# Lower-cased ad name -> lower-cased product bucket.
ad_product_df['Product_modified'] = ad_product_df['Product_modified'].str.lower()
ad_product_dict = dict(zip(ad_product_df['Ad_Name'].str.lower(), ad_product_df['Product_modified']))
ad_product_dict['none'] = 'none'
print(ad_product_dict)

# Direct indexing (not .get) so an annotated ad missing from the table raises
# KeyError instead of silently producing a missing value.
df['product_modified'] = df['ad_manual_adjusted'].apply(lambda ad: ad_product_dict[ad])


(75, 11)
(74, 11)
{'fast & furious 9  trailer': 'movie trailer', 'quibi  bank heist': 'tech company', 'tide  when is later  masked singer': 'laundry detergent', 'fox  a run at history  daytona 500': 'sporting event', 'donald j. trump for president  criminal justice reform': 'political campaign', 'walmart  famous visitors': 'tech company', 'marvel  black widow trailer': 'movie trailer', 'rocket mortgage  home': 'money', 'porsche  the heist': 'car', 'snickers  fix the world': 'food', 'hulu  tom brady s big announcement': 'streaming service', 'fox  chosen  lego masters': 'tv show/network', 'mountain dew': 'pop/soda', 'squarespace  winona in winona': 'tech company', 'new york life  love takes action': 'money', 'fox  super monday': 'tv show/network', 'hyundai  smaht pahk': 'car', 'cheetos  can t touch this': 'food', 'olay  make space for women': 'charitable program', 'fox  halftime show  teaser_1': 'half-time show', 'michelob  6 for 6-pack': 'charitable program/ alcohol', 'avocados from mex

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# group the keywords of ads part of a product bucket and append these to the training sentences
# Concatenate the round-2 key terms of all ads that share a product bucket.
# The result is a Series indexed by product bucket (despite the `_dict` name).
ad_product_keywords_dict = (
    ad_product_df
    .groupby('Product_modified')['Key Terms  Round 2']
    .agg(lambda x : x.sum() if x.dtype=='float64' else ' '.join(x))
)
# Clean the keywords the same way tweets are cleaned.
# TODO: duplicate words are not removed here.
ad_product_keywords_dict = ad_product_keywords_dict.apply(cleanTweet)
ad_product_df['product_modified_keywords'] = ad_product_df['Product_modified'].map(ad_product_keywords_dict)
ad_product_df.head(2)


Unnamed: 0,Ad Number,Brand Name,Ad_Name,Product_modified,Product,Key Terms Round 1,Key Terms Round 2,Excitatory Potential,Emotional vs. Rational,Semantic Affinity,Valence,product_modified_keywords
0,1.0,Trailer,Fast & Furious 9 Trailer,movie trailer,Movie Trailer,"fast and the furious, fast & the furious, fast...","fast and the furious, fast & the furious, ff9,...",1.0,1.0,2.0,1.0,fast furious fast furious ff f fast saga vin d...
1,2.0,Quibi,Quibi Bank Heist,tech company,Video Platform,"quibi, bank heist, robbery, less than ten minu...","quibi, bank heist, robbery, less than ten minu...",2.0,1.0,2.0,2.0,quibi bank heist robbery less ten minutes quic...


In [None]:
# generate adname, ad keywords dict to use in sent exploding
# Lower-cased ad name -> cleaned keyword string; used when exploding sentences.
ad_name_keywords_dict = {
    ad_name.lower(): cleanTweet(key_terms)
    for ad_name, key_terms in zip(ad_product_df['Ad_Name'], ad_product_df['Key Terms  Round 2'])
}
ad_name_keywords_dict['none'] = 'none'
print(ad_name_keywords_dict)

# Reverse lookup: keyword string -> ad name. If two ads share the same cleaned
# keyword string, the later ad overwrites the earlier one.
ad_keywords_name_dict = {keywords: ad_name for ad_name, keywords in ad_name_keywords_dict.items()}
print(ad_keywords_name_dict)

ad_product_df['ad_name_keywords'] = ad_product_df['Ad_Name'].str.lower().map(ad_name_keywords_dict)

{'fast & furious 9  trailer': 'fast furious fast furious ff f fast saga vin diesel flying truck stunts michelle rodriguez fastfurious', 'quibi  bank heist': 'quibi bank heist robbery less ten minutes quick bites big stories chance rapper megan thee stallion chancetherappermeganstallion quickbites', 'tide  when is later  masked singer': 'tide laundry laundry detergent schitt emily hampshire charlie day walts', 'fox  a run at history  daytona 500': 'fox show daytona run history great american race', 'donald j. trump for president  criminal justice reform': 'donald trump trump change unemployment stronger safer prosperous trumpforpresident donaldtrump presidentrumpobama gop trumpsupporters', 'walmart  famous visitors': 'walmart pickup spaceship toy story buzz lightyear marvin martian marvin martians arrival glass cleaners aliens men black groot lego star wars r bill bill ted flash gordon', 'marvel  black widow trailer': 'marvel black widow scarlett johansson', 'rocket mortgage  home': 'qu

In [None]:
# Write the ad table, including the keyword columns added above, to disk.
ad_product_df.to_csv('./ad_product_df.csv')

In [None]:
import pandas as pd

# Allow up to 100 rows when pandas objects are displayed.
pd.set_option('display.max_rows', 100) 
# Number of tweets in `df` per annotated ad, saved to disk for inspection.
ad_count = df.groupby('ad_manual_adjusted')['ad_manual_adjusted'].count()
ad_count.to_csv('./ad_count.csv')

In [None]:
# use column name 'ad_manual_adjusted' of df to 
def get_ad_related_twts(df, removeCommas = True):
  """Return the rows of `df` that are annotated with an ad.

  Args:
    df: frame with a string column 'ad_manual_adjusted'.
    removeCommas: when True, also drop rows whose annotation contains a comma.

  Returns:
    A new frame with the selected rows and 'ad_manual_adjusted' lower-cased.
    The input frame is left untouched (it used to be lower-cased in place,
    which also raised SettingWithCopyWarning when `df` was a slice).
  """
  lowered = df['ad_manual_adjusted'].str.lower()
  keep = lowered != 'none'
  if removeCommas:
    keep = keep & ~lowered.str.contains(',', regex=False)
  keep = keep.to_numpy()
  # Positional mask and raw values avoid index alignment, so frames with
  # repeated index labels are handled as well.
  ad_related_twts = df[keep].copy()
  ad_related_twts['ad_manual_adjusted'] = lowered.to_numpy()[keep]
  return ad_related_twts

def getAdTweets(ad_related_twts, ad):
  """Count the rows of `ad_related_twts` annotated with exactly `ad`."""
  matches = ad_related_twts.ad_manual_adjusted == ad
  return int(matches.sum())

def get_ad_id_dict(ad_related_twts): 
  """Assign consecutive integer ids to ads that have at least two tweets.

  Ads are numbered 0..k-1 in alphabetical order, so the mapping is identical
  on every run (iterating a `set` of strings, as before, gives an order that
  changes between kernel sessions). 'none' always receives the last id, k.
  Ads with fewer than two tweets get no id and are reported on stdout.

  Args:
    ad_related_twts: frame with a column 'ad_manual_adjusted'.

  Returns:
    dict mapping ad name -> id, in id order, with 'none' as the final entry.
  """
  n_ad_related = ad_related_twts.shape[0]
  print("# ad related tweets: "+ str(n_ad_related))
  # One pass over the column instead of one filter per ad.
  ad_counts = ad_related_twts.ad_manual_adjusted.value_counts()
  print("unique ads:"+ str(len(ad_counts)))
  ad_id_dict = {}
  for ad in sorted(ad_counts.index):
    if ad_counts[ad] >= 2:
      ad_id_dict[ad] = len(ad_id_dict)
    else:
      print('ad with <2 samples: '+ str(ad))
  print(" No of ads with >=2 samples:"+ str(len(ad_id_dict)))
  ad_id_dict['none'] = len(ad_id_dict)
  print(ad_id_dict)
  return ad_id_dict

def convertAdNameToAdId(ad_id_dict, ad_name):
  """Look up the id of `ad_name`; names without an id map to the 'none' id."""
  # The 'none' entry is only consulted when the name itself is missing.
  return ad_id_dict[ad_name] if ad_name in ad_id_dict else ad_id_dict['none']

ad_related_twts = get_ad_related_twts(df)
# Build the ids from the frame on the previous line rather than filtering
# `df` a second time.
ad_id_dict = get_ad_id_dict(ad_related_twts)
n_unique_ads = len(ad_id_dict) # ad_id_dict has none as well, so minus 1 when using embeddings
# Ads without an id (fewer than two tweets) fall back to the 'none' id.
df['ad_manual_adjusted_id'] = df['ad_manual_adjusted'].apply(lambda x: convertAdNameToAdId(ad_id_dict,x))
print(n_unique_ads)


# ad related tweets: 2597
unique ads:63
ad with <2 samples: tide  ww
ad with <2 samples: fast & furious 9  trailer
ad with <2 samples: discover card  no we don’t charge annual fees
 No of ads with >=2 samples:60
{'snickers  fix the world': 0, 'amazon prime video  hunters': 1, 'pepsi zero sugar  zero sugar. done right.': 2, 'fox  super monday': 3, 'procter & gamble  when we come together': 4, 'genesis  going away party': 5, 't-mobile  mama tests 5g': 6, 'pop-tarts  pop-tarts fixed the pretzel commercial': 7, 'porsche  the heist': 8, 'no time to die  trailer': 9, 'tide  bud knight': 10, 'planters  baby funeral': 11, 'fox  a run at history  daytona 500': 12, 'tide  finally later': 13, 'michelob  6 for 6-pack': 14, 'toyota  heroes': 15, 'jeep  groundhog day [t1]': 16, 'google assistant  loretta': 17, 'heinz ketchup  find the goodness  four at once': 18, 'verizon  the amazing things 5g won t do': 19, 'turbotax  turbotax  all people are tax people remix': 20, 'weathertech  lucky dog': 21, 'k

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# Chooses the label space used by the split and explode cells below:
# False -> ad names (`ad_manual_adjusted_id`), True -> product buckets (`ad_product_id`).
classification_ad_product = False # set to True if classification labels are ad products instead of ad names

In [None]:
# Number the product buckets consecutively, in the order `unique()` returns them.
products = df['product_modified'].unique()
product_id_dict = {product: idx for idx, product in enumerate(products)}

n_unique_ad_produts = len(product_id_dict)

# When classifying by product, the number of classes is the number of buckets.
if classification_ad_product:
  n_unique_ads = n_unique_ad_produts

df['ad_product_id'] = df['product_modified'].map(product_id_dict)

print(n_unique_ads)

61


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# train test split for multi class classification
from sklearn.model_selection import train_test_split

n = df.shape[0]
# Choose the label column once instead of duplicating the split call:
# product-bucket ids when classifying by product, ad-name ids otherwise.
if classification_ad_product:
  split_labels = df.ad_product_id.values
else:
  split_labels = df.ad_manual_adjusted_id.values

# Stratified 80/20 split. The fixed seed makes the split (and everything
# trained on it) reproducible across kernel restarts.
sentences, test_sentences, labels, test_labels = train_test_split(
    df.text_clean.values,
    split_labels,
    test_size=0.2,
    stratify=split_labels,
    random_state=2018,
)

train_size = len(sentences)
test_size = len(test_sentences)
print( "Train size: "+ str(train_size)+" test size:" + str(test_size))
                

Train size: 4659 test size:1165


In [None]:
# Sanity check: number of labels in the held-out test split.
print(len(test_labels))

1165


In [None]:
# Sanity check: number of labels in the training split.
print(len(labels))

4659


In [None]:
def explode_data(sentences, labels, classification_ad_product):
  """Turn labelled tweets into (tweet + label keywords) pairs with 3 classes.

  A tweet whose label is an ad/product is emitted once per non-'none' label
  as "<tweet>. <keywords of that label>", with class 1 for the tweet's own
  label and 0 for every other label. A tweet labelled 'none' is emitted once,
  unchanged, with class 2.

  Args:
    sentences: sequence of cleaned tweet strings.
    labels: sequence of integer label ids, parallel to `sentences`. Ids index
      into the key order of `product_id_dict` / `ad_id_dict`.
    classification_ad_product: True to use product buckets
      (`product_id_dict`, `ad_product_keywords_dict`), False to use ad names
      (`ad_id_dict`, `ad_name_keywords_dict`).

  Returns:
    (sentences_exploded, labels_exploded) as two parallel lists.
  """
  if classification_ad_product:
    label_name_list = list(product_id_dict.keys())
    keywords_by_label = ad_product_keywords_dict
  else:
    label_name_list = list(ad_id_dict.keys())
    keywords_by_label = ad_name_keywords_dict
  # Compare with `!=`: the previous `is not 'none'` tested object identity and
  # only worked when the interpreter happened to intern both strings.
  candidate_names = [name for name in label_name_list if name != 'none']

  sentences_exploded = []
  labels_exploded = []
  for i, curr_sent in enumerate(sentences):
    curr_label_name = label_name_list[labels[i]]
    if curr_label_name == 'none':
      sentences_exploded.append(curr_sent)
      labels_exploded.append(2)
      continue
    for name in candidate_names:
      sentences_exploded.append(curr_sent + ". " + keywords_by_label[name])
      labels_exploded.append(1 if name == curr_label_name else 0)
  return sentences_exploded, labels_exploded


In [None]:
# explode to only 0,1 classes
def explode_data_to_two_classes(sentences, labels, classification_ad_product):
  """Turn labelled tweets into (tweet + label keywords) pairs with 2 classes.

  Every tweet, including those labelled 'none', is emitted once per
  non-'none' label as "<tweet>. <keywords of that label>". The class is 1
  when that label is the tweet's own label and 0 otherwise, so a 'none' tweet
  yields only 0s.

  Args:
    sentences: sequence of cleaned tweet strings.
    labels: sequence of integer label ids, parallel to `sentences`. Ids index
      into the key order of `product_id_dict` / `ad_id_dict`.
    classification_ad_product: True to use product buckets
      (`product_id_dict`, `ad_product_keywords_dict`), False to use ad names
      (`ad_id_dict`, `ad_name_keywords_dict`).

  Returns:
    (sentences_exploded, labels_exploded) as two parallel lists.
  """
  if classification_ad_product:
    label_name_list = list(product_id_dict.keys())
    keywords_by_label = ad_product_keywords_dict
  else:
    label_name_list = list(ad_id_dict.keys())
    keywords_by_label = ad_name_keywords_dict
  # Compare with `!=`: the previous `is not 'none'` tested object identity and
  # only worked when the interpreter happened to intern both strings.
  candidate_names = [name for name in label_name_list if name != 'none']

  sentences_exploded = []
  labels_exploded = []
  for i, curr_sent in enumerate(sentences):
    curr_label_name = label_name_list[labels[i]]
    for name in candidate_names:
      sentences_exploded.append(curr_sent + ". " + keywords_by_label[name])
      labels_exploded.append(1 if name == curr_label_name else 0)

  return sentences_exploded, labels_exploded


In [None]:
def explode_sentences_to_two_classes(sentences, classification_ad_product):
  """Pair every (unlabelled) tweet with the keywords of every non-'none' label.

  Each tweet is emitted once per label as "<tweet>. <keywords of that label>",
  in the key order of `product_id_dict` / `ad_id_dict`.

  Args:
    sentences: sequence of cleaned tweet strings.
    classification_ad_product: True to use product buckets
      (`product_id_dict`, `ad_product_keywords_dict`), False to use ad names
      (`ad_id_dict`, `ad_name_keywords_dict`).

  Returns:
    List of exploded sentences, len(sentences) * (number of non-'none' labels) long.
  """
  if classification_ad_product:
    label_name_list = list(product_id_dict.keys())
    keywords_by_label = ad_product_keywords_dict
  else:
    label_name_list = list(ad_id_dict.keys())
    keywords_by_label = ad_name_keywords_dict
  # Compare with `!=`: the previous `is not 'none'` tested object identity and
  # only worked when the interpreter happened to intern both strings.
  candidate_names = [name for name in label_name_list if name != 'none']

  sentences_exploded = []
  for curr_sent in sentences:
    for name in candidate_names:
      sentences_exploded.append(curr_sent + ". " + keywords_by_label[name])

  return sentences_exploded

In [None]:
# Only the sentence-pair task needs the exploded train and test sets.
if classType == 'sent_exploded':
  sentences_exploded, labels_exploded = explode_data_to_two_classes(
      sentences, labels, classification_ad_product)
  print(f"size of train data after exploding: {len(sentences_exploded)}")
  test_sentences_exploded, test_labels_exploded = explode_data_to_two_classes(
      test_sentences, test_labels, classification_ad_product)
  print(f"size of test data after exploding: {len(test_sentences_exploded)}")


size of train data after exploding: 279540
size of test data after exploding: 69900


In [None]:
  # Pair every un-annotated tweet with the keywords of every label.
  # NOTE(review): this cell is indented although no enclosing `if` is visible;
  # confirm whether it was meant to sit under the `classType` check above.
  rem_sentences_exploded = explode_sentences_to_two_classes(remaining_tweets.text_clean.values, classification_ad_product)
  print(len(rem_sentences_exploded))

3125100


In [None]:
# train test split for binary classification
import numpy as np
# 80/20 split on the binary ad-related target. The fixed seed makes the split
# reproducible across kernel restarts. Note this split is drawn independently
# of the multi-class split above, so the two test sets differ.
binary_sentences, binary_test_sentences, binary_labels, binary_test_labels = train_test_split(
    df.text_clean.values,
    df.ad_related.values,
    test_size=0.2,
    random_state=2018,
)
def get_bin_from_multi_class(labels):
  """Collapse multi-class ad ids to binary: 0 for the 'none' id, 1 for any ad."""
  return [0 if ad_id == ad_id_dict['none'] else 1 for ad_id in labels]

# binary_sentences = sentences
# binary_test_sentences = test_sentences
# print(len(ad_id_dict))
# print(labels)
# binary_labels = get_bin_from_multi_class(labels)
# binary_test_labels = get_bin_from_multi_class(test_labels)

# Report the sizes of the binary split.
binary_train_size, binary_test_size = len(binary_sentences), len(binary_test_sentences)
print(f"Total data set size: {df.shape[0]}, train size: {binary_train_size}, test size: {binary_test_size}")

Total data set size: 5824, train size: 4659, test size: 1165


Transform our dataset into the format that BERT can be trained on

In [None]:
from transformers import BertTokenizer
# Load the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
print('Loaded BERT tokenizer.')
# Iterating the vocab mapping yields its tokens (the keys).
vocab_tokens = list(tokenizer.vocab)
print(f"Original Vocab size: {len(vocab_tokens)}")
 

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…


Loaded BERT tokenizer.
Original Vocab size: 30522


In [None]:
import nltk
# Fetch the WordNet corpus; the WordNetLemmatizer created below relies on it.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance, used by get_lemmatize_words.
wordnet_lemmatizer = WordNetLemmatizer()

def get_lemmatize_words(plural_words):
  """Return the lemma of each word, in input order, using `wordnet_lemmatizer`."""
  return [wordnet_lemmatizer.lemmatize(word) for word in plural_words]

# Quick demonstration on regular plurals, irregular plurals and a proper noun.
words = "apples sheep oranges cats people dice pence trump".split()
lem_words = get_lemmatize_words(words)
print(lem_words)

['apple', 'sheep', 'orange', 'cat', 'people', 'dice', 'penny', 'trump']


In [None]:
# Challenge: some tokens are written as one word although they would be two or more legitimate words if separated, e.g.
# lightyear, jasonmamoa, trumpsupporters, condimentfood, fixtheworld, budlightbudweiser, microsoftsurface, femalecoach, typicalamerican
# poptarts, quickbites, googleassistant, smahtpark, spacewalk, letitgo, bettahwaytopark, trumpforpresident, hardrockstadium,
# alpacinoprimevideo, femalecoach, inspirechange, motorshummer, newyorklife, stainlaundrydetergent

def get_missing_words_vocab(tokenizer, words):
  """List the entries of `words` that are absent from the tokenizer vocabulary.

  Order and duplicates of `words` are preserved. The number of input words
  and the number of missing words are printed.
  """
  known_tokens = tokenizer.get_vocab()
  print("Actual word list of ad tokens:"+ str(len(words)))
  missing_words = [word for word in words if word not in known_tokens]
  print(str(len(missing_words))+" missing_words")
  return missing_words

# missing_words = get_missing_words_vocab(tokenizer, keys_to_add)
# print(missing_words)

In [None]:
# Tokenize all of the sentences and map the tokens to their word IDs.
def get_input_ids(sentences):
  """Encode each sentence into a list of token ids with the global `tokenizer`.

  `encode` tokenizes the sentence, adds the special `[CLS]` / `[SEP]` tokens
  (add_special_tokens=True) and maps tokens to ids. No truncation or padding
  happens here, so the returned lists have different lengths; padding is
  applied in a later cell.
  """
  return [
      tokenizer.encode(sent, add_special_tokens = True)
      for sent in sentences
  ]

# Pick the training sentences that match the task, then encode them once.
# Only the chosen variable is read, so the others need not exist.
if classType == 'binary':
  print('binary')
  sentences_to_encode = binary_sentences
elif classType == 'sent_exploded':
  print('sent_exploded')
  sentences_to_encode = sentences_exploded
else:
  print('multi class')
  sentences_to_encode = sentences
input_ids = get_input_ids(sentences_to_encode)

print(len(input_ids))


sent_exploded
279540


In [None]:
# binary_trained_embeddings = model.get_input_embeddings()
# print(binary_trained_embeddings)


In [None]:
from transformers import BertForSequenceClassification, AdamW, BertConfig, BertModel
# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
# The binary task and the sentence-pair task are both two-way decisions
# (for the latter: 0 = appended ad does not match, 1 = it matches); the
# multi-class task has one output per ad label.
if classType in ('binary', 'sent_exploded'):
  num_labels = 2
else:
  print(n_unique_ads)
  num_labels = n_unique_ads

# Pretrained 12-layer uncased BERT with a linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
    output_attentions=False,    # do not return attention weights
    output_hidden_states=True,  # return all hidden states
)
# Run the model on the GPU when one was detected earlier.
if isGPUavailable:
  model.cuda()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# model.set_input_embeddings(binary_trained_embeddings)

In [None]:
# Longest encoded sentence, counted in token ids (special tokens included).
print('Max sentence length: ', max(map(len, input_ids)))

Max sentence length:  88


In [None]:
# Choose MAX_LEN according to the task (see below) and apply the padding.
# We'll borrow the `pad_sequences` utility function to do this.
from keras.preprocessing.sequence import pad_sequences
# Set the maximum sequence length.
# I've chosen MAXLEN somewhat arbitrarily. It's slightly larger than the max training sentence length
# Exploded inputs carry appended keywords, so they need more room: product
# buckets concatenate the keywords of several ads (300), single ads need less
# (100); plain tweets use 40.
# The previous `~classification_ad_product` was a bitwise NOT on a bool
# (-1 or -2, both truthy), so that branch never actually tested the flag.
if classType == 'sent_exploded':
  MAX_LEN = 300 if classification_ad_product else 100
else:
  MAX_LEN = 40
print("MAX_LEN:"+ str(MAX_LEN))
print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)
print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))
# Pad our input tokens with value 0.
# "post" indicates that we want to pad and truncate at the end of the sequence,
# as opposed to the beginning.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", 
                          value=0, truncating="post", padding="post")
print('\nDone.')

MAX_LEN:100

Padding/truncating all sentences to 100 values...

Padding token: "[PAD]", ID: 0

Done.


The attention mask simply makes it explicit which tokens are actual words versus which are padding.
In the BERT vocabulary, ID 0 is reserved for the `[PAD]` token, so if a token ID is 0, then it’s padding, and otherwise it’s a real token.

In [None]:
# Create attention masks
# One mask per padded sentence: 1 where the token ID is > 0 (a real token),
# 0 where it is 0 (padding).
attention_masks = [
    [int(token_id > 0) for token_id in sent]
    for sent in input_ids
]

In [None]:
# Training and validation split.

from sklearn.model_selection import train_test_split

# Pick the label array that matches the active classification task.
if classType == 'binary':
  input_labels = binary_labels
elif classType == 'sent_exploded':
  input_labels = labels_exploded
else:
  input_labels = labels

# Use 90% for training and 10% for validation.
# Inputs, labels and attention masks MUST be split in a single call so that
# all three are shuffled with the same permutation. Splitting the masks in a
# second, independently shuffled call (with no fixed random_state) pairs each
# sentence with the attention mask of a different sentence.
(train_inputs, validation_inputs,
 train_labels, validation_labels,
 train_masks, validation_masks) = train_test_split(
    input_ids, input_labels, attention_masks,
    # random_state=2018,  # uncomment for a reproducible split
    test_size=0.1)

In [None]:
# Convert every split (inputs, labels, masks) into a torch tensor, the
# datatype the model and the DataLoaders below work with.
(train_inputs, validation_inputs,
 train_labels, validation_labels,
 train_masks, validation_masks) = (
    torch.tensor(split)
    for split in (train_inputs, validation_inputs,
                  train_labels, validation_labels,
                  train_masks, validation_masks)
)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Batch size shared by both loaders. For fine-tuning BERT on a specific task,
# the authors recommend a batch size of 16 or 32.
batch_size = 32

# Bundle (input ids, attention masks, labels) per split.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)

# Training batches are drawn in random order; validation batches in sequence.
train_sampler = RandomSampler(train_data)
validation_sampler = SequentialSampler(validation_data)

train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size, sampler=validation_sampler)

Let’s load BERT! There are a few different pre-trained BERT models available. “bert-base-uncased” is the version that uses only lowercase letters (“uncased”) and is the smaller of the two sizes (“base” vs. “large”).



In [None]:
# Collect the model's parameters as (name, tensor) pairs and print the name
# and shape of a few representative groups.
params = list(model.named_parameters())
print('The BERT model has {:} different named parameters.\n'.format(len(params)))

param_sections = (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
)
for heading, section in param_sections:
    print(heading)
    for param_name, param_tensor in section:
        print("{:<55} {:>12}".format(param_name, str(tuple(param_tensor.size()))))


The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

Optimizer & Learning Rate Scheduler:
Now that we have our model loaded we need to grab the training hyperparameters from within the stored model.
For the purposes of fine-tuning, the authors recommend choosing from the following values:
Batch size: 16, 32 (We chose 32 when creating our DataLoaders).
Learning rate (Adam): 5e-5, 3e-5, 2e-5 (We’ll use 2e-5).
Number of epochs: 2, 3, 4 (We’ll use 4).
The epsilon parameter eps = 1e-8 is “a very small number to prevent any division by zero in the implementation”




In [None]:
# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )
from transformers import get_linear_schedule_with_warmup
# Number of training epochs (authors recommend between 2 and 4)
epochs = 4
# NOTE: `not` is required here. The previous `~classification_ad_product`
# applied bitwise NOT to a bool (-2 / -1, both truthy), so every
# 'sent_exploded' run was cut to 2 epochs regardless of the flag.
if classType == 'sent_exploded' and not classification_ad_product:
  epochs = 2 # reducing the no. of epochs since train time is 30 min per epoch giving a train accuracy of 99 % and loss of 0.05 in each epoch
# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

Define a helper function to find accuracy

In [None]:
import numpy as np
def flat_accuracy(preds, labels):
    """Return the fraction of rows in `preds` whose arg-max matches `labels`.

    `preds` is a 2-D array of per-class scores (one row per example) and
    `labels` is an array holding the expected class index of each example.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    expected_classes = labels.flatten()
    hits = predicted_classes == expected_classes
    return np.sum(hits) / len(expected_classes)

Helper function for formatting elapsed times.


In [None]:
import time
import datetime
def format_time(elapsed):
    '''
    Format a duration given in seconds as the string form of a
    `datetime.timedelta` (e.g. "0:01:05"), rounded to the nearest second.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

Kick off training

In [None]:
dont run this

import random
import numpy as np
import tensorflow as tf
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
# Seed every random number generator in use to make the run reproducible.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
# Average training loss of each epoch, kept so the learning curve can be plotted.
loss_values = []
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    # Measure how long the training epoch takes.
    t0 = time.time()
    # Reset the total loss for this epoch.
    total_loss = 0
    # `train()` only switches the *mode* (dropout etc. behave differently in
    # training vs. evaluation); it does not perform any training by itself.
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # `batch` holds three tensors -- [0] input ids, [1] attention masks,
        # [2] labels -- each of which is copied to the compute device.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step; PyTorch
        # accumulates them across backward passes otherwise.
        model.zero_grad()
        # Forward pass. Because `labels` is supplied, the first element of
        # the returned tuple is the loss.
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs[0]

        # `loss` is a single-value tensor; `.item()` extracts the Python number.
        total_loss += loss.item()
        # Backward pass to calculate the gradients.
        loss.backward()
        # Clip the gradient norm to 1.0 to help prevent exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Update the parameters, then advance the learning-rate schedule.
        optimizer.step()
        scheduler.step()

    # Average loss over all training batches of this epoch.
    avg_train_loss = total_loss / len(train_dataloader)
    loss_values.append(avg_train_loss)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================
    # After each training epoch, measure performance on the validation set.
    print("")
    print("Running Validation...")
    t0 = time.time()
    # Evaluation mode -- the dropout layers behave differently here.
    model.eval()
    # Tracking variables
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in validation_dataloader:
        # Move the batch to the compute device and unpack it.
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # No gradients are needed for validation; skipping them saves memory
        # and time.
        with torch.no_grad():
            # No labels are passed, so the first output element is the logits.
            # token_type_ids is the same as the "segment ids", which
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # The logits are the model outputs prior to any softmax.
        logits = outputs[0]
        # Move logits and labels to CPU.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Accumulate the number of correct predictions rather than the
        # per-batch accuracy: the final batch is usually smaller than the
        # others, and averaging batch accuracies would over-weight it.
        batch_examples = label_ids.size
        eval_accuracy += flat_accuracy(logits, label_ids) * batch_examples
        nb_eval_examples += batch_examples
        # Track the number of batches.
        nb_eval_steps += 1
    # Report the accuracy over all validation examples.
    print("  Accuracy: {0:.2f}".format(eval_accuracy / nb_eval_examples))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))
print("")
print("Training complete!")

In [None]:
# Inspect `label_ids` (the labels of the batch processed most recently):
# its length, its flattened length, and its contents.
label_count = len(label_ids)
print(' No of labels:{}'.format(label_count))
print(len(label_ids.flatten()))
print(label_ids)

In [None]:
import pickle

def save_model(model, tokenizer):
  """Pickle `model` and `tokenizer` into the current working directory.

  The model is written to './model.pickle' and the tokenizer to
  './tokenizer.pickle'; existing files are overwritten. Each file is opened
  in a `with` block so the handle is flushed and closed even if pickling
  raises.
  """
  trained_model = './model.pickle'
  tokenizer_model = './tokenizer.pickle'
  with open(trained_model, 'wb') as model_file:
    pickle.dump(model, model_file)
  with open(tokenizer_model, 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

save_model(model, tokenizer)
print('model saved')

In [None]:
import pickle

# Load a previously pickled model from the working directory.
# SECURITY: unpickling executes arbitrary code embedded in the file, so only
# load pickle files that you created yourself or otherwise fully trust.
with open('./model_smax_0_1.pickle', 'rb') as model_file:
    model = pickle.load(model_file)
print('model loaded')

model loaded


In [None]:
#visualize training loss
# import plotly.express as px
# f = pd.DataFrame(loss_values)
# f.columns=['Loss']
# fig = px.line(f, x=f.index, y=f.Loss)
# fig.update_layout(title='Training loss of the Model',
#                    xaxis_title='Epoch',
#                    yaxis_title='Loss')
# fig.show()

Check the model's behaviour on ads that it was not trained on

i.e. check whether the model is able to classify tweets about such ads as ad related, even though these ads were not present in the training data

In [None]:
# binary_test_sentences = removed_Data.tweet_text.values
# removed_Data['ad_related'] = removed_Data['ad_manual_adjusted'].apply(lambda ad: 0 if ad == 'none' else 1)
# binary_test_labels = removed_Data.ad_related.values

Run the model on tweets that are not manually annotated.
Use the variable `labels_present` to toggle between predictions on the test data (`True`) and predictions on data that has not been manually annotated (`False`).

In [None]:
#Uncomment this snippet when running multi class on binary classifier's output
# if classType == 'binary':
#   binary_test_sentences = input.tweet_text.values
# else:
#   binary_output =  pd.read_csv('./app_1_bin_class_no_man_ann.csv')
#   binary_output = binary_output[binary_output['ad_related']==1]
#   test_sentences = binary_output.tweet_text.values

# labels_present = False # for non manually annotated data
# print('labels not present')



32
labels not present


In [None]:
# Toggle read by the prediction cells: when True, ground-truth labels are
# added to the prediction dataset and collected alongside the predictions;
# when False, only input ids and attention masks are used.
labels_present = True

print('labels present')

In [None]:
import pandas as pd

# Select the sentences and labels to score for the active task.
if classType == 'binary':
    sentences = binary_test_sentences
    labels = binary_test_labels
elif classType == 'sent_exploded':
    sentences = test_sentences_exploded
    labels = test_labels_exploded
else:
    print('multi class')
    sentences = test_sentences
    labels = test_labels

# Encode each sentence: tokenize, add the `[CLS]` / `[SEP]` special tokens
# and map the tokens to their vocabulary IDs.
input_ids = [
    tokenizer.encode(sent, add_special_tokens=True)
    for sent in sentences
]
# Pad / truncate at the end so every sequence has exactly MAX_LEN entries.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN,
                          dtype="long", truncating="post", padding="post")
# Attention masks: 1.0 for a real token (ID > 0), 0.0 for padding.
attention_masks = [
    [float(token_id > 0) for token_id in seq]
    for seq in input_ids
]

# Convert to tensors.
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
# Set the batch size.
batch_size = 32
# Build the dataset, including the labels only when they are available.
if labels_present:
    prediction_labels = torch.tensor(labels)
    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
else:
    prediction_data = TensorDataset(prediction_inputs, prediction_masks)

# Batches are served in their original order so predictions line up with
# the input sentences.
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [None]:
# model_archive = model
# model = loaded_model

In [None]:
# Run the model over the prediction set and collect, per batch, the softmax
# class probabilities (and the true labels when they are available).
print('Predicting labels for {:,} test sentences.'.format(len(prediction_inputs)))
# Put model in evaluation mode
model.eval()
predictions = []
true_labels = []
for batch in prediction_dataloader:
    # Move every tensor of the batch to the compute device.
    batch = tuple(t.to(device) for t in batch)

    # The batch carries a third tensor (labels) only when labels are present.
    if labels_present:
        b_input_ids, b_input_mask, b_labels = batch
    else:
        b_input_ids, b_input_mask = batch

    # No gradients are needed for inference; this saves memory and time.
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask)
    logits = outputs[0]

    # Store class probabilities (softmax over the logits) as a CPU array.
    batch_probabilities = torch.nn.functional.softmax(logits, dim=1)
    predictions.append(batch_probabilities.detach().cpu().numpy())

    if labels_present:
        label_ids = b_labels.to('cpu').numpy()
        true_labels.append(label_ids)

print('DONE.')

In [None]:
def flatten_labels(true_labels):
  """Concatenate a list of per-batch label arrays into one flat list."""
  return [
      label
      for batch_labels in true_labels
      for label in batch_labels.flatten()
  ]

def calculate_pred_labels(predictions):
  """Turn per-batch class-score arrays into flat label and score lists.

  `predictions` is a list of 2-D arrays (one row per example, one column per
  class). For every example the index of the highest score becomes its
  predicted label and the highest score itself becomes its prediction score.

  Returns a tuple `(pred_labels, pred_scores)` of two flat lists.
  """
  print(" len of predictions: " + str(len(predictions)))
  pred_labels, pred_scores = [], []
  for batch_scores in predictions:
    pred_labels.extend(np.argmax(batch_scores, axis=1).flatten())
    pred_scores.extend(np.max(batch_scores, axis=1).flatten())
  return pred_labels, pred_scores

# Number of collected batches: first for the true labels, then for the predictions.
for batch_list in (true_labels, predictions):
  print(len(batch_list))

# Flatten the per-batch predictions into one label and one score per example.
pred_labels, pred_scores = calculate_pred_labels(predictions)
print(len(pred_labels))

# Flatten the true labels as well when they were collected.
if labels_present:
  true_labels_flat = flatten_labels(true_labels)
  print(len(true_labels_flat))

In [None]:
# input = pd.read_csv('./remaining_tweets_non_man.csv')
# When no ground-truth labels exist, attach the predictions to the source
# frame of the active task and write it to CSV.
if labels_present:
  print('labels are present, do nothing')
elif classType == 'binary':
  input['ad_related'] = pred_labels
  input.to_csv('./app_1_bin_class_no_man_ann.csv')
elif classType == 'multi-class':
  # Position in `ad_id_dict`'s key order -> ad name.
  ad_names_in_order = list(ad_id_dict.keys())
  binary_output['ad_id_predicted'] = pred_labels
  binary_output['ad_predicted'] = binary_output['ad_id_predicted'].apply(lambda ad_id: ad_names_in_order[ad_id])
  binary_output.to_csv('./app_1_multi_class_no_man_ann.csv')

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, classification_report, confusion_matrix

def getMetrics(true_labels_flat, pred_labels, averageType):
  """Print F1, precision, accuracy, recall and the confusion matrix.

  `averageType` is forwarded as the `average` argument of the F1, precision
  and recall scorers. When it is 'binary', the four cells of the confusion
  matrix are additionally printed as tp / tn / fp / fn.
  """
  print("Evaluating metrics as per '"+averageType+"' average type")
  print('f1_score:'+ str(f1_score(true_labels_flat, pred_labels, average=averageType)))
  print('Precision:'+ str(precision_score(true_labels_flat, pred_labels, average=averageType)))
  print("Accuracy: "+ str(accuracy_score(true_labels_flat, pred_labels)))
  print("recall: "+ str(recall_score(true_labels_flat, pred_labels, average=averageType)))

  confusionmatrix = confusion_matrix(true_labels_flat, pred_labels)
  print(confusionmatrix)
  if averageType != 'binary':
    return
  # Row-major order of the 2x2 matrix (rows = true, columns = predicted).
  tn, fp, fn, tp = confusionmatrix.ravel()
  print("tp: {} tn: {} fp: {} fn: {}".format(tp, tn, fp, fn))

# Averaging strategy passed to the scorers: 'binary' for the binary task,
# 'weighted' for every other task.
#   micro    : calculate metrics globally by counting the total true
#              positives, false negatives and false positives.
#   macro    : calculate metrics for each label and take their unweighted
#              mean; this does not take label imbalance into account.
#   weighted : calculate metrics for each label and average them weighted by
#              support (the number of true instances for each label). This
#              alters 'macro' to account for label imbalance; it can result
#              in an F-score that is not between precision and recall.
averageType = 'binary' if classType == 'binary' else 'weighted'

getMetrics(true_labels_flat, pred_labels, averageType)

# //Accuracy of multi class by taking inputs of bin
# Evaluating metrics as per 'weighted' average type
# f1_score:0.6933475888363791
# Precision:0.6771610659170494
# Accuracy: 0.7354694485842027
# recall: 0.7354694485842027

In [None]:
# Assemble one row per exploded test sentence: the raw text, the predicted
# label, the true label (when available) and the prediction score.
sent_exploded_results_df = pd.DataFrame()
sent_exploded_results_df['sentence_raw'] = test_sentences_exploded
sent_exploded_results_df['label_pred'] = pred_labels
if labels_present:
  sent_exploded_results_df['label_true'] = true_labels_flat
sent_exploded_results_df['pred_scores'] = pred_scores
# Split the raw text on "." into the sentence and the appended keywords.
# NOTE(review): the two-column assignment presumably requires every raw
# sentence to contain at most one "." -- confirm against how
# `test_sentences_exploded` is built.
sent_exploded_results_df[['sentence','keywords_appended']] = sent_exploded_results_df.sentence_raw.str.split(".",expand=True) 
print(sent_exploded_results_df.head())

# Map the (stripped) keyword text to its name via `ad_keywords_name_dict`;
# rows without a keyword part (None) are mapped to 'none'.
sent_exploded_results_df['keywords_appended_name'] = sent_exploded_results_df['keywords_appended'].apply(lambda x: 'none' if x is None else ad_keywords_name_dict[x.strip()])

sent_exploded_results_df.to_csv("./sent_exploded_results_df.csv")
print(sent_exploded_results_df.head())


In [None]:
def get_pred_label_name_score(df):
  """Collapse the rows of one sentence group into a single summary row.

  Prediction: among rows with `label_pred == 1`, the row with the highest
  `pred_scores` supplies the predicted name; with no such row the name is
  "none" and the score 0.
  Truth: every row with `label_true` of 1 or 2 overwrites the true label and
  name, so the last such row wins; with no such row they are 0 and "none".

  Returns a one-row DataFrame with the columns sentence, pred_label_name,
  max_pred_score, true_label and true_label_name.
  """
  best_score = 0
  best_name = "none"
  gold_label = 0
  gold_name = "none"

  for _, record in df.iterrows():
    predicted_positive = record['label_pred'] == 1
    if predicted_positive and record['pred_scores'] > best_score:
      best_score = record['pred_scores']
      best_name = record['keywords_appended_name']

    if record['label_true'] == 1:
      gold_label = 1
      gold_name = record['keywords_appended_name']
    elif record['label_true'] == 2:
      gold_label = 2
      gold_name = record['keywords_appended_name']

  # The group shares one sentence, so its first row stands for the group.
  return pd.DataFrame({
      'sentence': df['sentence'].iloc[0:1],
      'pred_label_name': best_name,
      'max_pred_score': best_score,
      'true_label': gold_label,
      'true_label_name': gold_name,
  })


In [None]:
def get_pred_2_label_name_score(df):
  """Collapse the rows of one sentence group into a single summary row.

  Prediction: among rows with `label_pred == 1`, the row with the highest
  `pred_scores` supplies the predicted name; with no such row the name is
  "none" and the score 0.
  Truth: every row with `label_true == 1` overwrites the true label and
  name, so the last such row wins; any other value leaves the defaults
  (0 and "none") in place.

  Returns a one-row DataFrame with the columns sentence, pred_label_name,
  max_pred_score, true_label and true_label_name.
  """
  best_score = 0
  best_name = "none"
  gold_label = 0
  gold_name = "none"

  for _, record in df.iterrows():
    predicted_positive = record['label_pred'] == 1
    if predicted_positive and record['pred_scores'] > best_score:
      best_score = record['pred_scores']
      best_name = record['keywords_appended_name']

    if record['label_true'] == 1:
      gold_label = 1
      gold_name = record['keywords_appended_name']

  # The group shares one sentence, so its first row stands for the group.
  return pd.DataFrame({
      'sentence': df['sentence'].iloc[0:1],
      'pred_label_name': best_name,
      'max_pred_score': best_score,
      'true_label': gold_label,
      'true_label_name': gold_name,
  })

In [None]:
# Summarise the exploded rows per sentence: group by the 'sentence' column
# and reduce each group with `get_pred_2_label_name_score`, storing the five
# returned columns under the names listed on the left-hand side.
sentence_results_df = pd.DataFrame()
sentence_results_df[['sentence_1', 'pred_label_name', 'max_pred_score', 'true_label', 'true_label_name']] = sent_exploded_results_df.groupby('sentence').apply(get_pred_2_label_name_score)

print(sentence_results_df.shape)
print(sentence_results_df.head())

In [None]:
# groups = sent_exploded_results_df.groupby('sentence')
# result = pd.DataFrame()
# for group in groups:
#   # print(group[1])
#   result = result.append(get_pred_label_name_score(group[1]))

# print(result.head())
# print(result.shape)
# result.to_csv('./sent_exp_grp_result_no_dupe.csv')

In [None]:
sentence_results_df.to_csv('./sent_exp_group_results.csv')

In [None]:
# import pandas as pd
# sentence_results_df = pd.read_csv('./sent_exp_group_results_smax_0_1.csv')
# sentence_results_df.head()

In [None]:
# use for sent_exploded classifier
# Builds a per-class report from the per-sentence true / predicted names.
from sklearn.metrics import classification_report
true_names = sentence_results_df['true_label_name']
pred_names = sentence_results_df['pred_label_name']
# NOTE: this rebinds the name `classification_report` from the imported
# function to its dict result; the import above restores the function each
# time this cell is run.
classification_report = classification_report(true_names, pred_names,output_dict=True)

# One row per class / summary entry after the transpose.
classification_report_df = pd.DataFrame(classification_report).transpose()
print(classification_report_df.head())
if classType == 'sent_exploded':
  classification_report_df.to_csv('./classification_report_sent_explode.csv')

In [None]:
# use only for multi class classification
from sklearn.metrics import classification_report
print(ad_id_dict)

def convert_ad_ids_to_names(pred_ad_ids):
  """Translate a sequence of label IDs into label names.

  The ID -> name lookup list depends on the notebook-level settings:
  the keys of `ad_id_dict` when `classification_ad_product` is false;
  otherwise the literal labels [0, 1, 2] for the 'sent_exploded' task and
  the keys of `product_id_dict` for any other task.

  Prints the (id, name) pairs that occur, sorted by name, and the number of
  items per name.

  Returns `(id_name_pairs, names)`: the sorted list of (id, name) tuples and
  the list holding the name of every input ID, in input order.
  """
  if not classification_ad_product:
    ordered_names = list(ad_id_dict.keys())
  elif classType == 'sent_exploded':
    ordered_names = [0, 1, 2]
  else:
    ordered_names = list(product_id_dict.keys())

  names_per_item = []
  seen_id_to_name = {}
  items_per_name = {}
  for label_id in pred_ad_ids:
    label_name = ordered_names[label_id]
    names_per_item.append(label_name)
    seen_id_to_name[label_id] = label_name
    items_per_name[label_name] = items_per_name.get(label_name, 0) + 1

  id_name_pairs = sorted(seen_id_to_name.items(), key=lambda pair: pair[1])
  print(id_name_pairs)
  print(items_per_name)
  return id_name_pairs, names_per_item

# Convert the true and the predicted label IDs to names, then build the
# per-class report as a dict.
# NOTE: despite its name, `new_ad_id_dict` is the sorted list of (id, name)
# tuples returned by `convert_ad_ids_to_names`, not a dict.
new_ad_id_dict , true_labels_flat_names = convert_ad_ids_to_names(true_labels_flat)
_, pred_labels_names = convert_ad_ids_to_names(pred_labels)
print(" size of true label names:"+ str(len(set(true_labels_flat_names))))
# NOTE: this rebinds `classification_report` from the imported function to
# its dict result; the import at the top of this cell restores the function
# when the cell is re-run.
classification_report = classification_report(true_labels_flat_names, pred_labels_names,target_names=new_ad_id_dict,output_dict=True)
# print(classification_report)

In [None]:
# Turn the report dict into a frame with one row per class / summary entry.
classification_report_df = pd.DataFrame(classification_report).transpose()
print(classification_report_df.head())
# Compare strings by value (`!=`). `is not` tests object identity, which only
# happens to work when the interpreter interns both strings and raises a
# SyntaxWarning on Python 3.8+.
if classType != 'sent_exploded':
  classification_report_df.to_csv('./classification_report_sent_explode.csv')


In [None]:
#plot confuson matrix
import matplotlib.pyplot as plt
import itertools
from sklearn.metrics import confusion_matrix
import numpy as np

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw the confusion matrix `cm` as an annotated heat map and save it to
    'sent_expl_conf_matrix.png'.

    `classes` supplies the tick labels of both axes. With `normalize=True`
    every row is divided by its sum and cells are shown with two decimals;
    otherwise the raw integer counts are shown.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_positions = np.arange(len(classes))
    plt.xticks(tick_positions, classes, rotation=90, fontsize = 16, )
    plt.yticks(tick_positions, classes,fontsize = 16)

    cell_format = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text so they stay readable
    # on the dark end of the colour map.
    contrast_threshold = cm.max() / 2.
    for (row, col), cell in np.ndenumerate(cm):
        text_colour = "white" if cell > contrast_threshold else "black"
        plt.text(col, row, format(cell, cell_format),
                 horizontalalignment="center",
                 color=text_colour)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.savefig('sent_expl_conf_matrix.png')


# Per-sentence true and predicted label names.
true_names = sentence_results_df['true_label_name']
pred_names = sentence_results_df['pred_label_name']
# Distinct true names, printed for reference only (a set has no defined
# order, so despite the variable name this list is not sorted).
sorted_true_names = (list(set(true_names)))
print(sorted_true_names)

# Hand-picked subset of label names that replaces the list computed above;
# it fixes which classes appear in the plotted matrix and in what order.
sorted_true_names = ['tide  bud knight', 'facebook  ready to rock?', 'hulu  tom brady s big announcement', 'donald j. trump for president  criminal justice reform', 'walmart  famous visitors',  'bud light seltzer  posty store  inside post s brain',  'planters  baby funeral', 'rocket mortgage  home','pop-tarts  pop-tarts fixed the pretzel commercial', 'olay  make space for women', 'doritos  the cool ranch']
cnf_matrix = confusion_matrix(true_names, pred_names, labels = sorted_true_names)

# Plot normalized confusion matrix
fig = plt.figure()
fig.set_size_inches(20, 20, forward=True)
#fig.align_labels()
plot_confusion_matrix(cnf_matrix, classes=np.asarray(sorted_true_names), normalize=True,
                      title='Normalized confusion matrix')

# ['hulu  tom brady s big announcement', 'genesis  going away party', 'tide  bud knight', 
#  'nfl  inspire change  anquan boldin', 'marvel  black widow trailer', 'budweiser  typical american', 
#  'hard rock hotels & casinos  bling cup', 'new york life  love takes action', 'none', 'planters  baby funeral', 
#  'mountain dew', 'disney+  it s time', 'bud light seltzer  posty store  inside post s brain', 
#  'turbotax  turbotax  all people are tax people remix', 
#  'avocados from mexico  the avocados from mexico shopping network', 'hyundai  smaht pahk', 
#  'little caesars pizza  best thing since sliced bread', 'cheetos  can t touch this', 
#  'nfl  building a better game', 'fox nation  breaking news', 'reese s  rock',
#  'michelob  jimmy works it out', 'michelob  6 for 6-pack', 'amazon prime video  hunters', 
#  'porsche  the heist', 'fox  halftime show  teaser_1', 'weathertech  lucky dog', 'jeep  groundhog day [t1]', 
#  'rocket mortgage  home', 'squarespace  winona in winona', 'audi  let it go [t1]', 'olay  make space for women', 
#  'doritos  the cool ranch', 'quibi  bank heist', 'pepsi zero sugar  zero sugar. done right.',
#  'hummer  gmc  quiet revolution', 'tide  finally later', 'donald j. trump for president  criminal justice reform',
#  'kia  tough never quits', 'pop-tarts  pop-tarts fixed the pretzel commercial', 'fox  not just another race', 
#  'discover card  yes we’re accepted', 'procter & gamble  when we come together', 'coca-cola energy  show up', 
#  'microsoft surface  be the one', 'no time to die  trailer', 'sabra  how do you  mmus?', 'facebook  ready to rock?', 
#  'amazon echo  before alexa', 'pringles  the infinite dimensions of rick and morty', 'fox  super monday', 
#  'google assistant  loretta', 't-mobile  mama tests 5g', 'walmart  famous visitors', 
#  'verizon  the amazing things 5g won t do', 'snickers  fix the world', 'fox  toads  the masked singer']


In [None]:
import matplotlib.pyplot as plt
import sklearn.metrics

# NOTE: `plot_confusion_matrix` is intentionally not imported from
# sklearn.metrics here. The import was unused in this cell, it rebinds the name
# that the earlier plotting cell calls as
# `plot_confusion_matrix(cnf_matrix, classes=..., ...)` (a signature
# scikit-learn's function does not have), and it raises ImportError on
# scikit-learn >= 1.2 where the function was removed.

# Compute the label order once and sort it: iterating a set of strings yields
# a different order in every kernel session, which made the row/column order
# of the saved CSV non-reproducible.
# Only labels that occur in `true_names` are included, as before.
cm_labels = sorted(set(true_names))

# rows: true labels
# columns : pred labels
cm = pd.DataFrame(
    sklearn.metrics.confusion_matrix(true_names, pred_names, labels=cm_labels),
    index=cm_labels,
    columns=cm_labels,
)
print(cm)
cm.to_csv('./conf_matrix_df.csv')


In [None]:
end here

In [None]:
# Collect the per-sentence results (text, true/predicted label ids and the
# matching ad names) and save them to CSV.

# Position i of this list is the ad name for label id i. Built once instead of
# once per row inside the lambdas below.
ad_names_by_id = list(ad_id_dict.keys())

result = pd.DataFrame()
result['text_clean'] = test_sentences
result['true_label'] = true_labels_flat
result['true_label_name'] = result['true_label'].apply(lambda x: ad_names_by_id[x])
result['predicted_label'] = pred_labels
result['pred_label_name'] = result['predicted_label'].apply(lambda x: ad_names_by_id[x])
result.to_csv('./result_multi_class.csv')

# Keep the preview as the last expression so the notebook actually renders it.
result.head()

In [None]:
from collections import Counter


def _save_label_counts(names, csv_path):
  """Count how often each name occurs, print the counts and save them.

  Args:
    names: iterable of hashable label names.
    csv_path: path of the CSV file to write (columns `ad` and `count`).

  Returns:
    Tuple `(counts, frame)`: the plain dict of counts and the DataFrame that
    was written to `csv_path`.
  """
  counts = dict(Counter(names))
  print(counts)
  frame = pd.DataFrame({'ad': list(counts.keys()), 'count': list(counts.values())})
  frame.to_csv(csv_path)
  return counts, frame


# Same prints, files and variable names as before; the true/predicted halves
# now share one implementation instead of two copy-pasted ones.
counter, test_ad_true_count = _save_label_counts(true_names, './test_true_ad_count.csv')
pred_counter, test_ad_pred_count = _save_label_counts(pred_names, './test_ad_pred_count.csv')

In [None]:
# def get_correct_samples(test_sentences, true_labels_flat, pred_labels, test_labels):
#   new_test_sentences = []
#   new_test_labels = []
#   for i in range(len(pred_labels)):
#     if true_labels_flat[i] == pred_labels[i]:
#       new_test_sentences.append(test_sentences[i])
#       new_test_labels.append (test_labels[i])
#   return new_test_sentences, new_test_labels

def get_ad_related_samples(test_sentences, true_labels_flat, pred_labels, multi_class_labels):
  """Keep the samples whose predicted label equals 1.

  Args:
    test_sentences: indexable collection of sentences, aligned with `pred_labels`.
    true_labels_flat: not used by this function.
    pred_labels: indexable collection of predictions; position i is kept when
      `pred_labels[i] == 1`.
    multi_class_labels: indexable collection of labels, aligned with `pred_labels`.

  Returns:
    Tuple `(sentences, labels)` of two lists holding the entries of
    `test_sentences` and `multi_class_labels` at the kept positions, in order.
  """
  kept = [idx for idx in range(len(pred_labels)) if pred_labels[idx] == 1]
  return ([test_sentences[idx] for idx in kept],
          [multi_class_labels[idx] for idx in kept])
# Print the length of each input sequence before filtering.
for seq in (binary_test_sentences, true_labels_flat, pred_labels, test_labels):
  print(len(seq))

# Keep only the sentences whose predicted label is 1, together with their
# labels taken from `test_labels`.
new_test_sentences, new_test_labels = get_ad_related_samples(
    binary_test_sentences, true_labels_flat, pred_labels, test_labels)

bin_predicted_ad_related = pd.DataFrame().assign(
    sentences=new_test_sentences,
    labels=new_test_labels,
)
bin_predicted_ad_related.to_csv('./bin_ad_predicted.csv')
print(bin_predicted_ad_related['labels'] )

In [None]:
# Preview the first five rows kept by get_ad_related_samples (predicted label == 1).
bin_predicted_ad_related.head(5)

In [None]:
# count =0
# for i in range(len(new_test_sentences)):
#   if new_test_labels[i] != 69:
#     count = count+1
#     # print(new_test_sentences[i])
#     # print(str(new_test_labels[i])+":"+ list(ad_id_dict.keys())[new_test_labels[i]])
# print( count)

Save the model

In [None]:
 # save the model

import os

# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()

output_dir = './model_save/'

# Create the output directory if needed. `exist_ok=True` replaces the previous
# exists()-then-makedirs() pair, which could fail if the directory appeared
# between the check and the creation.
os.makedirs(output_dir, exist_ok=True)

print("Saving model to %s" % output_dir)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
# If the model is wrapped (it exposes `.module`, as with distributed/parallel
# training), save the wrapped model rather than the wrapper.
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


Sentence similarity between tweets and ads using fine-tuned BERT embeddings

In [None]:

def get_word_indeces(tokenizer, text, word):
    '''
    Find the token positions that `word` occupies in the encoded `text`.

    `word` may consist of several words (e.g. "cell biology") and may be split
    into several tokens by the tokenizer. Instead of aligning sub-tokens
    directly, every occurrence of `word` in `text` is replaced by as many
    `[MASK]` tokens as `word` has tokens, the masked text is encoded, and the
    positions holding the tokenizer's mask id are returned.

    Returns a 1-D numpy array of positions into the output of
    `tokenizer.encode`; it is empty when no mask token ends up in the
    encoding (for example when `word` does not occur in `text`).
    '''
    # Number of tokens the tokenizer produces for `word`.
    n_word_tokens = len(tokenizer.tokenize(word))

    # One `[MASK]` per token of `word`, separated by spaces.
    placeholder = ' '.join('[MASK]' for _ in range(n_word_tokens))

    # Encode the text with `word` swapped for the placeholder. `encode`
    # tokenizes the text and maps the tokens to their ids.
    encoded = tokenizer.encode(text.replace(word, placeholder))

    # Positions whose id equals the mask token id.
    is_mask = np.asarray(encoded) == tokenizer.mask_token_id
    return np.nonzero(is_mask)[0]

In [None]:
# Check Sentence embeddings

def get_embedding(b_model, b_tokenizer, text, MAX_LEN = 64, word=''):
    '''
    Embed `text` with `b_model` and, optionally, embed `word` in its context.

    The sentence embedding is the vector at position 0 (the `[CLS]` slot added
    by `add_special_tokens=True`) of the second-to-last entry of the model's
    hidden states. It is NOT an average over tokens.

    Args:
        b_model: model called as `b_model(input_ids)`. Element 2 of its output
            must be the tuple of per-layer hidden states (e.g. a BERT model
            loaded with `output_hidden_states=True`).
        b_tokenizer: tokenizer providing `encode_plus`; when `word` is given it
            is also passed to `get_word_indeces`.
        text: the sentence to embed.
        MAX_LEN: currently unused; kept so existing callers keep working.
        word: optional word or phrase inside `text`. When non-empty, the mean
            of the vectors of its tokens is returned as well.

    Returns:
        `sentence_embedding` (1-D numpy array) when `word` is empty, otherwise
        the tuple `(sentence_embedding, word_embedding)`.
    '''
    has_word = not word == ''

    # If a word is provided, figure out which tokens correspond to it.
    if has_word:
        word_indeces = get_word_indeces(b_tokenizer, text, word)

    # Encode the text, adding the (required!) special tokens, and converting
    # to PyTorch tensors.
    encoded_dict = b_tokenizer.encode_plus(
                        text,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        return_tensors = 'pt',     # Return pytorch tensors.
                )
    input_ids = encoded_dict['input_ids']

    # Run the forward pass on whatever device the model lives on; previously a
    # GPU-resident model received CPU inputs and failed.
    try:
        model_device = next(b_model.parameters()).device
    except (AttributeError, StopIteration):
        model_device = input_ids.device
    input_ids = input_ids.to(model_device)

    b_model.eval()
    with torch.no_grad():
        outputs = b_model(input_ids)
        # The number of returned items depends on how the model was configured;
        # this function requires the hidden states to be the third item.
        hidden_states = outputs[2]

    # Second-to-last entry, first (and only) sequence of the batch.
    token_vecs = hidden_states[-2][0]

    # First-token vector. `.cpu()` is required before `.numpy()` for tensors
    # that live on the GPU.
    sentence_embedding = token_vecs[0].detach().cpu().numpy()

    if has_word:
        # Average of the vectors of the tokens that make up `word`.
        word_embedding = torch.mean(token_vecs[word_indeces], dim=0)
        word_embedding = word_embedding.detach().cpu().numpy()
        return (sentence_embedding, word_embedding)
    else:
        return sentence_embedding

In [None]:

def get_sentence_embeddings(model, sentences, labels, batch_size=32):
  """Compute one embedding per sentence from the model's hidden states.

  Each sentence is encoded with the notebook-level `tokenizer`, padded or
  truncated to `MAX_LEN`, and run through `model` in batches on `device`. The
  embedding of a sentence is the first-token (`[CLS]`) vector summed over
  `hidden_states[0]` through `hidden_states[11]`.
  NOTE(review): if the model returns 13 hidden-state entries, this sum leaves
  out the last one -- confirm that is intended.

  Args:
    model: model called as `model(input_ids, token_type_ids=None,
      attention_mask=...)`; element 1 of its output must be the tuple of
      hidden states.
    sentences: iterable of strings.
    labels: ignored. It used to be packed into the batches but never affected
      the result; kept so existing callers keep working.
    batch_size: number of sentences per forward pass.

  Returns:
    List with one 1-D tensor per sentence, in input order, on the device the
    model produced them on.
  """
  # Tokenize all of the sentences and map the tokens to their word IDs.
  input_ids = [
      tokenizer.encode(sent, add_special_tokens=True)  # adds '[CLS]' and '[SEP]'
      for sent in sentences
  ]
  # Pad / truncate every sequence to MAX_LEN.
  input_ids = pad_sequences(input_ids, maxlen=MAX_LEN,
                            dtype="long", truncating="post", padding="post")

  prediction_inputs = torch.tensor(input_ids)
  # Attention mask: 1.0 for real tokens, 0.0 for padding (id 0).
  prediction_masks = (prediction_inputs > 0).float()

  # Create the DataLoader; the sequential sampler keeps the input order.
  prediction_data = TensorDataset(prediction_inputs, prediction_masks)
  prediction_sampler = SequentialSampler(prediction_data)
  prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

  print('Predicting labels for {:,} test sentences.'.format(len(prediction_inputs)))
  # Put model in evaluation mode
  model.eval()

  sentence_embeddings = []
  for batch in prediction_dataloader:
    b_input_ids, b_input_mask = (t.to(device) for t in batch)

    # No gradients are needed for inference; this saves memory and time.
    with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None,
                      attention_mask=b_input_mask)
    hidden_states = outputs[1]

    # Sum only the first-token slice of each entry, in the same left-to-right
    # order as before. Summing the full [batch, seq, hidden] tensors and then
    # slicing made every returned vector a view that kept the whole summed
    # batch alive in device memory.
    cls_sum = hidden_states[0][:, 0]
    for layer in range(1, 12):
      cls_sum = cls_sum + hidden_states[layer][:, 0]

    # One 1-D tensor per sentence of the batch.
    sentence_embeddings.extend(cls_sum.unbind(0))
  print('DONE.')
  print(" length of sentence embeddings" + str(len(sentence_embeddings)))

  return sentence_embeddings

Get the most similar sentence from the model based on sentence embeddings

In [None]:
# bert_model = BertModel.from_pretrained('bert-base-uncased',
#                                   output_hidden_states = True)
# bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# ad_keywords['ad_embeddings'] = ad_keywords['keywords_clean'].apply(lambda ad: 
#                                           get_embedding(bert_model, bert_tokenizer, ad))
# ad_related_twts['tweet_embeddings'] = ad_related_twts['text_clean'].apply(lambda x: 
#                                         get_embedding(bert_model, bert_tokenizer, x))

# sentences = ad_related_twts.text_clean.values
# labels = ad_related_twts.ad_manual_adjusted_id.values
# test_df = ad_related_twts

# Embed the tweets that were kept by the predicted-label filter.
sentences = bin_predicted_ad_related['sentences']
labels = bin_predicted_ad_related['labels']
# `test_df` is another name for the same DataFrame (not a copy), so the column
# added below also appears on `bin_predicted_ad_related`.
test_df = bin_predicted_ad_related

test_df['tweet_embeddings'] = get_sentence_embeddings(model, sentences, labels)

# Embed the cleaned ad keywords with the same model. The all-zero array only
# fills the `labels` argument of get_sentence_embeddings.
ad_sentences = ad_keywords['keywords_clean']
ad_labels = np.zeros(ad_sentences.shape[0])
ad_keywords['ad_embeddings'] = get_sentence_embeddings(model, ad_sentences, ad_labels)


In [None]:
# Display the ad embedding column computed by get_sentence_embeddings.
ad_keywords['ad_embeddings'] 

In [None]:
# bin_predicted_ad_related['labels']

In [None]:
# Spot-check a single ad: select the rows with ad_id == 1 and print the rows,
# their cleaned keywords, and the embeddings stored for them.
ad_1 = ad_keywords[ad_keywords['ad_id'] == 1]
print(ad_1)
keywords_1 = ad_1['keywords_clean']
print(keywords_1)
embedding_id_1 = ad_1['ad_embeddings']
print(embedding_id_1)


In [None]:
import numpy as np

def get_topn(arr, n):
  """Return the indices and values of the `n` largest entries, largest first.

  Args:
    arr: 1-D sequence or array of numbers.
    n: how many entries to return. Values larger than `len(arr)` are clamped
      to `len(arr)`; values <= 0 yield empty results.

  Returns:
    Tuple `(indices, values)` of numpy arrays, where `values == arr[indices]`
    and `values` is in descending order.
  """
  arr = np.asarray(arr)

  # Clamp n: argpartition raises when n > len(arr), and with n == 0 the slice
  # `[-0:]` would select the whole array instead of nothing.
  k = min(n, len(arr))
  if k <= 0:
    return np.array([], dtype=np.intp), arr[:0]

  # argpartition gives the indices of the k largest values, in no particular order.
  top_indices_unsorted = np.argpartition(arr, -k)[-k:]
  # Order those k indices by descending value.
  order = np.argsort(-arr[top_indices_unsorted])
  top_indices_sorted = top_indices_unsorted[order]
  topn = arr[top_indices_sorted]

  return top_indices_sorted, topn

# arr = [3,2,4,5,6,9,8,7 ]
# topn_indices, topn_vals = get_topn(arr, 5)
# print(arr)
# print(topn_vals)
# print(topn_indices)


In [None]:
from scipy.spatial.distance import cosine

# For every tweet embedding, compute the cosine similarity to every ad
# embedding and record the n most similar ads (ids and names).
n = 5

ad_id_dict_keys = list(ad_id_dict.keys())
# -1 to exclude the 'none' entry, which has no keywords to embed.
n_real_ads = len(ad_id_dict) - 1

data = test_df

# Convert each ad embedding to a numpy vector once. Previously this conversion
# and the `iterrows()` scan were repeated for every tweet.
ad_vectors = {}
for _, row in ad_keywords.iterrows():
  ad_id = row['ad_id']
  if ad_id < n_real_ads:
    ad_vectors[ad_id] = row.ad_embeddings.to('cpu').numpy()

sent_sims = []
top_ids = [[] for _ in range(n)]    # top_ids[r][t]: ad id ranked r+1 for tweet t
top_names = [[] for _ in range(n)]  # same layout, holding the ad names

for sent_embedding in data['tweet_embeddings']:
  sent_vec = sent_embedding.to('cpu').numpy()
  ad_sims = [None] * n_real_ads
  for ad_id, ad_vec in ad_vectors.items():
    # scipy's `cosine` is a distance; 1 - distance is the similarity.
    ad_sims[ad_id] = 1 - cosine(sent_vec, ad_vec)
  sent_sims.append(ad_sims)
  topn_ads, topn_scores = get_topn(ad_sims, n)

  for rank in range(n):
    top_ids[rank].append(topn_ads[rank])
    top_names[rank].append(ad_id_dict_keys[topn_ads[rank]])

# Keep the per-rank lists available under their original names.
(max_sim_ad_id_1, max_sim_ad_id_2, max_sim_ad_id_3,
 max_sim_ad_id_4, max_sim_ad_id_5) = top_ids
(max_sim_ad_1, max_sim_ad_2, max_sim_ad_3,
 max_sim_ad_4, max_sim_ad_5) = top_names

# Same columns, in the same order, as before: names first, then ids.
for rank in range(n):
  data['max_sim_ad_%d' % (rank + 1)] = top_names[rank]
for rank in range(n):
  data['max_sim_ad_id_%d' % (rank + 1)] = top_ids[rank]


In [None]:
# data.head(10)

In [None]:
# Fraction of rows whose `labels` value equals one of the three most similar
# ad ids found from the model embeddings.
print(len(data))

# Add 'max_sim_ad_id_4' / 'max_sim_ad_id_5' here to score the top 5 instead.
top3_columns = ('max_sim_ad_id_1', 'max_sim_ad_id_2', 'max_sim_ad_id_3')
count = sum(
    1
    for _, ad in data.iterrows()
    if any(ad['labels'] == ad[col] for col in top3_columns)
)

accuracy = count/len(data)
print(count)
print(accuracy)

In [None]:
# By taking CLS token vector i.e vecs[0], calculate cosine similarity values and accuracy values
# -1: 0.032
# -2: 0.05
# -3: 0.012
# -4: 0.029
# -5: 0.061
# -6: 0.039
# -7: 0.014
# -8: 0.013
# -9: 0.006
# -10: 0.007
# -11: 0.019



# By taking average, calculate cosine similarity values and accuracy values
# -1: 0.016
# -2: 0.02
# -3: 0.007
# -4: 0.016
# -5: 0.023
# -6: 0.019
# -7: 0.010
# -8: 0.015
# -9: 0.023
# -10: 0.037
# -11: 0.046


In [None]:
# import inflect
# p = inflect.engine()

# def get_singular_words(plural_words):
#   sing_words = []
#   for word in plural_words:
#     sing_words.append(p.singular_noun(word))
#   return sing_words

# words = ["apples", "sheep", "oranges", "cats", "people", "dice", "pence", "trump"]
# sing_words = get_singular_words(words)
# print(sing_words)

In [None]:
# from nltk.stem import PorterStemmer

# porter=PorterStemmer()
# def get_stem_words(plural_words):
#   sing_words = []
#   for word in plural_words:
#     sing_words.append(porter.stem(word))
#   return sing_words

# words = ["apples", "sheep", "oranges", "cats", "people", "dice", "pence", "trump"]
# stem_words = get_stem_words(words)
# print(stem_words)

In [None]:
# https://huggingface.co/transformers/main_classes/tokenizer.html
# add words that are not in pre trained vocab as tokens
# TODO cross check if the embeddings actually got updated

# added_tokens = []
# for key in ad_keys_clean:
#   if key not in vocab_tokens:
#     n = tokenizer.add_tokens(key)
#     if n==1:
#       added_tokens.append(key)
# print('added '+ str(len(added_tokens)) +' to the vocab')
# print("added words:"+ str(added_tokens))