In [1]:
#Confirm that GPU is detected
import tensorflow as tf
# Get the GPU device name.
device_name = tf.test.gpu_device_name()
# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [2]:
import os

CUDA_LAUNCH_BLOCKING=1
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In order for torch to use the GPU, we need to identify and specify the GPU as the device. Later, in our training loop, we will load data onto the device.

In [3]:
import torch
# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla P4


In [4]:
!pip install transformers



In [5]:
import pandas as pd

df_2 = pd.read_csv("./2_man_ann_sb.csv",  index_col= None)
df_2 = df_2.dropna(subset = ['tweet_text'])

df_1 = pd.read_csv("./mann_ann_sb.csv", index_col= None)

df_3 = pd.read_csv("./3_man_ann_sb_full_1.csv", index_col = None)
df_3 = df_3.dropna(subset = ['tweet_text'])

df_raw = df_1.append(df_2).append(df_3) # using batch 1 and batch 2 for training

print(df_raw.shape)

(7500, 19)


In [6]:
input = pd.read_csv("./remaining_tweets_non_man.csv")
print(input.columns)

Index(['Unnamed: 0', 'user_id', 'tweet_id', 'tweet_text', 'ad_mentioned',
       'time_of_tweet', 'user_location', 'team followed', 'affective_state'],
      dtype='object')


In [7]:
# Remove ads marked as below because they are not available in ad annotations file although tweets mention them
# commercials, joe biden, pizzahut, joe bieden, michael bloomberg, mike bloomberg, scientology...
ads_remove = ['commercials', 'joe biden', 'pizzahut', 'joe bieden', 'michael bloomberg', 'mike bloomberg', 
              'scientology','papa johns',  'bakari',  'secret', 'dashlane', 'bernie the peoples perfume',
              'ram trucks', 'golden gronks', "bush's best", 'ragged old flag', 'patience', 'guitar hero',
              'disney mulan']

# ads_rename = ['nfl100', 'tide']

# rename ads with spelling faults while manually adding the annotation 

df_raw['ad_manual_adjusted'] = df_raw['ad_manual_adjusted'].apply(lambda x: x.lower())
df_raw.loc[df_raw.ad_manual_adjusted == "discover card  no we don‚äôt charge annual fees", 
       "ad_manual_adjusted"] = "discover card  no we don’t charge annual fees"
df_raw.loc[df_raw.ad_manual_adjusted == "doritos the cool ranch", 
       "ad_manual_adjusted"] = "doritos  the cool ranch"
df_raw.loc[df_raw.ad_manual_adjusted == "discover card yes we're accepted", 
       "ad_manual_adjusted"] =  "discover card  yes we’re accepted"
df_raw.loc[df_raw.ad_manual_adjusted == "discover card yes we’re accepted", 
       "ad_manual_adjusted"] =  "discover card  yes we’re accepted"
df_raw.loc[df_raw.ad_manual_adjusted == "discover card  yes we're accepted", 
       "ad_manual_adjusted"] =  "discover card  yes we’re accepted"
df_raw.loc[df_raw.ad_manual_adjusted == "budweiser typical american", 
       "ad_manual_adjusted"] = "budweiser  typical american"
df_raw.loc[df_raw.ad_manual_adjusted == 'fox  halftime show  teaser_3',
            "ad_manual_adjusted"] = "fox  halftime show  teaser_1"
df_raw.loc[df_raw.ad_manual_adjusted == 'fox  halftime show  teaser_2',
            "ad_manual_adjusted"] = "fox  halftime show  teaser_1"
            
print(df_raw.shape)
df = pd.DataFrame()
removed_Data = pd.DataFrame()

for i,row  in df_raw.iterrows():
  if row['ad_manual_adjusted'] not in ads_remove:
    df = df.append(row)
  else:
    removed_Data = removed_Data.append(row)
print(df.shape)

(7500, 19)
(7394, 19)


In [8]:
classType = 'multi-class'

In [9]:
from nltk.corpus import stopwords
import re
import nltk
nltk.download('stopwords')

stop = stopwords.words('english')
# stop.append('superbowl')
# stop.append('super')
# stop.append('bowl')
# remove for multi class since almost all ads have these words
if classType is not 'binary':
  stop.append('commercial')
  stop.append('ad')
  stop.append('commercials')
  stop.append('ads')
print(len(stop))

def removeMentions(text):

    textBeforeMention = text.partition("@")[0]
    textAfterMention = text.partition("@")[2]
    textAfterMention =  re.sub(r':', '', textAfterMention) #cadillac join the 31k
    tHandle = textAfterMention.partition(" ")[0].lower() #cadillac    
    text = textBeforeMention+ " " + textAfterMention  
    return text

def cleanTweet(strinp):
    strinp = re.sub(r'RT', "", strinp) # Remove RT
    strinp = strinp.lower()
    
    stop_removed_list = [word for word in strinp.split() if word not in (stop)]
    stop_removed = ' '.join([str(elem) for elem in stop_removed_list])    
    text = re.sub('https?://[A-Za-z0-9./]+', ' ', stop_removed) # Remove URLs
    text = removeMentions(text)
    text = re.sub('[^\x00-\x7F]+', ' ', text) # Remove non-ASCII chars.
    
    # remove punctuations except '-'
    punctuation = ['(', ')', '[',']','?', ':', ':', ',', '.', '!', '/', '"', "'", '@', '#', '&', '-', '_']
    text = "".join((char for char in text if char not in punctuation))
    text = re.sub('[^a-zA-Z]', ' ', text) # remove all other than alphabet chars 

#     text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text) # remove all single characters     
    stop_removed_l = [word for word in text.split() if word not in (stop)]
    stop_removed = ' '.join([str(elem) for elem in stop_removed_l]) 
    return stop_removed

print(cleanTweet("RT @cadillacabc: Joinrt the 31K james_bond") )

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
183
cadillacabc joinrt k jamesbond


In [10]:
df['text_clean'] = df['tweet_text'].apply(lambda x: cleanTweet(x))
df['ad_manual_adjusted'] = df['ad_manual_adjusted'].apply(lambda x: x.lower())
df['ad_related'] = df['ad_manual_adjusted'].apply(lambda ad: 0 if ad == 'none' else 1)

comma_filter = ~df['ad_manual_adjusted'].str.contains(',')
df = df[comma_filter]

In [11]:
df_unique = df.drop_duplicates(subset = ['text_clean'])
df_with_dupes = df
df = df_unique

print(df_with_dupes.shape)
print(df.shape)

(7332, 21)
(5824, 21)


In [12]:
input['text_clean'] = input['tweet_text'].apply(lambda x: cleanTweet(x))


In [13]:
# use column name 'ad_manual_adjusted' of df to 
def get_ad_related_twts(df, removeCommas = True):
  df['ad_manual_adjusted'] = df['ad_manual_adjusted'].apply(lambda x: x.lower())
  ad_filter = df['ad_manual_adjusted']!= 'none'
  ad_related_twts = df[ad_filter]
  if removeCommas:
    ad_filter_1 = ~ad_related_twts['ad_manual_adjusted'].str.contains(',')
    ad_related_twts = ad_related_twts[ad_filter_1]
  return ad_related_twts

def getAdTweets(ad_related_twts, ad):
  return ad_related_twts[ad_related_twts.ad_manual_adjusted == ad].shape[0]

def get_ad_id_dict(ad_related_twts): 
  n_ad_related = ad_related_twts.shape[0]
  print("# ad related tweets: "+ str(n_ad_related))
  ads_annotated = ad_related_twts.ad_manual_adjusted.values
  adset = set(ads_annotated)
  print("unique ads:"+ str(len(adset)))
  ad_id_dict = {}
  i = 0
  for ad in adset : 
    if(getAdTweets(ad_related_twts, ad) >=2):
      ad_id_dict[ad] = i
      i = i+1
    else:
      print('ad with <2 samples: '+ str(ad))
  print(" No of ads with >=2 samples:"+ str(len(ad_id_dict)))
  ad_id_dict['none'] = len(ad_id_dict)
  print(ad_id_dict)
  return ad_id_dict

def convertAdNameToAdId(ad_id_dict, ad_name):
  if ad_name in ad_id_dict:
    return ad_id_dict[ad_name]
  else:
    return ad_id_dict['none']

ad_related_twts = get_ad_related_twts(df)
ad_id_dict = get_ad_id_dict(get_ad_related_twts(df))
n_unique_ads = len(ad_id_dict) # ad_id_dict has none as well, so minus 1 when using embeddings
df['ad_manual_adjusted_id'] = df['ad_manual_adjusted'].apply(lambda x: convertAdNameToAdId(ad_id_dict,x))
# ad_related_twts['ad_manual_adjusted_id'] = ad_related_twts['ad_manual_adjusted'].apply(lambda x: convertAdNameToAdId(ad_id_dict,x))

# ad related tweets: 2597
unique ads:63
ad with <2 samples: fast & furious 9  trailer
ad with <2 samples: discover card  no we don’t charge annual fees
ad with <2 samples: tide  ww
 No of ads with >=2 samples:60
{'tide  finally later': 0, 'reese s  rock': 1, 'donald j. trump for president  criminal justice reform': 2, 'nfl  inspire change  anquan boldin': 3, 'doritos  the cool ranch': 4, 'marvel  black widow trailer': 5, 'procter & gamble  when we come together': 6, 'porsche  the heist': 7, 'cheetos  can t touch this': 8, 'pepsi zero sugar  zero sugar. done right.': 9, 'fox  super monday': 10, 'jeep  groundhog day [t1]': 11, 'disney+  it s time': 12, 'google assistant  loretta': 13, 'walmart  famous visitors': 14, 'fox  great american race': 15, 'no time to die  trailer': 16, 'turbotax  turbotax  all people are tax people remix': 17, 'rocket mortgage  home': 18, 'bud light seltzer  posty store  inside post s brain': 19, 'new york life  love takes action': 20, 'fox  a run at history  da

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [14]:
# train test split for multi class classification
from sklearn.model_selection import train_test_split

n = df.shape[0]
sentences, test_sentences, labels, test_labels = train_test_split(df.text_clean.values, df.ad_manual_adjusted_id.values, 
                  #  random_state = 2018, 
                   test_size = 0.2, stratify= df.ad_manual_adjusted_id.values)

# ad_related_twts = df[df['ad_manual_adjusted_id']<n_unique_ads]
# n_ad_related = ad_related_twts.shape[0]
# print("Total data set: "+ str(n_ad_related))
# sentences, _, labels, _ = train_test_split(ad_related_twts.text_clean.values, ad_related_twts.ad_manual_adjusted_id.values, 
#                    random_state = 2018, 
#                    test_size = 0.2, stratify= ad_related_twts.ad_manual_adjusted_id.values)
train_size = len(sentences)
test_size = len(test_sentences)
print( "Train size: "+ str(train_size)+" test size:" + str(test_size))

Train size: 4659 test size:1165


In [15]:
# train test split for binary classification
import numpy as np
binary_sentences, binary_test_sentences, binary_labels, binary_test_labels = train_test_split(df.text_clean.values, df.ad_related.values, 
                  #  random_state = 2018, 
                   test_size=0.2)
def get_bin_from_multi_class(labels):
  binary_labels = []
  for ad_id in labels:
    if ad_id == ad_id_dict['none']:
      binary_labels.append(0)
    else:
      binary_labels.append(1)
  return binary_labels

# binary_sentences = sentences
# binary_test_sentences = test_sentences
# print(len(ad_id_dict))
# print(labels)
# binary_labels = get_bin_from_multi_class(labels)
# binary_test_labels = get_bin_from_multi_class(test_labels)

binary_train_size = len(binary_sentences)
binary_test_size = len(binary_test_sentences)
print("Total data set size: "+ str(df.shape[0])+", train size: "+ str(binary_train_size)+", test size: " + str(binary_test_size))

Total data set size: 5824, train size: 4659, test size: 1165


In [16]:
train_data = pd.DataFrame()
test_data = pd.DataFrame()
if classType == 'binary':
  train_data['sentences'] = binary_sentences
  train_data['labels'] = binary_labels
  test_data['sentences'] = binary_test_sentences
  test_data['labels'] = binary_test_labels
else:
  train_data['sentences'] = sentences
  train_data['labels'] = labels
  test_data['sentences'] = test_sentences
  test_data['labels'] = test_labels

train_data.to_csv('./app4_train_data.csv')
test_data.to_csv('./app4_test_data.csv')

reqd_ads = {'marvel  black widow trailer':0 ,'rocket mortgage  home':0 , 
            'olay  make space for women':0 , 'hard rock hotels & casinos  bling cup':0 ,
            "homeland trailer":0 , "microsoft surface  be the one":0}

for i,row in train_data.iterrows():
    ad = row['labels']
    if ad in reqd_ads:
        reqd_ads[ad] = reqd_ads[ad]+1
print('train data size:'+ str(train_data.shape[0]))

print(reqd_ads)

reqd_ads = {'marvel  black widow trailer':0 ,'rocket mortgage  home':0 , 
            'olay  make space for women':0 , 'hard rock hotels & casinos  bling cup':0 ,
            "homeland trailer":0 , "microsoft surface  be the one":0}

for i,row in test_data.iterrows():
    ad = row['labels']
    if ad in reqd_ads:
        reqd_ads[ad] = reqd_ads[ad]+1
print('test data size:'+ str(test_data.shape[0]))

print(reqd_ads)

train data size:4659
{'marvel  black widow trailer': 0, 'rocket mortgage  home': 0, 'olay  make space for women': 0, 'hard rock hotels & casinos  bling cup': 0, 'homeland trailer': 0, 'microsoft surface  be the one': 0}
test data size:1165
{'marvel  black widow trailer': 0, 'rocket mortgage  home': 0, 'olay  make space for women': 0, 'hard rock hotels & casinos  bling cup': 0, 'homeland trailer': 0, 'microsoft surface  be the one': 0}


In [17]:
# Top 10 ads in train data
print('Top 10 ads in train data')
train_data.groupby(by = 'labels').count().sort_values(by = 'sentences', ascending = False).head(11)

# print('Top 10 ads in test data')
# test_data.groupby(by = 'labels').count().sort_values(by = 'sentences', ascending = False).head(11)

Top 10 ads in train data


Unnamed: 0_level_0,sentences
labels,Unnamed: 1_level_1
60,2584
2,349
39,242
58,141
4,94
18,92
41,92
35,66
13,64
28,62


In [18]:
# def get_binary_samples(df):
#   df['ad_related'] = df['ad_manual_adjusted'].apply(lambda ad: 0 if ad == 'none' else 1)
#   # df['planters_baby_funeral'] = df['ad_manual_adjusted'].apply(lambda ad: 1 if ad == 'planters  baby funeral' else 0)
#   return df
# df_binary = get_binary_samples(df)

# print("Total samples: "+ str(df_binary.shape[0]))

# binary_train_size = int(0.8*df_binary.shape[0])
# binary_test_size = df_binary.shape[0] - binary_train_size
                
# # Get the lists of sentences and their labels.
# binary_sentences = df_binary.iloc[0:binary_train_size].text_clean.values
# binary_labels = df_binary.iloc[0:binary_train_size].ad_related.values
# # binary_labels = df_binary.iloc[0:binary_train_size].planters_baby_funeral.values

# print(binary_labels.shape)
# binary_test_data = df_binary.iloc[binary_train_size:df_binary.shape[0]+1]

# binary_test_size = binary_test_data.shape[0]
# print("train size: "+ str(binary_train_size)+" test size: "+str(binary_test_size))
                

In [19]:
# # use column name 'ad_manual_adjusted' of df to 
# def get_ad_related_twts(df, removeCommas = True):
#   df['ad_manual_adjusted'] = df['ad_manual_adjusted'].apply(lambda x: x.lower())
#   ad_filter = df['ad_manual_adjusted']!= 'none'
#   ad_related_twts = df[ad_filter]
#   if removeCommas:
#     ad_filter_1 = ~ad_related_twts['ad_manual_adjusted'].str.contains(',')
#     ad_related_twts = ad_related_twts[ad_filter_1]
#   return ad_related_twts

# def get_ad_id_dict(ad_related_twts): 
#   n_ad_related = ad_related_twts.shape[0]
#   print("# ad related tweets: "+ str(n_ad_related))
#   n_unique_ads = len(set(ad_related_twts['ad_manual_adjusted']))
#   print("# distinct ads:"+ str(n_unique_ads))

#   ads_annotated = ad_related_twts.ad_manual_adjusted.values
#   adset = set(ads_annotated)
#   print("unique ads:"+ str(len(adset)))
#   print(adset)
#   ad_id_dict = {}
#   i = 0
#   for ad in adset: 
#     ad_id_dict[ad] = i
#     i = i+1
#   print(ad_id_dict)
#   return ad_id_dict

# ad_related_twts = get_ad_related_twts(df)

# ad_id_dict = get_ad_id_dict(get_ad_related_twts(all_tweets))
# n_unique_ads = len(ad_id_dict)
# ad_related_twts['ad_manual_adjusted_id'] = ad_related_twts['ad_manual_adjusted'].apply(lambda x: ad_id_dict[x]) 


In [20]:
# # train test split
# n_ad_related = ad_related_twts.shape[0]
# train_size = int(0.8 * n_ad_related)
# test_size = n_ad_related - train_size
                
# # Get the lists of sentences and their labels.
# sentences = ad_related_twts.iloc[0:train_size].text_clean.values
# labels = ad_related_twts.iloc[0:train_size].ad_manual_adjusted_id.values

# test_data = ad_related_twts.iloc[train_size:n_ad_related+1]
# test_data['ad_manual_adjusted_id'] = test_data['ad_manual_adjusted'].apply(lambda x: ad_id_dict[x])

# test_size = test_data.shape[0]
# print("train size: "+ str(train_size)+" test size: "+str(test_size))
                

Transform our dataset into the format that BERT can be trained on

In [21]:
from transformers import BertTokenizer
# Load the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
print('Loaded BERT tokenizer.')
vocab_tokens = list(tokenizer.vocab.keys())
print("Original Vocab size: " + str(len(vocab_tokens)))


Loaded BERT tokenizer.
Original Vocab size: 30522


In [22]:
import nltk
nltk.download('wordnet') # for wordnet lemmatizer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [23]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def get_lemmatize_words(plural_words):
  sing_words = []
  for word in plural_words:
    sing_words.append(wordnet_lemmatizer.lemmatize(word))
  return sing_words

words = ["apples", "sheep", "oranges", "cats", "people", "dice", "pence", "trump"]
lem_words = get_lemmatize_words(words)
print(lem_words)

['apple', 'sheep', 'orange', 'cat', 'people', 'dice', 'penny', 'trump']


In [24]:
import pandas as pd 

def load_ad_data():
  ad_keywords = pd.read_csv('./SB_ad_annotations.csv')
  print(ad_keywords.columns)
  ad_keywords['keywords_clean'] = ad_keywords['Key Terms  Round 1'].apply(lambda x: cleanTweet(x))
  return ad_keywords

def get_keys_to_add(ad_keywords):
  ad_keywords = pd.read_csv('./SB_ad_annotations.csv')
  print(ad_keywords.columns)
  ad_keywords['keywords_clean'] = ad_keywords['Key Terms  Round 1'].apply(lambda x: cleanTweet(x)).apply(lambda x: x.split())
  ad_keys_clean = [word for tweet in ad_keywords['keywords_clean'] for word in tweet]
  ad_brands = ad_keywords['Brand Name'].apply(lambda x: cleanTweet(x)).apply(lambda x: x.split())
  ad_brands = [word for tweet in ad_brands for word in tweet]
  print("ad brands"+ str(ad_brands))
  ad_names = ad_keywords['Ad Name'].apply(lambda x: cleanTweet(x)).apply(lambda x: x.split())
  ad_names = [word for tweet in ad_names for word in tweet]
  product_names = ad_keywords['Product'].apply(lambda x: cleanTweet(x)).apply(lambda x: x.split())
  product_names = [word for tweet in product_names for word in tweet]
  keys_to_add = []
  keys_to_add.extend(ad_keys_clean)
  keys_to_add.extend(ad_brands)
  keys_to_add.extend(ad_names)
  keys_to_add.extend(product_names)
  print(len(keys_to_add))
  keys_to_add = list(set(keys_to_add))
  print(len(keys_to_add))
  return keys_to_add

ad_keywords = load_ad_data()
keys_to_add_raw = get_keys_to_add(ad_keywords)
print(keys_to_add_raw)
keys_to_add = get_lemmatize_words(keys_to_add_raw) # to make plurals to singular words
print(keys_to_add)

Index(['Ad Number', 'Brand Name', 'Ad Name', 'Product', 'Key Terms  Round 1',
       'KeyTerms_Edited_1', 'KeyTerms_Edited', 'Excitatory Potential',
       'Emotional vs. Rational', 'Semantic Affinity', 'Valence', 'Unnamed: 11',
       'Unnamed: 12', 'Unnamed: 13'],
      dtype='object')
Index(['Ad Number', 'Brand Name', 'Ad Name', 'Product', 'Key Terms  Round 1',
       'KeyTerms_Edited_1', 'KeyTerms_Edited', 'Excitatory Potential',
       'Emotional vs. Rational', 'Semantic Affinity', 'Valence', 'Unnamed: 11',
       'Unnamed: 12', 'Unnamed: 13'],
      dtype='object')
ad brands['trailer', 'quibi', 'tide', 'fox', 'presidential', 'campaign', 'walmart', 'marvel', 'rocket', 'porsche', 'snickers', 'hulu', 'fox', 'mountain', 'dew', 'squarespace', 'new', 'york', 'life', 'fox', 'hyundai', 'cheetos', 'olay', 'fox', 'michelob', 'avocados', 'mexico', 'hard', 'rock', 'pringles', 'turbotax', 'tide', 'genesis', 'coca', 'cola', 'planters', 'trailer', 'fox', 'google', 'sabra', 'weathertech', 'veriz

In [25]:
# Challenge: There are some words that are not separated although can be two legit words if separted
# lightyear, jasonmamoa, trumpsupporters, condimentfood, fixtheworld, budlightbudweiser, microsoftsurface, femalecoach, typicalamerican
# poptarts, quickbites, googleassistant, smahtpark, spacewalk, letitgo, bettahwaytopark, trumpforpresident, hardrockstadium,
# alpacinoprimevideo, femalecoach, inspirechange, motorshummer, newyorklife, stainlaundrydetergent

def get_missing_words_vocab(tokenizer, words):
  missing_words = []
  vocab = tokenizer.get_vocab()
  print("Actual word list of ad tokens:"+ str(len(words)))
  for word in words:
    if word not in vocab:
      missing_words.append(word)
  print(str(len(missing_words))+" missing_words")
  return missing_words

missing_words = get_missing_words_vocab(tokenizer, keys_to_add)
print(missing_words)

Actual word list of ad tokens:697
210 missing_words
['quicken', 'strangerthings', 'hardrock', 'beforealexa', 'morty', 'queereye', 'lebron', 'theshining', 'makespaceforwomen', 'inbev', 'doritos', 'fastfurious', 'erscoach', 'johnlegendchrissyteigen', 'tombrady', 'smahtpark', 'turbotax', 'jimmyfallon', 'chrissy', 'popsoda', 'snipesbabynut', 'scorsese', 'mtn', 'gop', 'porta', 'ringwald', 'quits', 'detergent', 'tpain', 'minion', 'maisie', 'rickmorty', 'stallone', 'adultswim', 'chancetherappermeganstallion', 'humus', 'arnett', 'condimentfood', 'snickersfixtheworld', 'giudice', 'olay', 'symptom', 'avocado', 'allfemale', 'allpeoplearetaxpeople', 'microsoftsurface', 'hummer', 'sugarbryan', 'koepka', 'mortgagemomoabowl', 'powersnoannualfeediscovercredit', 'letitgo', 'cheetos', 'cena', 'couric', 'rocketchrisrock', 'frozenaudi', 'procter', 'nutsmrpeanut', 'krasinki', 'pacino', 'bettahwaytopark', 'charliey', 'sexiest', 'fixtheworld', 'postmalone', 'hardrockcasino', 'usain', 'queensdragqueens', 'sel

In [26]:
def add_missing_tokens(tokenizer, missing_words):
  n_added = tokenizer.add_tokens(missing_words)
  print(str(n_added)+" tokens added to the vocab")
  return
add_missing_tokens(tokenizer, missing_words)

210 tokens added to the vocab


In [27]:
ad_keywords['ad_id'] = ad_keywords['Ad Name'].apply(lambda ad: convertAdNameToAdId(ad_id_dict, ad.lower()))

In [28]:

from transformers import BertForSequenceClassification, AdamW, BertConfig, BertModel
# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
if classType == 'binary':
  num_labels = 2
else:
  print(n_unique_ads)
  num_labels = n_unique_ads
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = num_labels,
    # num_labels = n_unique_ads, # The number of output labels=2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = True, # Whether the model returns all hidden-states.
)
model.resize_token_embeddings(len(tokenizer)) # since new tokens are added
# Tell pytorch to run this model on the GPU.
model.cuda()


61


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30732, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [29]:
# documentation of get_input_embeddings: https://huggingface.co/transformers/model_doc/bert.html
embedding_layer = model.get_input_embeddings()
print(embedding_layer)
pretrain_embeddings = embedding_layer.weight.cpu().detach().numpy()
print(pretrain_embeddings.shape)


Embedding(30732, 768)
(30732, 768)


In [30]:
# get random feature vector assuming normal distribution of values for each feature
def get_rand_feature_vector(mean, stand):
  feature_vector = np.zeros(len(mean))
  for i in range(0, len(mean)):
    mu = mean[i]
    sigma = stand[i]
    rand = np.random.normal(mu, sigma, 1)
    feature_vector[i] = rand
  return feature_vector

std = pretrain_embeddings.std(axis= 0) # to calculate variance for each column i.e each feature
mean = pretrain_embeddings.mean(axis =0) # to calculate mean for each column i.e each feature
print(std.shape)
print(mean.shape)


(768,)
(768,)


In [31]:
import numpy as np

missing_word_embeddings = np.zeros(shape = (len(missing_words), 768))
i=0
for word in missing_words:
  missing_word_embeddings[i] = get_rand_feature_vector(mean, std)
  i = i+1

print(missing_word_embeddings.shape)

updated_embeddings = np.concatenate((pretrain_embeddings, missing_word_embeddings), axis=0)
print("shape of updated embeddings:"+ str(updated_embeddings.shape))

(210, 768)
shape of updated embeddings:(30942, 768)


In [32]:
import torch
from torch import nn

input_embeddings = nn.Embedding(updated_embeddings.shape[0], updated_embeddings.shape[1], 
                             padding_idx= embedding_layer.padding_idx)
model.set_input_embeddings(input_embeddings)
model.get_input_embeddings()
model.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30942, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

Extracting Embeddings from Pretrained BERT, appending random embeddings for missing words in Vocab and using them as a layer in Keras whose input is then used to classify if a tweet is ad related or not

Ref1: https://stackabuse.com/python-for-nlp-word-embeddings-for-deep-learning-in-keras/

Ref2: https://machinelearningmastery.com/
use-word-embedding-layers-deep-learning-keras/



In [33]:
# from keras.models import Sequential
# from keras.layers import Dense
# from keras.layers import Flatten, Dropout
# from keras.layers.embeddings import Embedding

# custom_model = Sequential()
# # embedding_layer = Embedding(vocab_length, 100, weights=[embedding_matrix], input_length=length_long_sentence, trainable=False)
# length_long_sentence = 45
# input_embedding_layer = Embedding(updated_embeddings.shape[0], updated_embeddings.shape[1], 
#                                   weights=[updated_embeddings], 
#                                   input_length=length_long_sentence, trainable = True)

# custom_model.add(input_embedding_layer) # add the Embedding layer as the first layer to the model
# # custom_model.add(Dropout(0.2)) # drop out layer to avoid over fitting
# custom_model.add(Flatten()) # Embedding layer is flattened so that it can be directly used with the densely connected layer.
# custom_model.add(Dense(1, activation='sigmoid')) # binary classification problem, we use the sigmoid function as the loss function at the dense layer
# custom_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# print(custom_model.summary())

In [34]:
# Tokenize all of the sentences and map the tokens to their word IDs.
def get_input_ids(sentences):
  input_ids = []
  # For every sentence...
  for sent in sentences:
      # `encode` will:
      #   (1) Tokenize the sentence.
      #   (2) Prepend the `[CLS]` token to the start.
      #   (3) Append the `[SEP]` token to the end.
      #   (4) Map tokens to their IDs.
      encoded_sent = tokenizer.encode(
                          sent,                      # Sentence to encode.
                          add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                          # This function also supports truncation and conversion
                          # to pytorch tensors, but we need to do padding, so we
                          # can't use these features :( .
                          # max_length = 128,          # Truncate all sentences.
                          # return_tensors = 'pt',     # Return pytorch tensors.
                    )
      # print(sent+":")
      # print(tokenizer.tokenize(sent))
      # print('encoded_sent:'+ str(encoded_sent))
      input_ids.append(encoded_sent)
  return input_ids

if classType == 'binary':
  print('binary')
  input_ids = get_input_ids(binary_sentences)
else:
  print('multi class')
  input_ids = get_input_ids(sentences)
  print(len(input_ids))

multi class
4659


In [35]:
# from keras.preprocessing.sequence import pad_sequences

# def get_padded_Sentences(sentences): # to have all sentences of same length
#   binary_sentences = sentences
#   print("len of bin_sentences:"+ str(len(binary_sentences)))
#   binary_input_ids = []
#   binary_input_ids = get_input_ids(binary_sentences) 
#   print(len(binary_input_ids))
#   length_long_sentence = 45 # same as the max length used while initialising the model
#   print('Max sentence length: ', max([len(sen) for sen in binary_input_ids]))
#   padded_sentences = pad_sequences(binary_input_ids, length_long_sentence, padding='post')
#   print(padded_sentences)
#   return padded_sentences

In [36]:
# train_binary_sentences = get_padded_Sentences(binary_sentences)

In [37]:
# custom_model.fit(train_binary_sentences, binary_labels, epochs=5, verbose=1)


In [38]:
# loss, accuracy = custom_model.evaluate(train_binary_sentences, binary_labels, verbose=0)
# print('Training Accuracy: %f' % (accuracy*100))

In [39]:
# test_binary_sentences = get_padded_Sentences(binary_test_data.text_clean)
# test_binary_labels = binary_test_data.ad_related.values

In [40]:
# predictions = custom_model.predict(test_binary_sentences)
# print(predictions.shape)

In [41]:
# from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, classification_report, confusion_matrix

# def get_binary_pred_labels(predictions):
#   pred_labels = []
#   for pred in predictions:
#     if pred[0]>=0.5 :
#       pred_labels.append(1)
#     else:
#       pred_labels.append(0)
#   return pred_labels

# def get_binary_metrics(test_binary_labels, pred_labels):
#   f1score = f1_score(test_binary_labels, pred_labels, average='binary') # micro: Calculate metrics globally by counting the total true positives, false negatives and false positives.
#   print('f1_score:'+ str(f1score))

#   prec = precision_score(test_binary_labels, pred_labels, average='binary')
#   print('Precision:'+ str(prec))

#   acc = accuracy_score(test_binary_labels, pred_labels)
#   print("Accuracy: "+ str(acc))

#   recall = recall_score(test_binary_labels, pred_labels, average='binary')
#   print("recall: "+ str(recall))

#   confusionmatrix = confusion_matrix(test_binary_labels, pred_labels)
#   tn, fp, fn, tp = confusionmatrix.ravel()
#   print("tp: "+ str(tp)+" tn: "+ str(tn)+" fp: "+ str(fp)+" fn: "+ str(fn))

#   return f1score, prec, acc, recall, confusionmatrix
# # classification_report(true_labels_flat, pred_labels)

# pred_labels = get_binary_pred_labels(predictions)
# f1score, prec, acc, recall, _ = get_binary_metrics(test_binary_labels, pred_labels)

In [42]:
# input_ids = get_input_ids(sentences)
if classType == 'binary':
  print('binary')
  input_ids = get_input_ids(binary_sentences)
else:
  print('multi class')
  input_ids = get_input_ids(sentences)
  print(len(input_ids))

print('Max sentence length: ', max([len(sen) for sen in input_ids]))

multi class
4659
Max sentence length:  35


In [43]:
#let’s choose MAX_LEN = 64 and apply the padding
# We'll borrow the `pad_sequences` utility function to do this.
from keras.preprocessing.sequence import pad_sequences
# Set the maximum sequence length.
# I've chosen 64 somewhat arbitrarily. It's slightly larger than the
# maximum training sentence length of 34...
MAX_LEN = 64
print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)
print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))
# Pad our input tokens with value 0.
# "post" indicates that we want to pad and truncate at the end of the sequence,
# as opposed to the beginning.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", 
                          value=0, truncating="post", padding="post")
print('\nDone.')


Padding/truncating all sentences to 64 values...

Padding token: "[PAD]", ID: 0

Done.


The attention mask simply makes it explicit which tokens are actual words versus which are padding.
The BERT vocabulary does not use the ID 0, so if a token ID is 0, then it’s padding, and otherwise it’s a real token.

In [44]:
# Create attention masks
attention_masks = []
# For each sentence...
for sent in input_ids:
    
    # Create the attention mask.
    #   - If a token ID is 0, then it's padding, set the mask to 0.
    #   - If a token ID is > 0, then it's a real token, set the mask to 1.
    att_mask = [int(token_id > 0) for token_id in sent]
    
    # Store the attention mask for this sentence.
    attention_masks.append(att_mask)

In [45]:
#training and validation split

from sklearn.model_selection import train_test_split
# Use 90% for training and 10% for validation.

if classType == 'binary':
  input_labels = binary_labels
else:
  input_labels = labels

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, input_labels, 
                                                            # random_state=2018, 
                                                            test_size=0.1)
# Do the same for the masks.
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_labels,
                                            #  random_state=2018, 
                                             test_size=0.1)

In [46]:
#Converting to PyTorch Data Types
# Convert all inputs and labels into torch tensors, the required datatype for our model.
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [47]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# The DataLoader needs to know our batch size for training, so we specify it here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32.
batch_size = 32
# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

let’s load BERT! There are a few different pre-trained BERT models available. “bert-base-uncased” means the version that has only lowercase letters (“uncased”) and is the smaller version of the two (“base” vs “large”)



In [48]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())
print('The BERT model has {:} different named parameters.\n'.format(len(params)))
print('==== Embedding Layer ====\n')
for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== First Transformer ====\n')
for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== Output Layer ====\n')
for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))


The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30942, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

Optimizer & Learning Rate Scheduler:
Now that we have our model loaded we need to grab the training hyperparameters from within the stored model.
For the purposes of fine-tuning, the authors recommend choosing from the following values:
Batch size: 16, 32 (We chose 32 when creating our DataLoaders).
Learning rate (Adam): 5e-5, 3e-5, 2e-5 (We’ll use 2e-5).
Number of epochs: 2, 3, 4 (We’ll use 4).
The epsilon parameter eps = 1e-8 is “a very small number to prevent any division by zero in the implementation”




In [49]:
# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )
from transformers import get_linear_schedule_with_warmup
# Number of training epochs (authors recommend between 2 and 4)
epochs = 4
# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

Define a helper function to find accuracy

In [50]:
import numpy as np
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

Helper function for formatting elapsed times.


In [51]:
import time
import datetime
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

Kick off training

In [52]:
import random
import numpy as np
import tensorflow as tf
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
# Set the seed value all over the place to make this reproducible.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
# Store the average loss after each epoch so we can plot them.
loss_values = []
# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    # Measure how long the training epoch takes.
    t0 = time.time()
    # Reset the total loss for this epoch.
    total_loss = 0
    # Put the model into training mode. Don't be mislead--the call to 
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()
    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):
        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the 
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because 
        # accumulating the gradients is "convenient while training RNNs". 
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()        
        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because we
        # have provided the `labels`.
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(b_input_ids, 
                    token_type_ids=None, 
                    attention_mask=b_input_mask
                     ,labels=b_labels)
                  # )        
        # The call to `model` always returns a tuple, so we need to pull the loss value out of the tuple.
        loss = outputs[0]

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_loss += loss.item()
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # Clip the norm of the gradients to 1.0. This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()
        # Update the learning rate.
        scheduler.step()
    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)            
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    print("")
    print("Running Validation...")
    t0 = time.time()
    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()
    # Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        
            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have
            # not provided labels.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        logits = outputs[0]
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
       
        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy
        # Track the number of batches
        nb_eval_steps += 1
    # Report the final accuracy for this validation run.
    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))
print("")
print("Training complete!")


Training...
  Batch    40  of    132.    Elapsed: 0:00:17.
  Batch    80  of    132.    Elapsed: 0:00:34.
  Batch   120  of    132.    Elapsed: 0:00:51.

  Average training loss: 2.44
  Training epoch took: 0:00:56

Running Validation...
  Accuracy: 0.54
  Validation took: 0:00:02

Training...
  Batch    40  of    132.    Elapsed: 0:00:17.
  Batch    80  of    132.    Elapsed: 0:00:34.
  Batch   120  of    132.    Elapsed: 0:00:51.

  Average training loss: 2.10
  Training epoch took: 0:00:56

Running Validation...
  Accuracy: 0.55
  Validation took: 0:00:02

Training...
  Batch    40  of    132.    Elapsed: 0:00:17.
  Batch    80  of    132.    Elapsed: 0:00:35.
  Batch   120  of    132.    Elapsed: 0:00:52.

  Average training loss: 1.92
  Training epoch took: 0:00:57

Running Validation...
  Accuracy: 0.57
  Validation took: 0:00:02

Training...
  Batch    40  of    132.    Elapsed: 0:00:17.
  Batch    80  of    132.    Elapsed: 0:00:34.
  Batch   120  of    132.    Elapsed: 0:00:5

In [53]:
# import plotly.express as px
# f = pd.DataFrame(loss_values)
# f.columns=['Loss']
# fig = px.line(f, x=f.index, y=f.Loss)
# fig.update_layout(title='Training loss of the Model',
#                    xaxis_title='Epoch',
#                    yaxis_title='Loss')
# fig.show()

Run the model on tweets that are not manually annotated

In [54]:
if classType == 'binary':
  binary_test_sentences = input.tweet_text.values
else:
  binary_output =  pd.read_csv('./app_4_bin_class_no_man_ann.csv')
  binary_output = binary_output[binary_output['ad_related']==1]
  test_sentences = binary_output.tweet_text.values

labels_present = False

FileNotFoundError: ignored

In [55]:
labels_present = True

In [56]:
import pandas as pd

# Get the lists of sentences and their labels.
# binart test data
if classType == 'binary':
  sentences = binary_test_sentences
  labels = binary_test_labels
else:
  print('multi class')
  sentences = test_sentences
  labels = test_labels

#  multi label data 
# sentences = test_data.text_clean.values
# labels = test_data.ad_manual_adjusted_id.values

# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
# For every sentence...
for sent in sentences:
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    encoded_sent = tokenizer.encode(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                   )
    
    input_ids.append(encoded_sent)
# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, 
                          dtype="long", truncating="post", padding="post")
# Create attention masks
attention_masks = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask) 
# Convert to tensors.
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
if labels_present:
  prediction_labels = torch.tensor(labels)
# Set the batch size.  
batch_size = 32  
# Create the DataLoader.
if labels_present:
  prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
else:
  prediction_data = TensorDataset(prediction_inputs, prediction_masks)

prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)


multi class


In [57]:
# Prediction on test set
print('Predicting labels for {:,} test sentences.'.format(len(prediction_inputs)))
# Put model in evaluation mode
model.eval()
# Tracking variables 
predictions , true_labels = [], []
# Predict 
for batch in prediction_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)

  # Unpack the inputs from our dataloader
  if labels_present:
    b_input_ids, b_input_mask, b_labels = batch
  else:
    b_input_ids, b_input_mask = batch
  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)
  logits = outputs[0]
  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  # Store predictions and true labels
  predictions.append(logits)
  if labels_present:
    label_ids = b_labels.to('cpu').numpy()
    true_labels.append(label_ids)
print('DONE.')

Predicting labels for 1,165 test sentences.
DONE.


In [58]:
def flatten_labels(true_labels):
  flat_labels =[]
  for i in range(len(true_labels)):
    flat_labels.extend(true_labels[i].flatten())
  return flat_labels

def calculate_pred_labels(predictions):
  tp_count =0
  pred_labels = []
  # For each input batch...
  print(" len of predictions: " + str(len(predictions)))
  for i in range(len(predictions)):
    # The predictions for this batch are a 2-column ndarray (one column for "0" and one column for "1"). 
    # Pick the label with the highest value and turn this in to a list of 0s and 1s.
    #print(predictions[i])
    pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
    #print(pred_labels_i)
    pred_labels.extend(pred_labels_i)
  return pred_labels

print(len(true_labels))
print(len(predictions))

pred_labels = calculate_pred_labels(predictions)
print(len(pred_labels))

if labels_present:
  true_labels_flat = flatten_labels(true_labels)
  print(len(true_labels_flat))


37
37
 len of predictions: 37
1165
1165


In [59]:
# input = pd.read_csv('./remaining_tweets_non_man.csv')
if ~labels_present:
  if classType == 'binary':
    input['ad_related'] = pred_labels
    input.to_csv('./app_4_bin_class_no_man_ann.csv')
  else:
    binary_output['ad_id_predicted'] = pred_labels
    binary_output['ad_predicted'] = binary_output['ad_id_predicted'].apply(lambda id: list(ad_id_dict.keys())[id])
    binary_output.to_csv('./app_4_multi_class_no_man_ann.csv')

NameError: ignored

In [60]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, classification_report, confusion_matrix

def getMetrics(true_labels_flat, pred_labels, averageType):
  print("Evaluating metrics as per '"+averageType+"' average type")
  f1score = f1_score(true_labels_flat, pred_labels, average= averageType) 
  print('f1_score:'+ str(f1score))

  prec = precision_score(true_labels_flat, pred_labels, average=averageType)
  print('Precision:'+ str(prec))

  acc = accuracy_score(true_labels_flat, pred_labels)
  print("Accuracy: "+ str(acc))

  recall = recall_score(true_labels_flat, pred_labels, average=averageType)
  print("recall: "+ str(recall))

  # classification_report(true_labels_flat, pred_labels)
  confusionmatrix = confusion_matrix(true_labels_flat, pred_labels)
  # print(confusionmatrix)
  # if averageType == 'binary':
    # tn, fp, fn, tp = confusionmatrix.ravel()
    # print("tp: "+ str(tp)+" tn: "+ str(tn)+" fp: "+ str(fp)+" fn: "+ str(fn))

if classType == 'binary':
  averageType = 'binary'
else:
  averageType = 'weighted'
  # micro: Calculate metrics globally by counting the total true positives, false negatives and false positives.
  # macro: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.
  # weighted : Calculate metrics for each label, and find their average weighted by support
  #  (the number of true instances for each label). This alters ‘macro’ to account for label imbalance; 
  # it can result in an F-score that is not between precision and recall.

getMetrics(true_labels_flat, pred_labels, averageType)

Evaluating metrics as per 'weighted' average type
f1_score:0.47931154536288517
Precision:0.48061383337321933
Accuracy: 0.5785407725321888
recall: 0.5785407725321888


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
end here

In [None]:
# save the model
import os
# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
output_dir = './model_save/'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


In [None]:
# BERT_MODEL_PATH = "./model_save"

# fine_tuned_tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH)
# fine_tuned_model = BertForSequenceClassification.from_pretrained(BERT_MODEL_PATH)


In [None]:
# import inflect
# p = inflect.engine()

# def get_singular_words(plural_words):
#   sing_words = []
#   for word in plural_words:
#     sing_words.append(p.singular_noun(word))
#   return sing_words

# words = ["apples", "sheep", "oranges", "cats", "people", "dice", "pence", "trump"]
# sing_words = get_singular_words(words)
# print(sing_words)

In [None]:
# from nltk.stem import PorterStemmer

# porter=PorterStemmer()
# def get_stem_words(plural_words):
#   sing_words = []
#   for word in plural_words:
#     sing_words.append(porter.stem(word))
#   return sing_words

# words = ["apples", "sheep", "oranges", "cats", "people", "dice", "pence", "trump"]
# stem_words = get_stem_words(words)
# print(stem_words)

In [None]:
# https://huggingface.co/transformers/main_classes/tokenizer.html
# add words that are not in pre trained vocab as tokens

# added_tokens = []
# for key in ad_keys_clean:
#   if key not in vocab_tokens:
#     n = tokenizer.add_tokens(key)
#     if n==1:
#       added_tokens.append(key)
# print('added '+ str(len(added_tokens)) +' to the vocab')
# print("added words:"+ str(added_tokens))

In [None]:
# #  check if all words in data corpus are present in the vocab
# import pandas as pd
# def add_tokens(add_tokens, df):
#   tokens_df = pd.DataFrame() 
#   tokens_df['text_clean'] = df['text_clean']
#   tokens = []

#   vocab_tokens = list(tokenizer.vocab.keys()) # tokens present in the vocab
#   print("Vocab size: " + str(len(vocab_tokens)))

#   # add_tokens = ['donald', 'trump', 'bryant', 'shakira', 'avosForMexico', 'jlo', 'j lo', 'pringles', 'budweiserusa', 'budweiser']
#   n_tokens_added = tokenizer.add_tokens(add_tokens) # adds only if tokens are not already present
#   print(str(n_tokens_added)+" tokens added")

#   indexed_tokens = tokenizer.convert_tokens_to_ids(add_tokens)
#   print(indexed_tokens)

#   for add_token in add_tokens:
#     if add_token in vocab_tokens:
#       print(add_token + ' present in tokens')

#   # model.resize_token_embeddings(len(tokenizer))  # Notice: resize_token_embeddings expect to receive the 
#   #  full size of the new vocabulary, i.e. the length of the tokenizer.
#   for twt in tokens_df['text_clean']:
#     # print(twt)
#     if twt is not None :
#       tokens.append(tokenizer.tokenize(str(twt)))
#     else:
#       tokens.append([])
#   tokens_df['tokens'] = tokens
#   print(tokens_df.head(100))
# # tokens_df['tokens'] = tokens_df['text_clean'].apply(lambda x: loc_tokenizer.tokenize(x))


In [None]:
# # add embeddings
# from transformers import BertModel

# bert_model = BertModel.from_pretrained('bert-base-uncased',
#                                   output_hidden_states = True) 
# bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# input_ids = []
# for sent in sentences:
#   encoded_dict = bert_tokenizer.encode_plus(
#                   sent,                      # Sentence to encode.
#                   add_special_tokens = True, # Add '[CLS]' and '[SEP]'
#                   return_tensors = 'pt',     # Return pytorch tensors.
#                 )
#   print(encoded_dict)
#   print(sent+":")
#   print(tokenizer.tokenize(sent))
#   print('encoded_sent:'+ str(encoded_sent))

In [None]:
# from transformers import DistilBertForTokenClassification
# model = DistilBertForTokenClassification.from_pretrained(
#     "distilbert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
#     num_labels = 128, # The number of output labels--2 for binary classification.
#                     # You can increase this for multi-class tasks.   
#     output_attentions = False, # Whether the model returns attentions weights.
#     output_hidden_states = False, # Whether the model returns all hidden-states.
# )
# # Tell pytorch to run this model on the GPU.
# model.cuda()