## Sentence/Word Embeddings Extraction for downstream activities


In [None]:
import torch

# Choose the compute device once; later cells move tensors to `device`.
if not torch.cuda.is_available():
    # CPU fallback when no CUDA device is visible.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Tell PyTorch to use the GPU and report what was found.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
!pip install transformers

In [None]:
# Mount the user's Google Drive inside the Colab VM at /content/drive.
# NOTE(review): the dataset paths used below are relative
# ("drive/MyDrive/..."), so they presumably rely on the working directory
# being /content -- confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:

import pandas as pd
# Load the tweets CSV and expose the `text` column as the `sentences` array
# that every later cell iterates over.
df = pd.read_csv("drive/MyDrive/Colab Notebooks/dataset/all_tweets_text.csv")
print(df.shape)
print(df.columns)
# Force every entry to `str` so the tokenizer never receives a non-string.
# NOTE(review): missing values presumably become the literal text "nan" here
# -- confirm whether such rows should be dropped instead.
df['text']= df['text'].astype('str')
# Optional slicing, left disabled (e.g. to process only part of the data).
#df = df[8242:]
#df = df[:37]
sentences = df.text.values
# Quick sanity check: one sample sentence and the number of sentences.
print(sentences[8])
print(sentences.shape)

# The string literal below is a disabled alternative loader, kept for
# reference only: it reads status texts and per-row O/C/E/A/N labels from two
# separate files and concatenates them column-wise. None of it is executed.
'''
import pandas as pd
# Reading Data into dataFrame
text = pd.read_csv("drive/MyDrive/Colab Notebooks/dataset/statuses_unicode.txt", header=None, names=['sentence'])
big5 = pd.read_csv("drive/MyDrive/Colab Notebooks/dataset/big5labels.txt", delimiter=" ", header=None, names=['O', 'C', 'E', 'A', 'N'])
df = pd.concat([text, big5], axis=1, sort=False)
#df = df[:32]
print(df.shape)
print(df.sample(5))
df['sentence']= df['sentence'].astype('str')
sentences = df.sentence.values
print(sentences[8])
print(sentences.shape)
'''

In [None]:
# Load the WordPiece tokenizer that matches the multilingual *cased* BERT
# checkpoint used later; lower-casing is disabled to stay consistent with it.
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained(
                'bert-base-multilingual-cased',
                do_lower_case=False)

In [None]:
# Show how the first sentence is processed: raw text, word pieces, and ids.
first_sentence = sentences[0]
first_tokens = tokenizer.tokenize(first_sentence)

# The original sentence.
print(' Original: ', first_sentence)

# The sentence split into tokens.
print('Tokenized: ', first_tokens)

# The tokens mapped to their vocabulary ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(first_tokens))

In [None]:
# Length (in token ids, `[CLS]` and `[SEP]` included) of the longest
# sentence; 0 when there are no sentences.
max_len = max(
    (len(tokenizer.encode(text, add_special_tokens=True)) for text in sentences),
    default=0,
)

print('Max sentence length: ', max_len)

In [None]:
# Tokenize all of the sentences and map the tokens to their word IDs.
max_len = 256 # the closest power of two exceeding max len found

# A single batched tokenizer call:
#   (1) tokenizes each sentence,
#   (2) prepends `[CLS]` and appends `[SEP]`,
#   (3) maps tokens to their IDs,
#   (4) pads every sentence to exactly `max_len` and truncates longer ones,
#   (5) builds the attention masks (1 = real token, 0 = `[PAD]`).
# `padding='max_length'` + `truncation=True` replace the deprecated
# `pad_to_max_length=True`, which also relied on implicit truncation.
encoded = tokenizer(
    [str(sent) for sent in sentences],
    add_special_tokens=True,
    padding='max_length',
    truncation=True,
    max_length=max_len,
    return_attention_mask=True,
    return_tensors='pt',      # PyTorch tensors, one row per sentence.
)

# Both tensors hold one row per sentence and `max_len` columns.
input_ids = encoded['input_ids']
attention_masks = encoded['attention_mask']

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

In [None]:
#@title Load the multilingual BERT model
from transformers import BertModel
model = BertModel.from_pretrained("bert-base-multilingual-cased")
# Move the model to the device selected in the first cell instead of calling
# `.cuda()` unconditionally, which fails on CPU-only runtimes.
model.to(device)

In [None]:
import numpy as np
import sys
from scipy.spatial import distance

# Value centroids in the BERT embedding space; each row is compared against
# every token vector.
# NOTE(review): presumably shaped (10, hidden_size) -- confirm. The number of
# score columns now follows the number of rows instead of a hard-coded 10.
bhv_centroids = np.load("drive/MyDrive/Colab Notebooks/dataset/bhv_centroids.npy")

batch_size = 16
print(input_ids.shape[0])
bhv_results = []


def _bhv_distribution(token_vectors, centroids):
    """Return, per centroid, the fraction of tokens closest to it.

    Args:
        token_vectors: 2-D array, one embedding per row.
        centroids: 2-D array, one centroid per row, same width.

    Returns:
        1-D float array of length ``len(centroids)``. It sums to 1 when at
        least one token is given and is all zeros otherwise.
    """
    n_centroids = centroids.shape[0]
    if len(token_vectors) == 0:
        # No usable token: avoid dividing by zero and report all-zero scores.
        return np.zeros(n_centroids)
    # Pairwise Euclidean distances; argmin keeps the first centroid on ties,
    # like the strict `<` comparison of the previous hand-written loop.
    nearest = distance.cdist(token_vectors, centroids, 'euclidean').argmin(axis=1)
    return np.bincount(nearest, minlength=n_centroids) / len(token_vectors)


# Inference only: disable dropout and gradient tracking.
model.eval()
with torch.no_grad():
    # range() with a step also yields the final, shorter batch, so no
    # separate copy of the loop body is needed for the tail.
    for start_range in range(0, input_ids.shape[0], batch_size):
        stop_range = start_range + batch_size
        batch_ids = input_ids[start_range:stop_range].to(device)
        batch_mask = attention_masks[start_range:stop_range].to(device)

        # The attention mask keeps `[PAD]` positions from influencing the
        # embeddings of the real tokens.
        outputs = model(batch_ids, attention_mask=batch_mask)
        hidden_states = outputs.last_hidden_state.cpu().numpy()

        # Padding is identified from the mask itself rather than by testing
        # whether the first embedding component happens to be near zero.
        real_tokens = batch_mask.cpu().numpy().astype(bool)

        for sentence, keep in zip(hidden_states, real_tokens):
            bhv_results.append(_bhv_distribution(sentence[keep], bhv_centroids))


# print("bhv_results ", bhv_results)
# reshape keeps the array 2-D even when there were no sentences.
bhv_res = np.asarray(bhv_results).reshape(-1, bhv_centroids.shape[0])
np.savetxt("drive/MyDrive/Colab Notebooks/dataset/BHV/all_bhv.csv", bhv_res, delimiter=",", fmt='%5.5f')

# BHV with GloVe 

In [None]:
!wget http://nlp.stanford.edu/data/glove.twitter.27B.zip

In [None]:
!unzip -q glove.twitter.27B.zip -d 'drive/MyDrive/Colab Notebooks/dataset/GloVe'

In [None]:
import numpy as np

def loadGloveModel(gloveFile):     ##200d
    """Load a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-blank line is expected to hold a word followed by its vector
    components, separated by single spaces.

    Args:
        gloveFile: Path to the GloVe ``.txt`` file (read as UTF-8).

    Returns:
        dict mapping each word to a 1-D float numpy array.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a vector component is not a valid float.
    """
    print ("Loading Glove Model")
    model = {}
    # The context manager closes the file even if parsing fails; the explicit
    # encoding avoids depending on the platform default.
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.rstrip('\n')
            if not line.strip():
                # Skip blank lines instead of failing on a missing word.
                continue
            # Split on the literal space only: a bare split() also splits on
            # other Unicode whitespace and would drop such a token's word
            # field, shifting the first number into the word position.
            word, *values = line.split(' ')
            model[word] = np.array([float(val) for val in values if val])
    print ("Done.",len(model)," words loaded!")
    return model
# Load the Twitter GloVe vectors (the "200d" file) into a word -> vector dict
# that the following cells query by word.
localModel = loadGloveModel('drive/MyDrive/Colab Notebooks/dataset/GloVe/glove.twitter.27B.200d.txt')

In [None]:
# Seed words for each of the ten Schwartz basic human values.
selfdirection = ["creativity", "freedom", "curious", "independent", "self-respect", "intelligent", "privacy"]
stimulation = ["excitement", "novelty", "challenge", "variety", "stimulation", "daring"]
hedonism = ["pleasure", "sensuous",  "gratification", "enjoyable", "self-indulgent"]
achievement = ["ambitious", "successful", "capable", "influential", "intelligent", "self-respect"] 
power = ["authority", "wealth", "power", "reputation", "notoriety"]
security = ["safety", "harmony", "stability", "order", "security", "clean", "reciprocation", "healthy", "moderate", "belonging"]
conformity = ["obedient", "self-discipline", "politeness", "honoring" , "loyal", "responsible"]
tradition = ["tradition", "humble", "devout", "moderate", "spiritualist"]
benevolence = ["helpful", "honest", "forgiving", "responsible", "loyal", "friendship", "love", "meaningful"]
universalism = ["broadminded", "justice", "equality", "peace", "beauty", "wisdom", "environmentalist", "harmony"]

# Both lists are in the same order: entry i of one describes entry i of the other.
schwartzBasicHumanValues = [selfdirection, stimulation, hedonism, achievement, power, security, conformity, tradition, benevolence, universalism]
schwartzNames = ["selfdirection", "stimulation", "hedonism", "achievement", "power", "security", "conformity", "tradition", "benevolence", "universalism"]

# Centroid of each value = mean of the GloVe vectors of its seed words.
# The vector size is taken from the loaded embeddings, so any GloVe
# dimensionality works (it used to be hard-coded to 200).
schwartzCentroids = {}

for value_name, seed_words in zip(schwartzNames, schwartzBasicHumanValues):
    missing_words = [word for word in seed_words if word not in localModel]
    if missing_words:
        # Same exception type as a plain dict lookup, but it names every
        # missing seed word and the value it belongs to.
        raise KeyError("seed words of %r missing from the GloVe model: %s"
                       % (value_name, ", ".join(missing_words)))
    seed_vectors = np.stack([np.asarray(localModel[word]) for word in seed_words])
    schwartzCentroids[value_name] = seed_vectors.mean(axis=0)
print ("Centroids computed!")

In [None]:
# Imported here because this cell runs before the one that imports nltk;
# without it a top-to-bottom run fails with NameError.
import nltk

# Corpora needed by the stop-word filter and the WordNet lemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
import nltk

from nltk.corpus import stopwords 
import string
from nltk.stem.wordnet import WordNetLemmatizer
import re

# Shared resources for clean(): English stop words, ASCII punctuation
# characters, and the WordNet lemmatizer.
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()


def clean(doc):
    """Normalize *doc* and return it as a single space-separated string.

    Steps, in order: lower-case and split on whitespace, drop stop words,
    delete punctuation characters, lemmatize each remaining word.
    """
    kept_tokens = [token for token in doc.lower().split() if token not in stop]
    without_punctuation = ''.join(
        filter(lambda char: char not in exclude, " ".join(kept_tokens))
    )
    return " ".join(map(lemma.lemmatize, without_punctuation.split()))


# Matches every character outside U+0000-U+D7FF and U+E000-U+FFFF, i.e.
# anything beyond the Basic Multilingual Plane plus the surrogate range.
NON_BMP_RE = re.compile(u"[^\U00000000-\U0000d7ff\U0000e000-\U0000ffff]", flags=re.UNICODE)


In [None]:
import sys
from scipy.spatial import distance

# One centroid per row, in `schwartzNames` order, so that column i of every
# score vector corresponds to schwartzNames[i].
centroid_matrix = np.vstack([schwartzCentroids[name] for name in schwartzNames])
n_values = len(schwartzNames)

bhv_results = []

for sentence in sentences:
    # Drop mentions and links *before* cleaning: clean() strips punctuation,
    # so a leading '@' can no longer be detected afterwards.
    raw_tokens = [
        token for token in sentence.split()
        if not token.startswith('@') and "http" not in token.lower()
    ]
    doc_cleaned = clean(" ".join(raw_tokens))
    # print(doc_cleaned)

    # clean() returns a string, so it must be split to iterate over words
    # (iterating the string itself yields single characters).
    word_vectors = []
    for word in doc_cleaned.split():
        if word.isdigit():
            continue
        word = NON_BMP_RE.sub('', word)
        if len(word) > 0 and word in localModel:
            word_vectors.append(np.asarray(localModel[word]))

    if not word_vectors:
        # No word with a known embedding: all-zero scores for this sentence.
        bhv_results.append(np.zeros(n_values))
        continue

    # Assign every word to its nearest centroid (first one wins on ties) and
    # turn the per-value counts into fractions of the counted words.
    nearest = distance.cdist(np.vstack(word_vectors), centroid_matrix, 'euclidean').argmin(axis=1)
    bhv_results.append(np.bincount(nearest, minlength=n_values) / len(word_vectors))

print ("bhv computed successfully")
# reshape keeps the array 2-D even when there were no sentences.
bhv_res = np.asarray(bhv_results).reshape(-1, n_values)
np.savetxt("drive/MyDrive/Colab Notebooks/dataset/BHV/glove_all_bhv_all_tweets.csv", bhv_res, delimiter=",", fmt='%5.5f')