In [1]:
import os, sys
from google.colab import drive
# Mount Google Drive so the notebook can read and write files under /content/drive.
drive.mount('/content/drive')

# Link the Drive notebook folder to a path without spaces and put it on the import path.
nb_path = '/content/notebooks'
try:
    os.symlink('/content/drive/My Drive/Colab Notebooks', nb_path)
except FileExistsError:
    # The link is left over from an earlier run of this cell; nothing to do.
    pass
except OSError as err:
    # Still best-effort (the cell keeps running), but the failure is no longer silent.
    print(f'Could not create symlink {nb_path}: {err}')
# Avoid stacking duplicate entries when the cell is re-run.
if nb_path not in sys.path:
    sys.path.insert(0, nb_path)


Mounted at /content/drive


In [2]:
#!pip install --target=$nb_path transformer-srl==2.4.6

import transformer_srl
import pandas as pd
import numpy as np
from transformer_srl import dataset_readers, models, predictors
import re
from spacy.lemmatizer import Lemmatizer
import spacy 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# SRL predictor loaded from the model archive stored on Drive.
predictor = predictors.SrlTransformersPredictor.from_path(
    "/content/drive/MyDrive/Northwell Pipelines/Shared Pipeline Files/srl_bert_base_conll2012.tar.gz",
    "transformer_srl",
)
# spaCy pipeline used by semantic_graphs to lemmatize predicates and argument words.
nlp = spacy.load("en_core_web_sm")
# Maps pronoun surface forms to one canonical lowercase form per referent.
# 'myself' previously mapped to 'I' (capitalised), which is neither a key of this
# dictionary nor an NLTK stop word, so it showed up as a node distinct from 'i'.
pronouns_dic = {
    'i': 'i', 'he': 'he', 'she': 'she', 'you': 'you', 'we': 'we', 'they': 'they',
    'me': 'i', 'my': 'i', 'mine': 'i',
    'your': 'you', 'yours': 'you',
    'him': 'he', 'his': 'he',
    'her': 'she', 'hers': 'she',
    'us': 'we', 'ours': 'we', 'our': 'we',
    'their': 'they', 'theirs': 'they', 'them': 'they',
    'its': 'it', 'it': 'it',
    "'em": 'they',
    'myself': 'i',
    'that': 'that', 'this': 'this', 'those': 'those', 'these': 'these',
}
# English stop words; semantic_graphs drops these unless they are pronoun keys.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]



In [4]:
# Position of each core role inside a [predicate, actor, undergoer1, undergoer2] frame.
_ROLE_SLOT = {'ARG0': 1, 'ARG1': 2, 'ARG2': 3}


def _parse_role(bracketed):
  """Split one '[ROLE: some words]' span into (role, words); None if there is no 'ROLE: ' prefix."""
  # maxsplit=1 keeps arguments that themselves contain ': ' intact.
  parts = bracketed[1:-1].split(': ', 1)
  if len(parts) < 2:
    return None
  # split() without a separator also discards empty strings caused by repeated spaces.
  return parts[0], parts[1].split()


def _normalize_word(word):
  """Return the pronouns_dic value for a pronoun, otherwise the spaCy lemma of the word."""
  lowered = word.lower()
  if lowered in pronouns_dic:
    return pronouns_dic[lowered]
  doc = nlp(word)
  # If spaCy splits the word into several tokens, the lemma of the last one is kept.
  return doc[-1].lemma_ if len(doc) else word


def _content_words(words):
  """Keep words that are pronoun keys or are not stop words; a missing role (None) gives []."""
  if words is None:
    return []
  return [word for word in words if word in pronouns_dic or word not in stop_words]


def semantic_graphs(sentence_instance):
  """Build word-level relation counts from the semantic roles of one sentence.

  The sentence is passed to `predictor.predict`. For every entry in the
  returned 'verbs' list, the bracketed spans of its 'description' string are
  parsed and only ARG0 (actor), ARG1 (undergoer1) and ARG2 (undergoer2) are
  kept. Argument words are normalized (pronouns via `pronouns_dic`, all other
  words via spaCy lemmas) and then filtered: a word is kept if it is a key of
  `pronouns_dic` or is not in `stop_words`.

  Args:
    sentence_instance: Sentence text handed to the SRL predictor.

  Returns:
    A tuple `(semroles, ap_matrix, pa_matrix, semantic_matrix)`:
      semroles: the unmodified predictor output.
      ap_matrix: dict mapping (actor word, undergoer word) to a count.
      pa_matrix: dict mapping (predicate lemma, argument word) to a count.
      semantic_matrix: dict holding the summed counts of both dicts above.
  """
  semroles = predictor.predict(sentence_instance)

  # One [predicate, actor, undergoer1, undergoer2] frame per detected verb.
  frames = []
  for verb_entry in semroles['verbs']:
    predicate_doc = nlp(verb_entry['verb'])
    # The lemma of the first token is used as the predicate label.
    predicate = predicate_doc[0].lemma_ if len(predicate_doc) else verb_entry['verb']
    frame = [predicate, None, None, None]
    for bracketed in re.findall(r'\[.*?\]', verb_entry['description']):
      parsed = _parse_role(bracketed)
      if parsed is None:
        # Bracketed text without a role label (e.g. literal brackets in the
        # sentence); previously this reused the preceding span's data or
        # raised NameError when it was the first span.
        continue
      role, words = parsed
      if role not in _ROLE_SLOT:
        # Skip V, ARGM-* etc. before lemmatizing, so spaCy is not run on
        # words that would be thrown away.
        continue
      # If a role occurs more than once for the same verb, the last span wins.
      frame[_ROLE_SLOT[role]] = [_normalize_word(word) for word in words]
    frames.append(frame)

  ap_matrix = {}
  pa_matrix = {}
  for predicate, actor, undergoer1, undergoer2 in frames:
    actors = _content_words(actor)
    undergoers = _content_words(undergoer1) + _content_words(undergoer2)

    # Actor-undergoer pairs: empty when the actor or both undergoers are
    # missing, which matches the former dropna-based row selection.
    for each_agent in actors:
      for each_patient in undergoers:
        ap_pair = (each_agent, each_patient)
        ap_matrix[ap_pair] = ap_matrix.get(ap_pair, 0) + 1

    # Predicate-argument pairs over every kept argument word.
    for each_argument in actors + undergoers:
      pa_pair = (predicate, each_argument)
      pa_matrix[pa_pair] = pa_matrix.get(pa_pair, 0) + 1

  # Combined graph: actor-undergoer counts first, then predicate-argument counts added in.
  semantic_matrix = dict(ap_matrix)
  for relation, count in pa_matrix.items():
    semantic_matrix[relation] = semantic_matrix.get(relation, 0) + count
  return semroles, ap_matrix, pa_matrix, semantic_matrix

In [5]:
# Load the filtered token table and keep only rows with a non-null 'sp.pos' value.
df = pd.read_csv('/content/drive/MyDrive/Northwell Pipelines/Youtube Pipeline/7_filtered_words/filtered_words_youtube.csv')
df = df[df['sp.pos'].notna()]

df_open_tokens = df[['uid', 'task', 'sentence_id', 'token']]

# Rebuild one text string per (uid, task, sentence_id) by joining its tokens in row order.
# sort=False keeps groups in order of first appearance, like the previous unique()-based loops.
# Rows whose uid, task or sentence_id is NaN are skipped by groupby; a NaN sentence_id
# used to raise IndexError here.
open_extracted = []
for uid, subject_speech in df_open_tokens.groupby('uid', sort=False):
  for task, task_speech in subject_speech.groupby('task', sort=False):
    for sentence_id, sentence_tokens in task_speech.groupby('sentence_id', sort=False):
      extracted_sentence = ' '.join(str(token) for token in sentence_tokens['token']) + '.'
      open_extracted.append([uid, task, sentence_id, extracted_sentence])

open_speech_df = pd.DataFrame(data=open_extracted, columns=['uid', 'task', 'sentence_id', 'content'])
open_speech_df.head(5)

Unnamed: 0,uid,task,sentence_id,content
0,Y4_S9,journaling,1,the psychiatrist did not.
1,Y4_S9,journaling,7,i am not completely like other people.
2,Y4_S9,journaling,9,people dislike me because i am not completely ...
3,Y4_S9,journaling,11,i am trying to do with my life something which...
4,Y4_S9,journaling,12,and this influences my thinking and consequent...


In [6]:
# Cast both identifier columns to text, then join them into one '<uid>_<task>' key.
for id_column in ('uid', 'task'):
  open_speech_df[id_column] = open_speech_df[id_column].astype(str)
open_speech_df['key'] = open_speech_df['uid'].str.cat(open_speech_df['task'], sep='_')
# Number of distinct uid/task combinations.
open_speech_df['key'].nunique(dropna=False)

48

In [7]:
# Run semantic_graphs on every rebuilt sentence and keep its identifiers next to the results.
sentence_level_relations = []
for _, sentence_row in open_speech_df.iterrows():
  semroles, ap_matrix, pa_matrix, semantic_matrix = semantic_graphs(sentence_row['content'])
  sentence_level_relations.append([
      sentence_row['uid'],
      sentence_row['task'],
      sentence_row['sentence_id'],
      sentence_row['content'],
      ap_matrix,
      pa_matrix,
      semantic_matrix,
      semroles,
  ])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [8]:
# Column names follow the order in which each entry of sentence_level_relations was filled.
sentence_level_columns = [
    'uid', 'task', 'sentence_id', 'content',
    'ap_relations', 'pa_relations', 'semantic_relations', 'srl',
]
sentence_level_df = pd.DataFrame(sentence_level_relations, columns=sentence_level_columns)
# Write the sentence-level table to the pipeline's output folder on Drive.
sentence_level_df.to_csv('/content/drive/MyDrive/Northwell Pipelines/Youtube Pipeline/8_semantic_roles/srl_sentence_level_youtube.csv')

In [9]:
# Preview the first five rows (head() defaults to n=5).
sentence_level_df.head()

Unnamed: 0,uid,task,sentence_id,content,ap_relations,pa_relations,semantic_relations,srl
0,Y4_S9,journaling,1,the psychiatrist did not.,{},{},{},"{'verbs': [{'verb': 'did', 'description': 'the..."
1,Y4_S9,journaling,7,i am not completely like other people.,{},"{('be', 'i'): 1, ('be', 'completely'): 1, ('be...","{('be', 'i'): 1, ('be', 'completely'): 1, ('be...","{'verbs': [{'verb': 'am', 'description': '[ARG..."
2,Y4_S9,journaling,9,people dislike me because i am not completely ...,"{('people', 'i'): 1}","{('dislike', 'people'): 1, ('dislike', 'i'): 1...","{('people', 'i'): 1, ('dislike', 'people'): 1,...","{'verbs': [{'verb': 'dislike', 'description': ..."
3,Y4_S9,journaling,11,i am trying to do with my life something which...,"{('i', 'i'): 2, ('i', 'life'): 2, ('i', 'somet...","{('try', 'i'): 2, ('try', 'life'): 1, ('try', ...","{('i', 'i'): 2, ('i', 'life'): 2, ('i', 'somet...","{'verbs': [{'verb': 'am', 'description': 'i [b..."
4,Y4_S9,journaling,12,and this influences my thinking and consequent...,"{('this', 'i'): 2, ('this', 'think'): 1, ('thi...","{('influence', 'this'): 1, ('influence', 'i'):...","{('this', 'i'): 2, ('this', 'think'): 1, ('thi...","{'verbs': [{'verb': 'influences', 'description..."
