In [2]:
import re
import pandas as pd
from nltk.tokenize import sent_tokenize
import en_core_web_sm
nlp = en_core_web_sm.load()
from tqdm import tqdm
tqdm.pandas()

In [3]:
# Read the tab-separated tweet dump; later cells use its 'text' column.
tweets_path = '../data/tweets.tsv'
tweets = pd.read_csv(tweets_path, sep='\t')

In [4]:
# Relation phrases searched for in lower-cased tweet text. Each tuple groups
# the surface forms of one relation; the families are flattened in order, and
# that order matters because the first phrase that matches a tweet wins.
predicates = [
    phrase
    for family in (
        ('affect', 'affects'),
        ('cause', 'causes'),
        ('inhibit', 'inhibits'),
        ('prevent', 'prevents'),
        ('treat', 'treats'),
        ('lead to', 'leads to'),
        ('increase', 'increases'),
        ('decrease', 'decreases'),
        ('facilitate', 'facilitates'),
        ('hinder', 'hinders'),
        ('stop', 'stops'),
        ('associated with',),
        ('correlated with',),
        ('enable', 'enables'),
        ('are a',),
        ('need more', 'needs more'),
        ('need less', 'needs less'),
        ('support', 'supports'),
        ('lower', 'lowers'),
        ('promote', 'promotes'),
        ('process of', 'reason for', 'reason why', 'higher than', 'lower than'),
    )
    for phrase in family
]

def contains_arg_relation(tweet_sentence, candidates=None):
    """Return the first predicate phrase that occurs inside ``tweet_sentence``.

    A phrase counts as found when it is preceded by one whitespace character
    and followed by one whitespace character plus at least two further
    characters. Phrases are tried in list order and the first hit is returned.

    Args:
        tweet_sentence: Text to search. Matching is case-sensitive.
        candidates: Optional iterable of phrases to look for. Defaults to the
            module-level ``predicates`` list.

    Returns:
        The matching phrase, or "" when no phrase matches.
    """
    if candidates is None:
        candidates = predicates

    for pred in candidates:
        # Raw strings keep `\s` a regex escape instead of an invalid string
        # escape, and re.escape makes the phrase match literally.
        # NOTE: `.` does not match a newline, so the leading `.*` cannot skip
        # past a line break in the text.
        pattern = r'.*\s(' + re.escape(pred) + r')\s.{2,}'
        if re.match(pattern, tweet_sentence) is not None:
            return pred
    return ""


def _find_predicate_span(tokens, pred_tokens):
    """Return (start, end) of the first run in ``tokens`` equal to ``pred_tokens``, or None."""
    width = len(pred_tokens)
    for start in range(len(tokens) - width + 1):
        if tokens[start:start + width] == pred_tokens:
            return start, start + width
    return None


def _is_plausible_argument(tags, poss, ents):
    """Return True when a token run has a noun but no pronoun tag and no PERSON entity."""
    return ('PRP' not in tags
            and 'PRP$' not in tags
            and 'PERSON' not in ents
            and 'NOUN' in poss)


def is_claim(tweet):
    """Decide whether a tweet contains a sentence of the form "<noun ...> <predicate> <noun ...>".

    The tweet is lower-cased, a predicate is picked with
    ``contains_arg_relation``, and every sentence containing that predicate
    surrounded by spaces is parsed with spaCy. A sentence qualifies when the
    tokens before and the tokens after the predicate each contain a NOUN and
    carry neither a PRP / PRP$ tag nor a PERSON entity.

    Args:
        tweet: Raw tweet text. Non-string values (e.g. NaN from a missing
            cell) are treated as "no claim".

    Returns:
        A tuple ``(found, label, sentence)``. ``found`` is a bool. ``label`` is
        the predicate, or ``'claim_question'`` / ``'question'`` when the
        sentence contains a "?". ``sentence`` is the lower-cased sentence.
        ``(False, "", "")`` is returned when nothing qualifies.
    """
    no_claim = (False, "", "")
    if not isinstance(tweet, str):
        return no_claim

    tweet = tweet.lower()
    pred = contains_arg_relation(tweet)
    if pred == "":
        return no_claim

    pred_tokens = pred.split(" ")
    for sent in sent_tokenize(tweet):
        # Only run the parser on sentences that can actually hold the predicate.
        if " " + pred + " " not in sent:
            continue

        doc = nlp(sent)
        texts = [token.lower_ for token in doc]

        # Locate the whole predicate, not just its first word, so a multi-word
        # phrase is not confused with an earlier stray occurrence of that word.
        span = _find_predicate_span(texts, pred_tokens)
        if span is None:
            # Tokenization did not reproduce the phrase; skip this sentence.
            continue
        start, end = span

        tags = [token.tag_ for token in doc]
        poss = [token.pos_ for token in doc]
        ents = [token.ent_type_ for token in doc]

        # Both sides of the predicate must look like a non-personal noun phrase.
        if not _is_plausible_argument(tags[:start], poss[:start], ents[:start]):
            continue
        if not _is_plausible_argument(tags[end:], poss[end:], ents[end:]):
            continue

        if "?" in sent:
            if " how " in sent or "when " in sent or "why " in sent:
                return True, 'claim_question', sent
            return True, 'question', sent
        return True, pred, sent

    return no_claim

# Quick sanity check: the second sentence contains the predicate 'causes'.
is_claim('Hello its me. Twitter causes deafness')

(True, 'causes', 'twitter causes deafness')

In [5]:
# Classify every tweet; each result is an (is_claim, claim_pred, claim_sentence) tuple.
claim_columns = ['is_claim', 'claim_pred', 'claim_sentence']
claim_rows = tweets['text'].progress_apply(is_claim).tolist()

# Split the tuples into one column each. Picking the fields by position also
# works for an empty frame, where transposing with zip(*...) yields nothing
# to index into.
for position, column in enumerate(claim_columns):
    tweets[column] = [row[position] for row in claim_rows]

100%|██████████| 100000/100000 [00:24<00:00, 4097.39it/s]


In [6]:
# Show the sentences of all tweets that were flagged as claims.
claim_mask = tweets['is_claim']
print(tweets.loc[claim_mask, 'claim_sentence'])

2173                          boredom leads to desperation
2583     so very proud of the canberra community that c...
2692     learn d laws of the spirit and d fierce winds ...
7091     @bdayspring would hate for u to stop political...
7250              "@shaado_9: toothaches are a bitch ☹ .."
                               ...                        
98458      yes amen a superior thread berets are a godsend
98504    @realdonaldtrump stop trying to kill the ameri...
99146    this 🍊🤡 continues to set the bar lower & lower...
99655    delusional to think four seasons would come to...
99900    touchdown‼️‼️ @maxgilliam11 finds @steve4six f...
Name: claim_sentence, Length: 280, dtype: object
