In [2]:
import re
import spacy
import pandas as pd
from trieregex import TrieRegEx as TRE
from spacy.matcher import Matcher

In [2]:
# Load spaCy's small English pipeline; its vocab backs the matchers below and
# it turns every text into a Doc before matching.
nlp = spacy.load("en_core_web_sm")
# Remove the row cap on DataFrame display: a frame shown without .head()
# will render all of its rows.
pd.set_option('display.max_rows', None) 



# Pattern definitions

In [3]:
 
# create a SpaCy matcher
matcher = Matcher(nlp.vocab)

# --- Building blocks shared by the three patterns ---------------------------
# First token: the subject being talked about (gpt / chatgpt spellings, "ai",
# "artificial", "chat" or "it").
SUBJECT_REGEX = "\\b\\(?#?gpt-?3?\\)?\\b|\\b\\(?#?chat-?gpt3?-?\\)?\\b|\\bai\\b|\\bartificial\\b|\\bchat\\b|\\bit\\b"
# Optional second token completing a two-word subject ("gpt", "intelligence", "3").
SUBJECT_SUFFIX_REGEX = "\\bgpt\\b|\\bintelligence\\b|\\b3\\b"
# Linking word placed around the verb ("to", "explain(s)", "may", "might",
# "can", "could", "for").
LINK_REGEX = "\\bto\\b|\\bexplains?\\b|\\bmay\\b|\\bmight\\b|\\bcan\\b|\\bcould\\b|\\bfor\\b"
# Token texts that are rejected inside the object phrase.
EXCLUDED_WORDS = ['him','her','it', 'me', "this", "you", "us", "them", "that","those", "these"]


def _subject_tokens():
    """Return fresh token specs for the subject: one required token plus an optional suffix."""
    return [
        {"TEXT": {"REGEX": SUBJECT_REGEX}},
        {"TEXT": {"REGEX": SUBJECT_SUFFIX_REGEX}, 'OP': '?'},
    ]


def _object_tokens():
    """Return fresh token specs for the object phrase that closes every pattern.

    Optional determiner, any number of possessives, adjectival modifiers and
    compounds, then a required token whose dependency is ``dobj`` or ``pobj``.
    Tokens whose text is in ``EXCLUDED_WORDS`` are not accepted (the ``poss``
    slot carries no such restriction).
    """
    return [
        {'POS': 'DET', 'OP': '?', 'TEXT': {'NOT_IN': list(EXCLUDED_WORDS)}},
        {"DEP" : "poss", 'OP': '*'},
        {"DEP" : "amod", 'OP': '*', 'TEXT': {'NOT_IN': list(EXCLUDED_WORDS)}},
        {"DEP" : "compound", 'OP': '*', 'TEXT': {'NOT_IN': list(EXCLUDED_WORDS)}},
        {'DEP': {"IN":["dobj", "pobj"]}, 'TEXT': {'NOT_IN': list(EXCLUDED_WORDS)}},
    ]


# pattern: subject + linking word + adverbs* + verb + optional linking word + object
pattern = (
    _subject_tokens()
    + [
        {"TEXT": {"REGEX": LINK_REGEX}},
        {"POS": "ADV", 'OP': '*'},
        {"POS": "VERB"},
        {"TEXT": {"REGEX": LINK_REGEX}, 'OP': '?'},
    ]
    + _object_tokens()
)

# pattern2: subject + ability phrasing ("is able to", "can be used for", ...)
#           + adverbs* + verb + optional linking word + object
pattern2 = (
    _subject_tokens()
    + [
        {"TEXT": {"REGEX": "\\bknows\\b|\\bhow\\b|\\bcan\\b"}, 'OP': '?'},
        {"TEXT": {"REGEX": "(\\bis\\b|\\bbe\\b|\\bcan\\b|\\bhow\\b|\\bto\\b|\\btried\\b|\\btry\\b|\\btries\\b)"}},
        {"TEXT": {"REGEX": "(\\bcapable\\b|\\bsupport\\b|\\bhelp\\b|\\bto\\b|\\bbe\\b|\\bable\\b|\\bused\\b)"}},
        {"TEXT": {"REGEX": "(\\bin\\b|\\bat\\b|\\bof\\b|\\bto\\b|\\bfor\\b)"}, 'OP': '?'},
        {"POS": "ADV", 'OP': '*'},
        {"POS": "VERB"},
        # Same linking words as elsewhere, plus "tried" / "try".
        {"TEXT": {"REGEX": LINK_REGEX + "|\\btried\\b|\\btry\\b"}, 'OP': '?'},
    ]
    + _object_tokens()
)

# pattern3: subject directly followed by adverbs* + verb (lemma other than
#           "be"/"have") + optional linking word + object
pattern3 = (
    _subject_tokens()
    + [
        {"POS": "ADV", 'OP': '*'},
        {"POS": "VERB", "LEMMA":{"NOT_IN":["be","have"]}},
        {"TEXT": {"REGEX": LINK_REGEX}, 'OP': '?'},
    ]
    + _object_tokens()
)

In [4]:
# Register every pattern under its own rule name, using the matcher's
# "LONGEST" greedy filter for each rule.
for rule_name, rule_pattern in (
    ("action", pattern),
    ("action2", pattern2),
    ("action3", pattern3),
):
    matcher.add(rule_name, [rule_pattern], greedy = "LONGEST")
# definition of a function to call the matcher on a text
def action(text):
    """Return every span of ``text`` found by ``matcher``, joined with "; ".

    Parameters
    ----------
    text : str
        Text to analyse. Any non-string value (for example a missing value
        coming from pandas) is treated as having no matches.

    Returns
    -------
    str
        The matched span texts, in the order the matcher yields them,
        separated by "; ". Empty string when nothing matches or when
        ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # the text is transformed into a SpaCy doc
    doc = nlp(text)

    # The matcher yields (match_id, start, end) triples; only the token
    # boundaries are needed to recover the matched text.
    return "; ".join(doc[start:end].text for _, start, end in matcher(doc))

# Dataset import


In [5]:
# Alternative input kept for reference:
#df = pd.read_excel(r'D:\Dataset\golden_set_results.xlsx')
tweets_csv = r'D:\Dataset\chatgpt_tweets_24_jan_full.csv'
df = pd.read_csv(tweets_csv, encoding = "latin1")

# Lower-case the tweet text, keep only the rows tagged as English and renumber
# the rows from 0. (For a quick trial, add `.sample(1000)` before `.reset_index`.)
df = (
    df.assign(text=df["text"].str.lower())
      .loc[lambda frame: frame["lang"] == "en"]
      .reset_index(drop=True)
)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 625167 entries, 0 to 625166
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   Unnamed: 0     625167 non-null  int64 
 1   tweet_id       625167 non-null  int64 
 2   text           625167 non-null  object
 3   created_at     625167 non-null  object
 4   lang           625167 non-null  object
 5   user_username  625167 non-null  object
dtypes: int64(2), object(4)
memory usage: 28.6+ MB


### Eliminate Emojis and collapse multiple spaces

In [6]:

# Keep a copy of the text as it was before cleaning.
df["original_text"] = df["text"]

# Regular expression for emoticon placeholders of the form <u+XXXX>
emoticon_pattern = r'<u\+[a-zA-Z0-9]+>'

# In one pass per tweet: turn each emoticon placeholder into a space, then
# squeeze every run of spaces down to a single space.
df["text"] = df["text"].apply(
    lambda tweet: re.sub(' +', ' ', re.sub(emoticon_pattern, " ", tweet))
)


In [7]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,tweet_id,text,created_at,lang,user_username,original_text
0,1,1,@chatgpt_erc_bot #image go bolsonaro and down ...,2022-12-21T00:04:03.000Z,en,gembry,@chatgpt_erc_bot #image go bolsonaro and down ...
1,2,2,@200ok must have realised it was going to take...,2022-12-21T00:03:53.000Z,en,madwebskills,@200ok must have realised it was going to take...
2,3,3,"@mikestottwp ok chat gpt wins, this is a bette...",2022-12-21T00:03:35.000Z,en,scottbolinger,"@mikestottwp ok chat gpt wins, this is a bette..."
3,4,4,@maybettl @ahmetb good application for chatgpt,2022-12-21T00:03:29.000Z,en,cwkoss,@maybettl @ahmetb good application for chatgpt
4,5,5,@allesistkode same here. i'm just testing the ...,2022-12-21T00:03:21.000Z,en,ravinsharma7,@allesistkode same here. i'm just testing the ...
5,6,6,the brilliance and weirdness of chatgpt\r\n\r\...,2022-12-21T00:03:09.000Z,en,itsrohitchouhan,the brilliance and weirdness of chatgpt\r\n\r\...
6,7,7,seems like learning from human feedback is bec...,2022-12-21T00:03:08.000Z,en,mc_dropout,seems like learning from human feedback is bec...
7,8,8,@impeedz @bmeiselas @bhaviklathia dude those g...,2022-12-21T00:03:03.000Z,en,AlGoody421,@impeedz @bmeiselas @bhaviklathia dude those g...
8,9,9,"when talking to #chatgpt, this is the mental i...",2022-12-21T00:03:03.000Z,en,Popbones,"when talking to #chatgpt, this is the mental i..."
9,10,10,"extrahop senior technical manager, jamie moles...",2022-12-21T00:03:00.000Z,en,ExtraHop,"extrahop senior technical manager, jamie moles..."


# Extraction

In [8]:
# Run the matcher on every tweet. Only the `text` column is needed, so apply
# `action` to that column directly instead of building a row object per tweet.
# `match` holds the "; "-joined spans, or "" when nothing matched.
df['match'] = df['text'].apply(action)
df.head(10)

Unnamed: 0.1,Unnamed: 0,tweet_id,text,created_at,lang,user_username,original_text,match
0,1,1,@chatgpt_erc_bot #image go bolsonaro and down ...,2022-12-21T00:04:03.000Z,en,gembry,@chatgpt_erc_bot #image go bolsonaro and down ...,
1,2,2,@200ok must have realised it was going to take...,2022-12-21T00:03:53.000Z,en,madwebskills,@200ok must have realised it was going to take...,
2,3,3,"@mikestottwp ok chat gpt wins, this is a bette...",2022-12-21T00:03:35.000Z,en,scottbolinger,"@mikestottwp ok chat gpt wins, this is a bette...",
3,4,4,@maybettl @ahmetb good application for chatgpt,2022-12-21T00:03:29.000Z,en,cwkoss,@maybettl @ahmetb good application for chatgpt,
4,5,5,@allesistkode same here. i'm just testing the ...,2022-12-21T00:03:21.000Z,en,ravinsharma7,@allesistkode same here. i'm just testing the ...,
5,6,6,the brilliance and weirdness of chatgpt\r\n\r\...,2022-12-21T00:03:09.000Z,en,itsrohitchouhan,the brilliance and weirdness of chatgpt\r\n\r\...,
6,7,7,seems like learning from human feedback is bec...,2022-12-21T00:03:08.000Z,en,mc_dropout,seems like learning from human feedback is bec...,
7,8,8,@impeedz @bmeiselas @bhaviklathia dude those g...,2022-12-21T00:03:03.000Z,en,AlGoody421,@impeedz @bmeiselas @bhaviklathia dude those g...,
8,9,9,"when talking to #chatgpt, this is the mental i...",2022-12-21T00:03:03.000Z,en,Popbones,"when talking to #chatgpt, this is the mental i...",
9,10,10,"extrahop senior technical manager, jamie moles...",2022-12-21T00:03:00.000Z,en,ExtraHop,"extrahop senior technical manager, jamie moles...",


# Print the list of matched entities

In [9]:
#for i in df['match']:
    #if i != "":
        #print (i)

# Creation of Tidy Dataset

In [33]:
df_id = []
df_match = []

tweets_without_match = 0
tweets_with_match = 0

# Build one output row per matched span. Iterating the two columns in
# lockstep avoids per-row label lookups (which also only work while the
# index is exactly 0..n-1).
for tweet_id, matched in zip(df['tweet_id'], df['match']):
    if matched == "":
        tweets_without_match += 1
        continue

    tweets_with_match += 1
    # A tweet with several matches stores them separated by "; ". split()
    # returns a one-element list when the separator is absent, so single
    # matches need no special case.
    for span_text in matched.strip().split("; "):
        df_match.append(span_text)
        df_id.append(tweet_id)

tidy = pd.DataFrame({"tweet_id":df_id, "match":df_match})
tidy.head()

Unnamed: 0,tweet_id,match
0,14,it to write a book
1,43,it cant pipette
2,47,chatgpt to optimize your content strategy
3,49,it takes skill
4,51,chatgpt to optimize your content strategy


## Splitting Verbs and Clues

In [46]:
# Second matcher: recognises only the leading "clue" part of a match (the
# subject plus any linking / ability words), i.e. each action pattern without
# its verb and object tokens.
matcher_clue = Matcher(nlp.vocab)
pattern_clue1 = [
           {"TEXT": {"REGEX": "\\b\\(?#?gpt-?3?\\)?\\b|\\b\\(?#?chat-?gpt3?-?\\)?\\b|\\bai\\b|\\bartificial\\b|\\bchat\\b|\\bit\\b"}}, 
           {"TEXT": {"REGEX": "\\bgpt\\b|\\bintelligence\\b|\\b3\\b"}, 'OP': '?'}, 
           {"TEXT": {"REGEX": "\\bto\\b|\\bexplains?\\b|\\bmay\\b|\\bmight\\b|\\bcan\\b|\\bcould\\b|\\bfor\\b"}}
]

pattern_clue2 = [
           {"TEXT": {"REGEX": "\\b\\(?#?gpt-?3?\\)?\\b|\\b\\(?#?chat-?gpt3?-?\\)?\\b|\\bai\\b|\\bartificial\\b|\\bchat\\b|\\bit\\b"}}, 
           {"TEXT": {"REGEX": "\\bgpt\\b|\\bintelligence\\b|\\b3\\b"}, 'OP': '?'}, 
           {"TEXT": {"REGEX": "\\bknows\\b|\\bhow\\b|\\bcan\\b"}, 'OP': '?'},
           {"TEXT": {"REGEX": "(\\bis\\b|\\bbe\\b|\\bcan\\b|\\bhow\\b|\\bto\\b|\\btried\\b|\\btry\\b|\\btries\\b)"}},
           {"TEXT": {"REGEX": "(\\bcapable\\b|\\bsupport\\b|\\bhelp\\b|\\bto\\b|\\bbe\\b|\\bable\\b|\\bused\\b)"}},
           {"TEXT": {"REGEX": "(\\bin\\b|\\bat\\b|\\bof\\b|\\bto\\b|\\bfor\\b)"}, 'OP': '?'},
           ]

pattern_clue3 = [
           {"TEXT": {"REGEX": "\\b\\(?#?gpt-?3?\\)?\\b|\\b\\(?#?chat-?gpt3?-?\\)?\\b|\\bai\\b|\\bartificial\\b|\\bchat\\b|\\bit\\b"}}, 
           {"TEXT": {"REGEX": "\\bgpt\\b|\\bintelligence\\b|\\b3\\b"}, 'OP': '?'}
]

matcher_clue.add("clue", [pattern_clue1, pattern_clue2, pattern_clue3], greedy = "LONGEST")


def _first_clue(match_text):
    """Return the text of the first span ``matcher_clue`` yields for ``match_text``.

    Returns "" when the clue matcher finds nothing.
    """
    clue_doc = nlp(match_text)
    for _, start, end in matcher_clue(clue_doc):
        # Only the first reported span is used as the clue.
        return clue_doc[start:end].text
    return ""


# NOTE: the working variable is deliberately not called `action`; reusing that
# name here would overwrite the action() function defined earlier and break
# any later re-run of the extraction cell.
df_clues = [_first_clue(match_text) for match_text in df_match]

tidy["clues"]=df_clues
tidy.head(10)

Unnamed: 0,tweet_id,match,clues,actions,check_verb
0,14,it to write a book,it to,write a book,1
1,43,it cant pipette,it cant,pipette,1
2,47,chatgpt to optimize your content strategy,chatgpt to,optimize your content strategy,1
3,49,it takes skill,it,skill,1
4,51,chatgpt to optimize your content strategy,chatgpt to,optimize your content strategy,1
5,57,chatgpt to optimize your content strategy,chatgpt to,optimize your content strategy,1
6,60,chatgpt to optimize your content strategy,chatgpt to,optimize your content strategy,1
7,62,chatgpt to optimize your content strategy,chatgpt to,optimize your content strategy,1
8,71,it hurts students,it,students,1
9,76,chatgpt to optimize your content strategy,chatgpt to,optimize your content strategy,1


In [47]:
df_verbs = []

def remove_non_ascii(s):
    """Return ``s`` with every character whose code point is 128 or above removed."""
    return s.encode("ascii", errors="ignore").decode("ascii")

def _strip_clue(match_text, clue):
    """Return ``match_text`` with a single occurrence of ``clue`` (and its trailing space) removed.

    The clue is removed from the start of the match when it is a prefix;
    otherwise only its first occurrence is removed. An empty clue leaves the
    match untouched. Removing every occurrence would also cut letters out of
    later words (e.g. clue "it" inside "exploit the"), and an empty clue
    would delete all spaces.
    """
    if not clue:
        return match_text
    prefix = clue + " "
    if match_text.startswith(prefix):
        return match_text[len(prefix):]
    return match_text.replace(prefix, "", 1)


# The action is what is left of the match once the clue is removed, restricted
# to ASCII, with underscores turned into spaces and repeated spaces collapsed.
df_verbs = []
for match_text, clue in zip(tidy["match"], tidy["clues"]):
    verb = _strip_clue(match_text, clue)
    verb = remove_non_ascii(verb)
    verb = re.sub('_',' ', verb)
    verb = re.sub(' +', ' ', verb)
    df_verbs.append(verb)
tidy["actions"] = df_verbs
tidy.head(10)

Unnamed: 0,tweet_id,match,clues,actions,check_verb
0,14,it to write a book,it to,write a book,1
1,43,it cant pipette,it cant,pipette,1
2,47,chatgpt to optimize your content strategy,chatgpt to,optimize your content strategy,1
3,49,it takes skill,it,takes skill,1
4,51,chatgpt to optimize your content strategy,chatgpt to,optimize your content strategy,1
5,57,chatgpt to optimize your content strategy,chatgpt to,optimize your content strategy,1
6,60,chatgpt to optimize your content strategy,chatgpt to,optimize your content strategy,1
7,62,chatgpt to optimize your content strategy,chatgpt to,optimize your content strategy,1
8,71,it hurts students,it,hurts students,1
9,76,chatgpt to optimize your content strategy,chatgpt to,optimize your content strategy,1


## Verbs Cleaning 

In [48]:
import nltk
# Uncomment on first use to download the 'words' resource read below.
#nltk.download('words')
from nltk.corpus import words

# Materialise the nltk 'words' corpus as a Python list.
# NOTE(review): word_list is not referenced in the cells visible here; confirm it is still needed.
word_list = words.words()

In [49]:
# Flag every action: 1 when it contains only word characters and spaces,
# 0 as soon as any other character (punctuation, symbols, ...) is present.
check_verb = [
    0 if re.search(r'[^\w ]', verb_phrase) else 1
    for verb_phrase in tidy["actions"]
]

print(len(check_verb))
tidy["check_verb"] = check_verb

91739


In [51]:
tidy.head()

Unnamed: 0,tweet_id,match,clues,actions,check_verb
0,14,it to write a book,it to,write a book,1
1,43,it cant pipette,it cant,pipette,1
2,47,chatgpt to optimize your content strategy,chatgpt to,optimize your content strategy,1
3,49,it takes skill,it,takes skill,1
4,51,chatgpt to optimize your content strategy,chatgpt to,optimize your content strategy,1


In [52]:
# The exported table uses singular column names.
singular_names = {"clues": "clue", "actions": "action"}
tidy_final = tidy.rename(columns=singular_names)
tidy_final.head()

Unnamed: 0,tweet_id,match,clue,action,check_verb
0,14,it to write a book,it to,write a book,1
1,43,it cant pipette,it cant,pipette,1
2,47,chatgpt to optimize your content strategy,chatgpt to,optimize your content strategy,1
3,49,it takes skill,it,takes skill,1
4,51,chatgpt to optimize your content strategy,chatgpt to,optimize your content strategy,1


In [54]:
# Write the final table to disk. No `index` argument is passed, so pandas also
# writes the row index as an unnamed first column.
# NOTE(review): the path is an absolute local Windows path; adjust before running elsewhere.
tidy_final.to_csv("D:\\Dataset\\final_extraction.csv")

In [16]:
#df2 = df.merge(tidy, on="tweet_id", how="left")