In [133]:
import os
import openai
import pandas as pd 
from tqdm import tqdm
import amrlib
from amrlib.graph_processing.amr_plot import AMRPlot
import numpy as np
from sklearn.model_selection import train_test_split

openai.api_key = ''


In [24]:
df = pd.read_csv('data/tweet_data_2000.csv')
df


Unnamed: 0,OriginalTweet,Sentiment
0,@esc_myriam Waiting for #Coronavirus EI to kic...,Neutral
1,The @AdColony survey reveals consumer preferen...,Neutral
2,COVID-19: Government should promote online sho...,Neutral
3,The Âkey workersÂ whose children can remain ...,Neutral
4,#VegPower esp as the Asian shops are raising t...,Neutral
...,...,...
1995,While shopping online or making trips to the g...,Negative
1996,If you canÂt find any chicken or ground beef ...,Negative
1997,DriverÂs access to critical services has been...,Negative
1998,iPhone 11 prices slashed by Chinese online ret...,Negative


## Grammar Correction using GPT-3 (Davinci Engine)

In [25]:
processed = []
for text in tqdm(df['OriginalTweet']):
    
    response = openai.Completion.create(
      engine="text-davinci-002",
      prompt="Correct this to standard English with one sentence: "+text,
      temperature=0,
      max_tokens=60,
      top_p=1.0,
      frequency_penalty=0.0,
      presence_penalty=0.0
    )
    
    processed.append(response['choices'][0]['text'])

100%|█████████████████████████████████████| 2000/2000 [1:02:41<00:00,  1.88s/it]


In [26]:
df['processed tweet'] = processed

In [38]:
df['processed tweet'] = [t.replace('\n', '').replace('\r', '') for t in df['processed tweet']]

In [63]:
df['processed tweet'] = [re.sub(r'pic.twitter.com/[\w]*','', t) for t in df['processed tweet']]

In [61]:
df.to_csv('cleaned_tweet_data_2000.csv', index = False)

In [62]:
df

Unnamed: 0,OriginalTweet,Sentiment,processed tweet
0,@esc_myriam Waiting for #Coronavirus EI to kic...,Neutral,I am waiting for my EI to kick in so that I ca...
1,The @AdColony survey reveals consumer preferen...,Neutral,The AdColony survey reveals consumer preferenc...
2,COVID-19: Government should promote online sho...,Neutral,The government should promote online shopping ...
3,The Âkey workersÂ whose children can remain ...,Neutral,"The children of ""key workers"" can remain at sc..."
4,#VegPower esp as the Asian shops are raising t...,Neutral,"is important, especially as the prices of hal..."
...,...,...,...
1995,While shopping online or making trips to the g...,Negative,safety is still important when shopping onli...
1996,If you canÂt find any chicken or ground beef ...,Negative,
1997,DriverÂs access to critical services has been...,Negative,The driver's access to critical services has b...
1998,iPhone 11 prices slashed by Chinese online ret...,Negative,Chinese online retailers are slashing iPhone 1...


## AMR Representation

In [None]:
stog = amrlib.load_stog_model()

In [104]:
AMR_graphs = []
for text in tqdm(df['processed tweet']):
    AMR_graphs.append(stog.parse_sents([text], add_metadata = False)[0])

  0%|                                        | 1/2000 [00:12<6:45:41, 12.18s/it]gid=x Start paren present but organization is not a new concept
  4%|█▍                                     | 73/2000 [13:04<6:34:16, 12.28s/it]gid=x Start paren present but i is not a new concept
  4%|█▍                                     | 76/2000 [13:44<5:53:56, 11.04s/it]gid=x Start paren present but update-02 is not a new concept
  4%|█▌                                     | 77/2000 [13:57<6:12:47, 11.63s/it]gid=x Start paren present but company is not a new concept
gid=x Start paren present but name is not a new concept
ignoring epigraph data for duplicate triple: ('c', ':name', 'n')
  4%|█▋                                     | 84/2000 [14:57<4:27:57,  8.39s/it]gid=x Start paren present but name is not a new concept
  4%|█▋                                     | 87/2000 [15:23<4:32:13,  8.54s/it]gid=x Start paren present but name is not a new concept
  5%|█▉                                    | 105/2

In [116]:
df['AMR'] = AMR_graphs

In [117]:
df.to_csv('ProcessedTweet_AMR_tweet_data_2000.csv', index = False)

In [118]:
shuffled_df = df.sample(frac=1)

In [119]:
shuffled_df.to_csv('AMR_ProcessedTweet_data_2000.csv', index = False)

In [120]:
shuffled_df

Unnamed: 0,OriginalTweet,Sentiment,processed tweet,AMR
356,Trump dismisses question on oil prices when re...,Neutral,tweetTrump dismisses question on oil prices w...,(d / dismiss-01\n :ARG0 (p / person\n ...
1568,Are nhs staff able to buy food due to panic bu...,Negative,Can NHS staff buy food due to panic buying?,(p / possible-01\n :ARG1 (b / buy-01\n ...
466,Stocking up: Due to the limitations from #coro...,Neutral,"Amazon announced plans to add 100,000 full-tim...",(a / announce-01\n :ARG0 (c / company\n ...
1817,Sucks that food shelves in Minnesota can't hel...,Negative,It's a shame that food shelves in Minnesota ca...,(o / or\n :op1 (s / shame\n :d...
1451,US senators are under scrutiny over claims the...,Negative,US senators are under scrutiny for allegedly u...,(s / scrutinize-01\n :ARG0 (g / governmen...
...,...,...,...,...
958,Why not lower the PPV prices and donate some o...,Positive,Why not lower the PPV prices and donate some o...,(c / cause-01\n :ARG0 (a / amr-unknown)\n...
712,Have your summer festival plans been cancelled...,Positive,If your summer festival plans have been cancel...,(r / read-01\n :mode imperative\n :A...
349,Do you know what I can do with pinto beans and...,Neutral,What can I do with pinto beans and lettuce?,(d / do-02\n :ARG0 (ii / i)\n :ARG1 ...
1955,"#Factories that used to make perfume, T-shirts...",Negative,"Factories that used to make perfume, T-shirts,...",(m / make-01\n :ARG0 (f / factory\n ...


### Store Results

In [121]:
shuffled_df[['Sentiment']].to_csv('data/ProcessedTweet_label.csv', index=False)

In [122]:
shuffled_df[['processed tweet']].to_csv('data/ProcessedTweet_tweets.csv', index=False)

In [123]:
shuffled_df[['AMR']].to_csv('data/ProcessedTweet_AMR.csv', index=False)

In [129]:
print(shuffled_df['AMR'][41])

(m / multi-sentence
      :snt1 (a / apply-02
            :ARG1 (s / surcharge
                  :prep-for (p / product
                        :name (n / name
                              :op1 "Covid-19")))
            :ARG2 (s2 / serve-01
                  :ARG1 (f / freight
                        :mod (p2 / parcel))
                  :ARG1-of (o / offer-01
                        :ARG0 (a2 / airline
                              :mod (c / cargo)))
                  :mod (a3 / all)))
      :snt2 (c2 / contact-01
            :polite +
            :mode imperative
            :ARG0 (y / you)
            :ARG1 (e / email-address-entity
                  :value "team@freightww.com")
            :purpose (p3 / price-01
                  :ARG1 (c3 / cargo
                        :name (n2 / name
                              :op1 "AirCargoStrong")))))


In [130]:
print(shuffled_df['processed tweet'][41])

A surcharge for Covid-19 has been applied to all parcel freight services offered by cargo airlines. For AirCargoStrong prices, please contact team@freightww.


In [139]:
y = shuffled_df[['Sentiment']]

X_train, X_test, y_train, y_test = train_test_split(shuffled_df, y, test_size=0.2)


In [148]:
X_train.to_csv('data/train.csv', index = False)
X_test.to_csv('data/test.csv', index = False)