In [1]:
import tensorflow as tf
from keras_preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.optimizers import adam_v2
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
import numpy as np
import pandas as pd
import pickle

In [2]:
gmo_csv = pd.read_csv('/content/GMO.csv')
gmo_csv

Unnamed: 0.1,Unnamed: 0,user_name,user_location,user_description,user_verified,date,text,hashtags,source
0,0,Divinity Seven♻️🏥,"Las Vegas, NV","UNLV BS Public Administration, M.Ed, USC MSW, ...",False,2022-08-08 22:40:57+00:00,RT @FDAfood: Consumers and healthcare professi...,"['GMOs', 'FeedYourMind']",Twitter for iPhone
1,1,Marco Pino,Panamá,"Promovamos valores en nuestra sociedad, paz y ...",False,2022-08-08 19:36:54+00:00,RT @FDAfood: Consumers and healthcare professi...,"['GMOs', 'FeedYourMind']",Twitter for Android
2,2,FDA_ORA,,Get the latest information from FDA’s Office o...,True,2022-08-08 19:30:12+00:00,RT @FDAfood: Consumers and healthcare professi...,"['GMOs', 'FeedYourMind']",Twitter for iPhone
3,3,FDA FOOD (Ctr for Food Safety & Applied Nutrit...,"College Park, MD","The latest on food safety, outbreaks, recalls,...",True,2022-08-08 18:51:00+00:00,Consumers and healthcare professionals can now...,"['GMOs', 'FeedYourMind']",Twitter Web App
4,4,Dirt To Dinner,"Connecticut, USA",Providing you with a better understanding of f...,False,2022-08-08 18:45:01+00:00,The message from the mainstream scientific and...,"['CRISPR', 'GMOs']",Sprout Social
...,...,...,...,...,...,...,...,...,...
400,400,sse.ndi,,discombobulated fr,False,2022-08-04 12:40:40+00:00,RT @UgandaBIC: Wheat is 1 of the most importan...,,Twitter for iPhone
401,401,Godfrey Asea,Namulonge,My heart is in crops research for development!,False,2022-08-04 12:39:59+00:00,RT @UgandaBIC: Wheat is 1 of the most importan...,,Twitter for iPhone
402,402,NARO-NaCRRI-Namulonge,"27km, Gayaza-Zirobwe road",Uganda's premier crops resources research & de...,False,2022-08-04 12:39:14+00:00,RT @UgandaBIC: Wheat is 1 of the most importan...,,Twitter for iPhone
403,403,UBIC,NaCRRI-Namulonge,Uganda's premier modern Ag-biosciences informa...,False,2022-08-04 12:36:48+00:00,Wheat is 1 of the most important food crops wo...,,Twitter for iPhone


In [3]:
text_df = pd.DataFrame(gmo_csv['text'])
text_df

Unnamed: 0,text
0,RT @FDAfood: Consumers and healthcare professi...
1,RT @FDAfood: Consumers and healthcare professi...
2,RT @FDAfood: Consumers and healthcare professi...
3,Consumers and healthcare professionals can now...
4,The message from the mainstream scientific and...
...,...
400,RT @UgandaBIC: Wheat is 1 of the most importan...
401,RT @UgandaBIC: Wheat is 1 of the most importan...
402,RT @UgandaBIC: Wheat is 1 of the most importan...
403,Wheat is 1 of the most important food crops wo...


In [10]:
import re
import nltk
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [8]:
def Preprocessing(text):
    text = re.sub(r'[^\w\s]','',text)
    text = text.lower()
    text = [w for w in text.split(' ') if w not in stopwords.words('english')]
    text = [WordNetLemmatizer().lemmatize(token) for token in text]
    text = [WordNetLemmatizer().lemmatize(token,pos='v') for token in text]
    text = " ".join(text)
    return text

In [11]:
text_df['text'].apply(lambda x:Preprocessing(x))

0      rt fdafood consumer healthcare professional fi...
1      rt fdafood consumer healthcare professional fi...
2      rt fdafood consumer healthcare professional fi...
3      consumer healthcare professional find new mate...
4      message mainstream scientific food security co...
                             ...                        
400    rt ugandabic wheat 1 important food crop world...
401    rt ugandabic wheat 1 important food crop world...
402    rt ugandabic wheat 1 important food crop world...
403    wheat 1 important food crop worldwide global w...
404    way reduce nitrogen use maintain crop yield us...
Name: text, Length: 405, dtype: object

In [12]:
def remove_pattern(text,pattern):
    # re.findall() finds the pattern i.e @user and puts it in a list for further task
    r = re.findall(pattern,text)
    # re.sub() removes @user from the sentences in the dataset
    for i in r:
        text = re.sub(i,"",text)
    return text

In [13]:
text_df['text'] = np.vectorize(remove_pattern)(text_df['text'], "@[\w]*")

In [14]:
text_df['text'] = text_df['text'].str.replace("[^a-zA-Z#]", " ")
text_df

  """Entry point for launching an IPython kernel.


Unnamed: 0,text
0,RT Consumers and healthcare professionals ca...
1,RT Consumers and healthcare professionals ca...
2,RT Consumers and healthcare professionals ca...
3,Consumers and healthcare professionals can now...
4,The message from the mainstream scientific and...
...,...
400,RT Wheat is of the most important food cro...
401,RT Wheat is of the most important food cro...
402,RT Wheat is of the most important food cro...
403,Wheat is of the most important food crops wo...


In [15]:
text_df['text'] = text_df['text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
text_df

Unnamed: 0,text
0,Consumers healthcare professionals find materi...
1,Consumers healthcare professionals find materi...
2,Consumers healthcare professionals find materi...
3,Consumers healthcare professionals find materi...
4,message from mainstream scientific food securi...
...,...
400,Wheat most important food crops worldwide glob...
401,Wheat most important food crops worldwide glob...
402,Wheat most important food crops worldwide glob...
403,Wheat most important food crops worldwide glob...


In [16]:
tokenized_tweet = text_df['text'].apply(lambda x: x.split())
tokenized_tweet

0      [Consumers, healthcare, professionals, find, m...
1      [Consumers, healthcare, professionals, find, m...
2      [Consumers, healthcare, professionals, find, m...
3      [Consumers, healthcare, professionals, find, m...
4      [message, from, mainstream, scientific, food, ...
                             ...                        
400    [Wheat, most, important, food, crops, worldwid...
401    [Wheat, most, important, food, crops, worldwid...
402    [Wheat, most, important, food, crops, worldwid...
403    [Wheat, most, important, food, crops, worldwid...
404    [reduce, nitrogen, while, maintaining, crop, y...
Name: text, Length: 405, dtype: object

In [17]:
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = ' '.join(tokenized_tweet[i])

text_df['text'] = tokenized_tweet
text_df

Unnamed: 0,text
0,Consumers healthcare professionals find materi...
1,Consumers healthcare professionals find materi...
2,Consumers healthcare professionals find materi...
3,Consumers healthcare professionals find materi...
4,message from mainstream scientific food securi...
...,...
400,Wheat most important food crops worldwide glob...
401,Wheat most important food crops worldwide glob...
402,Wheat most important food crops worldwide glob...
403,Wheat most important food crops worldwide glob...


In [18]:
words = ' '.join(text for text in text_df['text'])
text = words.lower()

In [19]:
tokenizer = Tokenizer()
corpus = text.lower().split("\n")
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1
print(tokenizer.word_index)
print(total_words)
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

{'gmos': 1, 'food': 2, 'https': 3, 'that': 4, 'grown': 5, 'only': 6, 'farmers': 7, 'monsanto': 8, 'organic': 9, 'bayer': 10, 'like': 11, 'treated': 12, 'terrorists': 13, 'shall': 14, 'economy': 15, 'industrial': 16, 'nature': 17, 'have': 18, 'plants': 19, 'green': 20, 'products': 21, 'plantbased': 22, 'life': 23, 'deregulation': 24, 'since': 25, 'sustained': 26, 'leaves': 27, 'fakefood': 28, 'toxic': 29, 'countries': 30, 'with': 31, 'other': 32, 'farming': 33, 'fields': 34, 'feeding': 35, 'than': 36, 'through': 37, 'about': 38, 'world': 39, 'corn': 40, 'breeding': 41, 'truss': 42, 'vows': 43, 'unleash': 44, 'british': 45, 'including': 46, 'precision': 47, 'technologies': 48, 'code': 49, 'what': 50, 'less': 51, 'american': 52, 'directly': 53, 'consumed': 54, 'cereal': 55, 'from': 56, 'they': 57, 'make': 58, 'pesticides': 59, 'label': 60, 'gmo': 61, 'health': 62, 'this': 63, 'gmofree': 64, 'banned': 65, 'study': 66, 'feedyourmind': 67, 'support': 68, 'crops': 69, 'permaculture': 70, 'gen

In [20]:
# pad sequences
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

In [21]:
# create predictors and label
xs, labels = input_sequences[:,:-1],input_sequences[:,-1]
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [26]:
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_words, activation='softmax'))
adam = adam_v2.Adam(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
#earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='auto')

In [27]:
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 5438, 100)         100900    
                                                                 
 bidirectional_1 (Bidirectio  (None, 300)              301200    
 nal)                                                            
                                                                 
 dense_1 (Dense)             (None, 1009)              303709    
                                                                 
Total params: 705,809
Trainable params: 705,809
Non-trainable params: 0
_________________________________________________________________
None


In [28]:
history = model.fit(xs, ys, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [29]:
print(model)

<keras.engine.sequential.Sequential object at 0x7f8745f51e90>


In [35]:
model.save('GMO_Model.h5')

In [31]:
import pickle

In [34]:
with open('GMO_Model_History.pickle', 'wb') as file_pi:
      pickle.dump(history.history, file_pi)

In [36]:
with open('GMO_Model_History.pickle_backup', 'wb') as file_pi:
      pickle.dump(history, file_pi)



In [37]:
seed_text = "Are genetically"
next_words = 10
#Prediction
for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    predicted = predicted.argmax(axis=-1)
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    seed_text += " " + output_word
print(seed_text)

Are genetically important food crops worldwide global wheat shortage prompted scientists china
