In [31]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.layers import Dense,GRU, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

import pickle
import warnings
warnings.filterwarnings("ignore")

In [32]:
df = pd.read_csv(r"./data/sentiment_analysis_results.csv")
df

Unnamed: 0,id,text,sentiment_label,sentiment_score,text_length
0,Dove_final_0,Tuckman's stages of group development is a wid...,neutral,0.925602,2623
1,Dove_final_1,"[Cave Johnson speaking]\n\nFolks, Cave Johnson...",neutral,0.436604,2393
2,Dove_final_2,Given a band-limited signal x(t)x(t) and an id...,neutral,0.843796,1801
3,Dove_final_2,"As we increase the sampling frequency fsfs, th...",neutral,0.829435,1171
4,Dove_final_3,The success of a chief underwriter within an i...,neutral,0.817200,1344
...,...,...,...,...,...
7116,Dove_final_3853,"If the sun were the size of an orange, which i...",neutral,0.759380,310
7117,Dove_final_3854,If the air disappeared from Earth for just 5 s...,negative,0.607713,1592
7118,Dove_final_3855,"Once upon a time, in a quaint little town, the...",positive,0.793843,3439
7119,Dove_final_3856,A subquery in a SELECT statement is a query th...,neutral,0.886471,1352


In [33]:
df["sentiment_label"].value_counts()

sentiment_label
neutral     5332
positive    1496
negative     293
Name: count, dtype: int64

In [34]:
label_encoder = LabelEncoder()
df["label_encoder"] = label_encoder.fit_transform(df["sentiment_label"])

In [35]:
df

Unnamed: 0,id,text,sentiment_label,sentiment_score,text_length,label_encoder
0,Dove_final_0,Tuckman's stages of group development is a wid...,neutral,0.925602,2623,1
1,Dove_final_1,"[Cave Johnson speaking]\n\nFolks, Cave Johnson...",neutral,0.436604,2393,1
2,Dove_final_2,Given a band-limited signal x(t)x(t) and an id...,neutral,0.843796,1801,1
3,Dove_final_2,"As we increase the sampling frequency fsfs, th...",neutral,0.829435,1171,1
4,Dove_final_3,The success of a chief underwriter within an i...,neutral,0.817200,1344,1
...,...,...,...,...,...,...
7116,Dove_final_3853,"If the sun were the size of an orange, which i...",neutral,0.759380,310,1
7117,Dove_final_3854,If the air disappeared from Earth for just 5 s...,negative,0.607713,1592,0
7118,Dove_final_3855,"Once upon a time, in a quaint little town, the...",positive,0.793843,3439,2
7119,Dove_final_3856,A subquery in a SELECT statement is a query th...,neutral,0.886471,1352,1


In [36]:
df["label_encoder"].value_counts()

label_encoder
1    5332
2    1496
0     293
Name: count, dtype: int64

In [37]:
vocab_size= 8000
max_len = 100

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<oov>")
tokenizer.fit_on_texts(df["text"])
sequence = tokenizer.texts_to_sequences(df["text"])

x = pad_sequences( sequence, maxlen=max_len, padding="post")
y = np.array(df["label_encoder"])
x

array([[ 316,    2,  109, ...,    6,  246,  198],
       [   5, 1949, 4090, ..., 4015, 3869,  129],
       [  47,  124,   10, ...,  992,  844, 7569],
       ...,
       [   2, 1432, 2129, ...,    2,  330, 1760],
       [  20,  202, 1389, ...,    7,   28, 3367],
       [4389,   24,  185, ...,    2,   92, 2422]], dtype=int32)

In [38]:
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state=1, train_size=0.8, stratify=y)

In [39]:
y_train

array([1, 1, 1, ..., 2, 1, 1])

In [40]:
model = Sequential()

model.add(Embedding(input_dim=vocab_size, output_dim=64,input_length= max_len))
model.add(GRU(64))
model.add(Dense(units=3, activation="softmax"))

model.compile(optimizer="adam", metrics=["accuracy"], loss="sparse_categorical_crossentropy")
model.fit(x_train, y_train, validation_data=(x_test,y_test), epochs=20, batch_size=64,verbose=2)

Epoch 1/20
89/89 - 6s - 66ms/step - accuracy: 0.7437 - loss: 0.7392 - val_accuracy: 0.7488 - val_loss: 0.6564
Epoch 2/20
89/89 - 4s - 41ms/step - accuracy: 0.7567 - loss: 0.5955 - val_accuracy: 0.7656 - val_loss: 0.5975
Epoch 3/20
89/89 - 4s - 41ms/step - accuracy: 0.8172 - loss: 0.4704 - val_accuracy: 0.7572 - val_loss: 0.6195
Epoch 4/20
89/89 - 4s - 42ms/step - accuracy: 0.8536 - loss: 0.3780 - val_accuracy: 0.7635 - val_loss: 0.6559
Epoch 5/20
89/89 - 4s - 41ms/step - accuracy: 0.8773 - loss: 0.3172 - val_accuracy: 0.7565 - val_loss: 0.7324
Epoch 6/20
89/89 - 4s - 42ms/step - accuracy: 0.8834 - loss: 0.2772 - val_accuracy: 0.7621 - val_loss: 0.7669
Epoch 7/20
89/89 - 4s - 41ms/step - accuracy: 0.8969 - loss: 0.2349 - val_accuracy: 0.7677 - val_loss: 0.8030
Epoch 8/20
89/89 - 4s - 42ms/step - accuracy: 0.9127 - loss: 0.1987 - val_accuracy: 0.8007 - val_loss: 0.8569
Epoch 9/20
89/89 - 4s - 42ms/step - accuracy: 0.9345 - loss: 0.1606 - val_accuracy: 0.8000 - val_loss: 0.9111
Epoch 10/2

<keras.src.callbacks.history.History at 0x1b9beb1ee10>

In [41]:
loss, accuracy =  model.evaluate(x_train, y_train)

[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.9968 - loss: 0.0093


In [42]:
loss, accuracy = model.evaluate(x_test,y_test)

[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.8028 - loss: 1.4964


In [44]:
model.save("GRU_Sentiment.keras")

In [59]:
with open("RNN_tokenizer.pkl","wb") as f:
    pickle.dump(tokenizer,f)

with open("label_encoder.pkl","wb") as f:
    pickle.dump(label_encoder,f)

with open("RNN_max_len.txt","w") as f:
    f.write(str(max_len))

In [58]:
input_text = '''III.C: The potential for AIGC to revolutionize digital identity solutions

In this section, the essay explores the transformative potential of AI-Generated Content (AIGC) in the field of digital identity solutions. As AIGC becomes more advanced and integrated into various industries, it has the capacity to significantly enhance and streamline digital identity verification processes. By leveraging the power of AI, AIGC can enable the development of new tools and methods that improve efficiency, security, and user experience in digital identity management, ultimately shaping the future of identity verification in an increasingly connected world.'''

seq = tokenizer.texts_to_sequences([input_text])
pad_seq = pad_sequences(seq, padding="post", maxlen=max_len)

pred_prob = model.predict(pad_seq)
pred_class =np.argmax(pred_prob,axis=1)[0]
label_encoder.inverse_transform([pred_class])[0]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step


'positive'

In [43]:
### "categorical_crossentropy" - use when one hot encoding used y_train ->y_train = [
  #[1,0,0],
 # [0,1,0],
 # [0,0,1]
#]
### sparse_categorical_crossentropy - use when integer based y_train -> 
#y_train = [0, 2, 1, 0, 1]