In [None]:
# Attach Google Drive to the Colab runtime; the dataset CSVs read further down
# live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive/


In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from keras_preprocessing.text import Tokenizer
from string import punctuation
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')


from google.colab import files

# --- Load the labelled training tweets --------------------------------------
# keep_default_na=False stops pandas from converting empty cells (or tokens
# such as "NA") into NaN, so clean_text values are read as plain strings.
df = pd.read_csv('/content/drive/MyDrive/*path_to_dataset*/datasets/tweets_train.csv', keep_default_na=False)
df = df[['clean_text','Sentiment']]

# One-hot encode the sentiment label, one indicator column per distinct value.
# dtype=int pins the indicators to 0/1 integers; pandas >= 2.0 would otherwise
# produce booleans.
one_hot = pd.get_dummies(df["Sentiment"], dtype=int)

# Build a fresh frame (text + indicator columns) instead of dropping in place
# on the column slice taken above.
df = pd.concat([df.drop(columns=['Sentiment']), one_hot], axis=1)

print(df.head())

def get_text_processing(text):
    """Strip punctuation and English stopwords from a piece of text.

    Every character found in ``string.punctuation`` is deleted, the remainder
    is split on whitespace, and words whose lowercase form is an NLTK English
    stopword are dropped. Surviving words keep their original casing and are
    re-joined with single spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; empty if nothing survives the filtering.
    """
    # Load the stopword list once and keep it on the function object: this
    # function is applied row by row, and re-reading the corpus plus scanning
    # a list for every word is wasted work. A frozenset gives O(1) lookups.
    try:
        stop_set = get_text_processing._stop_set
    except AttributeError:
        stop_set = get_text_processing._stop_set = frozenset(stopwords.words('english'))

    # Delete punctuation characters in a single pass.
    without_punct = text.translate(str.maketrans('', '', punctuation))

    return ' '.join(
        word for word in without_punct.split() if word.lower() not in stop_set
    )

# Clean every tweet (punctuation and stopwords removed).
df['clean_text'] = df['clean_text'].apply(get_text_processing)
df.head()  # preview; only rendered when it is the last line of a cell


# Features: the cleaned tweet strings. Targets: every remaining column of df.
X = df['clean_text'].values
y = df.drop(columns='clean_text').values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.30
)

# Fit the vocabulary on the training split only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
vocab_size = 1 + len(word_index)

# Turn each tweet into its list of integer word indices.
X_train, X_test = [
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
]

# Bring every sequence to exactly `maxlen` entries, padding at the front.
maxlen = 100
X_train_pad, X_test_pad = [
    pad_sequences(seqs, maxlen=maxlen, padding='pre') for seqs in (X_train, X_test)
]

# Simple-RNN multiclass tweet classifier: the targets are one-hot rows, so each
# tweet belongs to exactly one class and the output layer is a softmax.
num_classes = y_train.shape[1]  # one output unit per one-hot label column

model = tf.keras.Sequential([
    # input_dim is the vocabulary size (largest tokenizer index + 1), not the
    # number of training rows: every index in the padded sequences must have
    # its own embedding row.
    tf.keras.layers.Embedding(vocab_size, 64),
    tf.keras.layers.SimpleRNN(64),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

# categorical_crossentropy is the loss that matches softmax + one-hot targets;
# binary_crossentropy would score each output unit as an independent yes/no.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Accuracy should go up, so stop once it has failed to improve for `patience`
# epochs (mode='max'). With mode='min' training halts as soon as the metric
# improves twice in a row.
early_stop = EarlyStopping(monitor='accuracy', mode='max', verbose=1, patience=2)

model.fit(X_train_pad, y_train, epochs=10, batch_size=128, callbacks=[early_stop])

# Loss and accuracy on the held-out 30% split.
model.evaluate(X_test_pad, y_test)

# --- Predict sentiment for the unlabelled test tweets -----------------------
df = pd.read_csv('/content/drive/MyDrive/*path_to_dataset*/datasets/tweets_test.csv', keep_default_na=False)
# .copy() so the column assignment below writes to an independent frame.
df = df[['clean_text']].copy()

# Same cleaning as was applied to the training text.
df['clean_text'] = df['clean_text'].apply(get_text_processing)

X_pred = df['clean_text'].values

# Encode with the tokenizer that was fitted on the training split. Fitting a
# new tokenizer here would assign different integers to the same words, so the
# embedding rows looked up by the model would no longer match what it learned.
# Words that never occurred in the training text are skipped.
X_pred = tokenizer.texts_to_sequences(X_pred)

# Pad to the same length the model was trained on.
X_pred = pad_sequences(X_pred, padding='pre', maxlen=maxlen)

y_prob = model.predict(X_pred)

# Index of the most probable class per tweet; indices follow the column order
# of the one-hot training labels.
y_classes = y_prob.argmax(axis=-1)

y_classes

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
                                          clean_text  Negative  Neutral  \
0  TheSocialDilemma is an eye opener isn t it ple...         0        1   
1  TheSocialDilemma If we don t agree on what is ...         0        0   
2  Watching TheSocialDilemma scary to see social ...         1        0   
3  You check your social media before you pee in ...         0        0   
4  watch thesocialdilemma and see what s actually...         1        0   

   Positive  
0         0  
1         1  
2         0  
3         1  
4         0  
10386
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 3: early stopping


array([2, 2, 2, ..., 0, 1, 1])