In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

tf.random.set_seed(42)
np.random.seed(42)

#from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
twt_train = pd.read_csv('twitter_training.csv', header=None)
twt_train.columns = ["TweetID", "Topic", "Sentiment", "Message"]
twt_train["Message"] = twt_train["Message"].astype(str)

twt_valid = pd.read_csv('twitter_validation.csv', header=None)
twt_valid.columns = ["TweetID", "Topic", "Sentiment", "Message"]
twt_valid["Message"] = twt_valid["Message"].astype(str)

twt_train

Unnamed: 0,TweetID,Topic,Sentiment,Message
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [3]:
twt_train = twt_train[twt_train["Sentiment"] != "Irrelevant"]
twt_valid = twt_valid[twt_valid["Sentiment"] != "Irrelevant"]

replace_map = {'Negative' : 0,
               'Positive' : 1,
               'Neutral' : 2,}

twt_train.Sentiment = twt_train.Sentiment.map(replace_map)
twt_valid.Sentiment = twt_valid.Sentiment.map(replace_map)

twt_train.Sentiment.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  twt_train.Sentiment = twt_train.Sentiment.map(replace_map)


0    22542
1    20832
2    18318
Name: Sentiment, dtype: int64

In [4]:
X_train = twt_train['Message']
y_train = twt_train['Sentiment']
X_valid = twt_valid['Message']
y_valid = twt_valid['Sentiment']

In [5]:
def clean_text(text):
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    return text

X_train = X_train.apply(clean_text)
X_valid = X_valid.apply(clean_text)

X_train

0        im getting on borderlands and i will murder yo...
1        i am coming to the borders and i will kill you...
2        im getting on borderlands and i will kill you ...
3        im coming on borderlands and i will murder you...
4        im getting on borderlands   and i will murder ...
                               ...                        
74677    just realized that the windows partition of my...
74678    just realized that my mac window partition is ...
74679    just realized the windows partition of my mac ...
74680    just realized between the windows partition of...
74681    just like the windows partition of my mac is l...
Name: Message, Length: 61692, dtype: object

In [6]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jrkar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jrkar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\jrkar\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [7]:
def remove_stop_words(text):
    results = []
    stop_words = stopwords.words('english')
    text = text.split()
    text = [results.append(word) for word in text if word not in stop_words]            
    text = ' '.join(results)
    return text

X_train = X_train.apply(remove_stop_words)
X_train

0                            im getting borderlands murder
1                                      coming borders kill
2                              im getting borderlands kill
3                             im coming borderlands murder
4                            im getting borderlands murder
                               ...                        
74677    realized windows partition mac like years behi...
74678    realized mac window partition years behind nvi...
74679    realized windows partition mac years behind nv...
74680    realized windows partition mac like years behi...
74681    like windows partition mac like years behind d...
Name: Message, Length: 61692, dtype: object

In [8]:
def lematize(text):
    text = text.split()
    text = [WordNetLemmatizer().lemmatize(w) for w in text]
    text = ' '.join(text)
    return text

X_train = X_train.apply(lematize)
X_train

0                             im getting borderland murder
1                                       coming border kill
2                               im getting borderland kill
3                              im coming borderland murder
4                             im getting borderland murder
                               ...                        
74677    realized window partition mac like year behind...
74678    realized mac window partition year behind nvid...
74679    realized window partition mac year behind nvid...
74680    realized window partition mac like year behind...
74681    like window partition mac like year behind dri...
Name: Message, Length: 61692, dtype: object

In [9]:
vocab_size = 10000

X_train = [one_hot(d, vocab_size, split=' ') for d in X_train]
X_valid = [one_hot(d, vocab_size, split=' ') for d in X_valid]

X_train
max(len(x) for x in X_train)

163

In [10]:
max_length = 170
X_train = pad_sequences(X_train, maxlen=max_length, padding='post')
X_valid = pad_sequences(X_valid, maxlen=max_length, padding='post')

X_train

array([[1344,  260, 8667, ...,    0,    0,    0],
       [8246, 4175, 4131, ...,    0,    0,    0],
       [1344,  260, 8667, ...,    0,    0,    0],
       ...,
       [3986, 3995, 7648, ...,    0,    0,    0],
       [3986, 3995, 7648, ...,    0,    0,    0],
       [8998, 3995, 7648, ...,    0,    0,    0]])

In [11]:
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size, 8, input_length=max_length),
    
    keras.layers.Conv1D(128, 3),
    #keras.layers.MaxPooling1D(),
    keras.layers.LeakyReLU(),
    #keras.layers.BatchNormalization(),
    
    keras.layers.Conv1D(256, 3),
    keras.layers.MaxPooling1D(),
    keras.layers.LeakyReLU(),
    keras.layers.BatchNormalization(),
    
    keras.layers.Conv1D(256, 3),
    keras.layers.LeakyReLU(),
    keras.layers.MaxPooling1D(),
    keras.layers.BatchNormalization(),
    
    keras.layers.Conv1D(128, 3),
    keras.layers.LeakyReLU(),
    keras.layers.MaxPooling1D(),
    keras.layers.BatchNormalization(),
    
    keras.layers.Conv1D(64, 3),
    keras.layers.LeakyReLU(),
    keras.layers.MaxPooling1D(),
    keras.layers.BatchNormalization(),
    
    keras.layers.Flatten(),
    keras.layers.Dense(64),
    keras.layers.LeakyReLU(),
    
    keras.layers.Dense(32),
    keras.layers.LeakyReLU(),
    
    keras.layers.Dense(3),
    keras.layers.
])

In [12]:
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=keras.optimizers.SGD(lr=0.001, momentum=0.9),
              metrics=["accuracy"])

  super().__init__(name, **kwargs)


In [13]:
history = model.fit(X_train, y_train, epochs=10,
                    validation_data=(X_valid, y_valid))

Epoch 1/10
  27/1928 [..............................] - ETA: 8:41 - loss: 2.1435 - accuracy: 0.3079

KeyboardInterrupt: 