# NLP supervised

## Importing the libraries

In [28]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [29]:
dataset = pd.read_csv("text_emotion.csv")
print(dataset)

         tweet_id   sentiment         author  \
0      1956967341       empty     xoshayzers   
1      1956967666     sadness      wannamama   
2      1956967696     sadness      coolfunky   
3      1956967789  enthusiasm    czareaquino   
4      1956968416     neutral      xkilljoyx   
...           ...         ...            ...   
39995  1753918954     neutral  showMe_Heaven   
39996  1753919001        love       drapeaux   
39997  1753919005        love       JenniRox   
39998  1753919043   happiness       ipdaman1   
39999  1753919049        love    Alpharalpha   

                                                 content  
0      @tiffanylue i know  i was listenin to bad habi...  
1      Layin n bed with a headache  ughhhh...waitin o...  
2                    Funeral ceremony...gloomy friday...  
3                   wants to hang out with friends SOON!  
4      @dannycastillo We want to trade with someone w...  
...                                                  ...  
39995     

## Cleaning the text

In [30]:
import re
import nltk 
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer 
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\razva\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\razva\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [31]:
print(len(dataset))

40000


In [32]:
corpus = []

for i in range(40000):
  review = re.sub('@[a-zA-z0-9]*',' ',dataset['content'][i])
  review = re.sub('https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,}',' ',review)
  if i<5:
    print(review)
  review = re.sub('[^a-zA-Z]', ' ', review)
  review = review.lower()
  review = review.split()

  ps = PorterStemmer()
  lemmatizer = WordNetLemmatizer()
  my_stopwords = stopwords.words('english')
  my_stopwords.remove('not')
  my_stopwords.remove('down')
#   review = [ps.stem(word) for word in review if not word in set(my_stopwords)]
  review = [lemmatizer.lemmatize(word) for word in review if not word in set(my_stopwords)]
  review = ' '.join(review)
  corpus.append(review)

  i know  i was listenin to bad habit earlier and i started freakin at his part =[
Layin n bed with a headache  ughhhh...waitin on your call...
Funeral ceremony...gloomy friday...
wants to hang out with friends SOON!
  We want to trade with someone who has Houston tickets, but no one will.


In [33]:
print(corpus[:20])

['know listenin bad habit earlier started freakin part', 'layin n bed headache ughhhh waitin call', 'funeral ceremony gloomy friday', 'want hang friend soon', 'want trade someone houston ticket one', 'pinging go prom bc bf like friend', 'sleep im not thinking old friend want married damn amp want scandalous', 'hmmm down', 'charlene love miss', 'sorry least friday', 'cant fall asleep', 'choked retainer', 'ugh beat stupid song get next rude', 'u watch hill london u realise tourture week week late watch itonlinelol', 'got news', 'storm electricity gone', 'agreed', 'sleepy not even late fail', 'lady gaga tweeted not impressed video leaking know', 'convinced always wanted signal give damn think lost another friend']


## Building a bag of words

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=26300)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values

MemoryError: Unable to allocate 7.87 GiB for an array with shape (40000, 26401) and data type int64

In [8]:
print(len(X[0]))

26401


## Some more data preprocessing

In [9]:
print(y[:10])

['empty' 'sadness' 'sadness' 'enthusiasm' 'neutral' 'worry' 'sadness'
 'worry' 'sadness' 'sadness']


## Training the neuronal network

In [10]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder

le  = LabelEncoder()
lb = LabelBinarizer() 

lb.fit(y) 
lb.classes_
y_new = lb.transform(y) 
# print(y_new)

y=y_new



In [40]:
lb.classes_

array(['anger', 'boredom', 'empty', 'enthusiasm', 'fun', 'happiness',
       'hate', 'love', 'neutral', 'relief', 'sadness', 'surprise',
       'worry'], dtype='<U10')

In [11]:
print(y)

[[0 0 1 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 1 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## Splitting the dataset into the Training and Test set

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [13]:
print(len(X_test))
print(len(y_test))

print(y_train[:10])
print(y_test[:10])

8000
8000
[[0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 1 0 0 0 0]]
[[0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0]]


## Building an ANN

In [14]:
import tensorflow as tf

### Initializing 

In [15]:
ann = tf.keras.models.Sequential()

### Adding the first hidden layer

In [16]:
ann.add(tf.keras.layers.Dense(units=120, activation='relu'))

### Adding the second hidden layer

In [17]:
ann.add(tf.keras.layers.Dense(units=120, activation='relu'))

### Adding the output layer

In [18]:
ann.add(tf.keras.layers.Dense(units=13, activation='softmax'))

## Training the ANN

### Compiling the ANN

In [19]:
ann.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

## Making predictions and evaluating the model

In [20]:
ann.fit(X_train, y_train, batch_size = 32, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1d6854d7940>

### Predicting the test results

In [37]:
y_pred = ann.predict(X_test)

print(len(y_pred))

y_pred_labels = []

# print(y_pred[:5])

for i in range(len(y_pred)): 

    # print(sum(y_pred[i]))

    maxim=y_pred[i][0]
    label=0
    for j in range(1,len(y_pred[i])):
        if maxim>y_pred[i][j]:
            maxixm = y_pred[i][j]
            label = j

    y_pred_labels.append(label)

print(y_pred_labels[:30])


8000
[[8.3911594e-08 5.8598982e-07 1.7389566e-04 1.3943470e-05 3.4837256e-05
  3.8228061e-02 1.1456566e-03 6.6894549e-04 4.2339080e-04 1.1664007e-06
  9.5621693e-01 1.3693666e-05 3.0786211e-03]
 [3.7932230e-10 2.5709511e-07 1.1151717e-04 4.2350866e-06 2.1302032e-08
  4.9266532e-07 2.1369598e-07 3.2588228e-05 2.4988689e-02 3.4395481e-07
  8.0305271e-02 3.9249012e-04 8.9416385e-01]
 [2.0480632e-06 1.2164754e-03 2.3979066e-02 3.1938504e-05 1.3862892e-05
  5.8717886e-03 1.6684446e-04 4.8943724e-07 1.2245096e-03 5.9798581e-04
  9.6264267e-01 1.6019627e-05 4.2362823e-03]
 [5.4626807e-12 1.2554825e-06 7.0249024e-09 4.5674287e-08 7.8438062e-11
  6.9484471e-14 3.6961666e-07 2.3469318e-07 1.9228200e-05 8.4613989e-11
  9.9981612e-01 1.4737009e-07 1.6270504e-04]
 [2.2297764e-09 1.3974023e-11 3.9725869e-06 2.9005061e-04 3.2505156e-03
  9.4653291e-01 2.6386661e-07 1.7267054e-04 4.3215973e-06 1.8070044e-07
  9.2904379e-08 4.9731404e-02 1.3672950e-05]]
[0, 0, 7, 5, 1, 1, 1, 1, 0, 0, 0, 10, 9, 1, 7, 0,

### Checking the Confusion Matrix

In [38]:
relabeled_y_test= []

for i in range(len(y_test)): 

    maxim=-1 
    label=-1
    

    for j in range(len(y_test[i])):
        if y_test[i][j] == 1: 
            relabeled_y_test.append(j)
            break 

print(relabeled_y_test[:10])

[10, 8, 10, 2, 4, 5, 5, 8, 9, 5]


In [39]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_pred_labels, relabeled_y_test)
print(cm)
accuracy_score(y_pred_labels, relabeled_y_test)

[[  3   9  47  44  73 270  53 204 588  96 308 103 462]
 [  7   6  24  39 141 435  21 343 436  81 166 130 336]
 [  0   2   1   2   4  11   5  16  17   4  18   5  26]
 [  0   0   4   2   8  23   6  15  38  10  21   7  42]
 [  0   1   7   3   0   7   9   6  32   3  44   7  57]
 [  0   7  10   4   4   5  14  11  36   6  59  17  78]
 [  1   1   7  10  27  88   3  58  92  25  45  20  79]
 [  3   3  10  11  13  41  27  16 160  22 129  37 177]
 [  1   1   0   0   4   7   7   8  19   2  20   2  31]
 [  1   4  11   9  19  25  43  30 108  11 131  34 169]
 [  3   1  12   9  38  95  13  37 121  15  32  35  96]
 [  0   1   7   8  11  23  39  10  52  12  69  18 104]
 [  0   1   7   6  12  22  11  26  44   5  20   7  37]]


0.019125