In [1]:
# Upgrade gensim just in case.
!pip install -U gensim==4.*
#gensim used for word vectors



In [3]:
import collections
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
import tensorflow as tf

from gensim.models.keyedvectors import KeyedVectors
from sklearn.decomposition import PCA  #this used for reducing dimension and visualization but i didn't use it so.
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers

In [4]:
!gdown "https://drive.google.com/uc?id=1BpfbHu4denceXiv8yfdY3EHgjKIcULku"
#google news vectors

Downloading...
From (original): https://drive.google.com/uc?id=1BpfbHu4denceXiv8yfdY3EHgjKIcULku
From (redirected): https://drive.google.com/uc?id=1BpfbHu4denceXiv8yfdY3EHgjKIcULku&confirm=t&uuid=2fda02ec-790a-43ba-bc3d-b2bc59abb822
To: /content/GoogleNews-vectors-negative300.bin.gz
100% 1.65G/1.65G [00:13<00:00, 126MB/s] 


In [5]:
embedding_file = './GoogleNews-vectors-negative300.bin.gz'

In [6]:
%%time
word_vectors = KeyedVectors.load_word2vec_format(embedding_file, binary=True, limit=1000000)

CPU times: user 15.9 s, sys: 1.03 s, total: 17 s
Wall time: 17.1 s


In [7]:
!wget -P /root/input/ -c "https://s3.amazonaws.com/fast-ai-nlp/yelp_review_polarity_csv.tgz"

#yelp polarity dataset

--2024-08-07 17:06:27--  https://s3.amazonaws.com/fast-ai-nlp/yelp_review_polarity_csv.tgz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 3.5.13.235, 52.217.130.128, 3.5.8.66, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|3.5.13.235|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



In [8]:
!tar xvzf /root/input/yelp_review_polarity_csv.tgz

# Show current working directory.
!pwd


yelp_review_polarity_csv/
yelp_review_polarity_csv/train.csv
yelp_review_polarity_csv/readme.txt
yelp_review_polarity_csv/test.csv
/content


In [9]:
yelp_train = pd.read_csv('yelp_review_polarity_csv/train.csv', names=['sentiment', 'review'])
print(yelp_train.shape)

(560000, 2)


In [9]:
yelp_train.head()

Unnamed: 0,sentiment,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


In [10]:
TRAIN_SIZE = 200000
yelp_train = yelp_train.sample(frac=1, random_state=1)[:TRAIN_SIZE].copy()
print(yelp_train.shape)

(200000, 2)


In [11]:
yelp_train['sentiment'].replace(to_replace=1, value=0, inplace=True)
yelp_train['sentiment'].replace(to_replace=2, value=1, inplace=True)

#sigmoid function used as the output layer of the neural network which gives values in the range 0 to 1. so do either this or hot encode the labels

In [12]:
yelp_train.head()

Unnamed: 0,sentiment,review
39658,1,"omg, the grandma g's pizza with rosemary chick..."
330033,0,Microwaved food over over over seasoned gyro m...
238898,0,"While the food is good, the service leaves a l..."
453536,0,Called the Chandler PD about the sign spinner ...
555973,1,"Just took Ewok, my approx. 6 month old shih tz..."


In [13]:
yelp_train_split, yelp_val_split = train_test_split(yelp_train, train_size=0.85, random_state=1)

In [14]:
# Set up training data.
train_reviews = yelp_train_split['review']
y_train = np.array(yelp_train_split['sentiment'])

# Set up validation data.
val_reviews = yelp_val_split['review']
y_val = np.array(yelp_val_split['sentiment'])

In [15]:
tokenizer = keras.preprocessing.text.Tokenizer(num_words=20000,
                                               filters='0123456789!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
                                               lower=True)

In [16]:
%%time
tokenizer.fit_on_texts(train_reviews)

CPU times: user 17 s, sys: 81.9 ms, total: 17.1 s
Wall time: 17.3 s


In [17]:
%%time
X_train = tokenizer.texts_to_sequences(train_reviews)

CPU times: user 12.2 s, sys: 45.4 ms, total: 12.2 s
Wall time: 13.2 s


In [19]:
# The first review in the training set, vectorized.
print(X_train[0])

[20, 385, 10, 9, 1, 105, 46, 2, 20, 40, 1652, 71, 2467, 12129, 3518, 2906, 145, 459, 25, 1, 134, 2, 20, 75, 816, 1221, 21, 12130, 165, 20, 212, 761, 52, 20, 2284, 18, 71, 446, 908, 2, 1470, 5, 816, 1221, 25, 201, 20, 141, 260, 1715, 3519, 74, 4, 907, 487, 5, 907, 6869, 18, 13, 2166, 725, 3927, 14720, 20, 40, 1243, 11, 4232, 1, 480, 411, 3, 42, 10, 14, 2467, 3, 1462, 5, 153, 1, 5967, 4368, 20, 23, 5, 82, 47, 20, 105, 2133, 4, 134, 154, 3201, 117, 2, 60, 1221, 19, 23, 3103, 260, 335, 7, 1547, 4243, 1221, 2, 20, 202, 152, 5, 816, 14, 567, 288, 648, 33, 4, 212, 65, 12, 1, 19182, 3518, 212, 20, 99, 151, 1417, 39, 20, 142, 1483, 58, 640, 1, 520, 5, 1, 11245, 5284, 2, 1158, 2361, 48, 3, 84, 263, 107, 9, 67, 889, 27, 3, 75, 42, 6190, 3, 114, 60, 237, 853, 60, 4, 3103, 9980, 61, 95, 173, 2703, 269, 9, 1652, 3356, 20, 363, 30, 659, 9, 4, 259, 845, 16, 394, 9, 3103, 354, 3527, 2, 14, 2572, 2726, 1462, 5, 816, 351, 7, 1, 1535, 239, 34, 174, 97, 15, 1462, 5, 816, 4, 112, 2, 253, 684, 313, 70, 1, 39

In [18]:
MAX_REVIEW_LEN = 200
X_train = keras.preprocessing.sequence.pad_sequences(X_train, maxlen=MAX_REVIEW_LEN)

In [21]:
print(X_train[0])
print(X_train[1])

[12986  1849    27    14  2572  6109    10     2    33     2     1    81
   342   121     5    17     8   370   300    70    74   195     8   910
   681  4280   491     2  3308    10     1  3103  7434   633   502    27
     3  1306    58     5    16  4893    18   189   511    10     4 19183
     2   313   224   494    14  6190     2     8   370   300    70    37
   816    61    97     5  3274    18    33     2   313    33    13     1
   725  1792     5   816     2    13     3    24     5   265    10    72
     3   313   224   240    22   975     2     8   910   681     2   300
    44   342  1834    21   732    61    97  1533    33    13    32    26
  7090   211    94    50   331   261    62    20   114    16   116     2
   226    28    20  4252    18    33    61   607    14     2   681   363
   162    71    54    17    40    27    20   114     8  1792  9333     5
   816  1283    63    22     1     3    62   346    56    63     1   485
    16  5005 16405   193     5  3274    18    33   

In [19]:
X_val = tokenizer.texts_to_sequences(val_reviews)
X_val = keras.preprocessing.sequence.pad_sequences(X_val, maxlen=MAX_REVIEW_LEN)

In [20]:
yelp_test = pd.read_csv('yelp_review_polarity_csv/test.csv', names=['sentiment', 'review'])
yelp_test['sentiment'].replace(to_replace=1, value=0, inplace=True)
yelp_test['sentiment'].replace(to_replace=2, value=1, inplace=True)
yelp_test.head()
y_test = np.array(yelp_test['sentiment'])
X_test = tokenizer.texts_to_sequences(yelp_test['review'])
X_test = keras.preprocessing.sequence.pad_sequences(X_test, maxlen=MAX_REVIEW_LEN)


In [21]:
# + 1 to account for padding token.
num_tokens = len(tokenizer.word_index) + 1

# Initialize a matrix of zeroes of size: vocabulary x embedding dimension.
embedding_dim = 300
embedding_matrix = np.zeros((num_tokens, embedding_dim))

for word, i in tokenizer.word_index.items():
  if word_vectors.has_index_for(word):
    embedding_matrix[i] = word_vectors[word].copy()

#this function is for creation of the embedding matrix

In [22]:
# Instantiate the embedding layer.
embedding_layer = layers.Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    input_length=MAX_REVIEW_LEN,
    trainable=True
)


model = keras.Sequential()

# Add layers.
model = keras.Sequential()
model.add(embedding_layer)
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(1, activation='sigmoid'))


# Compile model.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])


# Call fit.
es_callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
history =  model.fit(X_train, y_train, epochs=20, batch_size=512, validation_data=(X_val, y_val))


# Evaluate the model.
model.evaluate(X_test, y_test)




Epoch 1/20
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 20ms/step - accuracy: 0.7608 - loss: 0.4667 - val_accuracy: 0.9048 - val_loss: 0.2339
Epoch 2/20
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.9066 - loss: 0.2280 - val_accuracy: 0.9258 - val_loss: 0.1905
Epoch 3/20
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.9172 - loss: 0.2063 - val_accuracy: 0.8608 - val_loss: 0.3449
Epoch 4/20
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - accuracy: 0.9239 - loss: 0.1926 - val_accuracy: 0.9192 - val_loss: 0.2029
Epoch 5/20
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.9307 - loss: 0.1768 - val_accuracy: 0.8396 - val_loss: 0.3845
Epoch 6/20
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.9262 - loss: 0.1851 - val_accuracy: 0.8706 - val_loss: 0.3022
Epoch 7/20
[1m333/33

[0.3647443354129791, 0.9059473872184753]

In [None]:
#90% test accuracy. decentish

In [23]:
def sentiment(reviews):
  seqs = tokenizer.texts_to_sequences(reviews)
  seqs = keras.preprocessing.sequence.pad_sequences(seqs, maxlen=MAX_REVIEW_LEN)
  if model.predict(seqs) > 0.5:
    return 'Positive Review'
  else:
    return 'Negative Review'

#sigmoid results are in the range 0 to 1.

In [24]:
#taking some random review from google...

In [25]:
rev1= 'It’s the worst school I have ever seen only taking money idk why it has so much reputation. Every teacher is rude with students they are literally so cruel  no one should join this school. No teacher is supporting with kids no study at all. In PT classes they don’t allow us to play. The reality is that it’s the worst school students are very bad even class rooms are. Broken. English teacher omg thy are so bad rude don’t want to specify names but it’s just eww school teachers have their personal issues with specific students they are biased with specific students. Teachers are literally so slow and speak so slowly that kids are not even able to hear them in class. Even kids can’t go to washroom On the floor their class is they have to go upstairs, principle literally just sits in her office, u can only see her on occasions, teachers discourage'
print(sentiment([rev1]))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 272ms/step
Negative Review


In [27]:
rev2= 'the school treats the children pretty badly the fans work when the children come out the look like they just took a bath , some kids are pretty bad kids here , there are only two medical rooms in the school and both are filled , the washrooms are pretty dirty they do not clean it.'
print(sentiment([rev2]))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Negative Review


In [28]:
rev3= 'Nice school with very good teachers and nice envoirment. Nice infrastructure. Proper medical attention. Security camera and gaurds are there. On of the best schools of delhi'
print(sentiment([rev3]))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Positive Review


In [30]:
#taking some small reviews...
rev4= 'My experience is best and it is a beautiful school whith great sports and great teacher and technology. Thank you'
rev5= 'Just an average school nothing special here. Uniform is literally awfull. Even when color of shoe is changed they will treat like we did very wrong. There are also many bad students here.'
print(sentiment([rev4]))
print(sentiment([rev5]))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Positive Review
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Negative Review
