In [13]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
from sklearn.preprocessing import LabelEncoder

In [14]:
# Read the sentiment CSV and keep only the necessary columns
# (tweet text and its sentiment label).
data = pd.read_csv(
    '/Users/Administrator/PycharmProjects/pythonProject2/ML5/code/Sentiment.csv'
)[['text', 'sentiment']]
type(data)

pandas.core.frame.DataFrame

In [15]:
# Normalise the tweet text: lower-case it, then drop every character that is
# not a lower-case letter, a digit or whitespace.
# The previous character class used the range `A-z`, which also spans the
# ASCII punctuation between 'Z' and 'a' ([ \ ] ^ _ `), so those characters
# slipped through. The pattern is now a raw string so `\s` is not treated as
# a (deprecated) string escape.
data['text'] = (
    data['text']
    .str.lower()
    .str.replace(r'[^a-z0-9\s]', '', regex=True)
)

In [16]:
# Remove the retweet marker "rt" from the tweets.
# The earlier `iterrows()` loop assigned into the per-row object it was handed;
# pandas documents that such rows may be copies, so the edit was not reliably
# written back to `data`. It also relied on positional `row[0]` indexing.
# A vectorised replace on the column fixes both. The `\b` word boundaries keep
# "rt" inside ordinary words (e.g. "party", "support") intact.
data['text'] = data['text'].str.replace(r'\brt\b', ' ', regex=True)

In [17]:
# Vocabulary size handed to the tokenizer (and later to the Embedding layer).
max_fatures = 2000

# Fit a whitespace-splitting tokenizer on the cleaned tweets, encode every
# tweet as a sequence of word indices and pad the sequences into one matrix.
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = pad_sequences(tokenizer.texts_to_sequences(data['text'].values))

In [18]:
# Show the dimensions of the padded sequence matrix built in the previous cell.
X.shape

(13871, 28)

In [19]:
def createmodel(vocab_size=None, input_length=None, num_classes=3,
                embed_dim=128, lstm_out=196):
  """Build and compile the Embedding -> LSTM -> softmax classifier.

  Called with no arguments it behaves exactly as before: the vocabulary size
  comes from the global `max_fatures`, the sequence length from the global
  `X`, and the output layer has three units.

  Args:
    vocab_size: first argument of the Embedding layer; defaults to the
      global `max_fatures`.
    input_length: length of the padded input sequences; defaults to
      `X.shape[1]` of the global `X` at call time.
    num_classes: number of units in the softmax output layer.
    embed_dim: Embedding output dimension.
    lstm_out: number of LSTM units.

  Returns:
    A compiled `Sequential` model (categorical cross-entropy loss, Adam
    optimizer, accuracy metric).
  """
  # Resolve the defaults lazily so the zero-argument call keeps reading the
  # notebook globals at call time, as the original did.
  if vocab_size is None:
    vocab_size = max_fatures
  if input_length is None:
    input_length = X.shape[1]

  model = Sequential()
  model.add(Embedding(vocab_size, embed_dim, input_length=input_length))
  model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
  model.add(Dense(num_classes, activation='softmax'))
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

In [20]:
# Turn the sentiment strings into integer class ids, then one-hot encode them
# for the softmax output layer.
labelencoder = LabelEncoder().fit(data['sentiment'])
integer_encoded = labelencoder.transform(data['sentiment'])
y = to_categorical(integer_encoded)

# Hold out 33% of the rows for evaluation; the seed is fixed so the split is
# repeatable.
(X_train, X_test,
 Y_train, Y_test) = train_test_split(X, y, test_size=0.33, random_state=42)

In [21]:
# Show the raw labels next to their integer encoding to check the mapping.
print(data['sentiment'], integer_encoded, sep='\n')

0         Neutral
1        Positive
2         Neutral
3        Positive
4        Positive
           ...   
13866    Negative
13867    Positive
13868    Positive
13869    Negative
13870    Positive
Name: sentiment, Length: 13871, dtype: object
[1 2 1 ... 2 0 2]


In [22]:
# Train the sentiment classifier for 20 epochs on the training split.
batch_size = 32
model = createmodel()
model.fit(X_train, Y_train, epochs=20, batch_size=batch_size, verbose=2)

# Evaluate on the held-out split and report loss, accuracy and metric names.
score, acc = model.evaluate(X_test, Y_test, verbose=2, batch_size=batch_size)
print(score, acc, model.metrics_names, sep='\n')

Epoch 1/20
291/291 - 38s - loss: 0.8258 - accuracy: 0.6419
Epoch 2/20
291/291 - 34s - loss: 0.6822 - accuracy: 0.7087
Epoch 3/20
291/291 - 32s - loss: 0.6077 - accuracy: 0.7451
Epoch 4/20
291/291 - 31s - loss: 0.5610 - accuracy: 0.7648
Epoch 5/20
291/291 - 32s - loss: 0.5133 - accuracy: 0.7892
Epoch 6/20
291/291 - 32s - loss: 0.4730 - accuracy: 0.8034
Epoch 7/20
291/291 - 32s - loss: 0.4344 - accuracy: 0.8261
Epoch 8/20
291/291 - 32s - loss: 0.3984 - accuracy: 0.8407
Epoch 9/20
291/291 - 32s - loss: 0.3641 - accuracy: 0.8549
Epoch 10/20
291/291 - 32s - loss: 0.3405 - accuracy: 0.8604
Epoch 11/20
291/291 - 33s - loss: 0.3166 - accuracy: 0.8727
Epoch 12/20
291/291 - 32s - loss: 0.3053 - accuracy: 0.8768
Epoch 13/20
291/291 - 32s - loss: 0.2833 - accuracy: 0.8880
Epoch 14/20
291/291 - 32s - loss: 0.2713 - accuracy: 0.8914
Epoch 15/20
291/291 - 33s - loss: 0.2656 - accuracy: 0.8917
Epoch 16/20
291/291 - 32s - loss: 0.2461 - accuracy: 0.8981
Epoch 17/20
291/291 - 32s - loss: 0.2372 - accura

In [23]:
# Save the trained model to "model.h5". The path is relative, so the file is
# written to the notebook's current working directory.
model.save("model.h5")

In [25]:
# Reload the model that was saved as "model.h5".
# The save cell writes to a relative path, so load from the same relative
# path. The previous hard-coded absolute path only pointed at the same file
# when the working directory happened to be that folder.
from keras.models import load_model
model_save = load_model("model.h5")

In [26]:
# Predict the sentiment of a single sentence with the reloaded model.
import numpy as np

sentence = ['A lot of good things are happening. We are respected again throughout the world, and that is a great thing']
# Vectorise the sentence with the tokenizer fitted on the training tweets.
sentence = tokenizer.texts_to_sequences(sentence)
# Pad to the sequence length the loaded model was built with, instead of a
# hard-coded 28 that silently breaks if the training data changes.
sentence = pad_sequences(sentence, maxlen=model_save.input_shape[1], dtype='int32', value=0)

# `Sequential.predict_classes` has been removed from recent Keras releases;
# taking the argmax over the softmax output yields the same class index.
sentiment = np.argmax(model_save.predict(sentence, batch_size=1), axis=-1)[0]

# Class ids follow the label encoding shown earlier in the notebook:
# 0 = Negative, 1 = Neutral, 2 = Positive.
if sentiment == 0:
  print("negative")
elif sentiment == 1:
  print("neutral")
else:
  print("positive")



positve


In [27]:
# 2. GRID SEARCH
# Search over batch size and epoch count for the LSTM classifier.

from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

# Wrap the model factory in the scikit-learn estimator interface.
model = KerasClassifier(build_fn=createmodel, verbose=2)

# Candidate values; every combination is tried.
batch_size = [40, 50, 60]
epochs = [1, 2, 3]
param_grid = {'batch_size': batch_size, 'epochs': epochs}

grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid_result = grid.fit(X_train, y=Y_train)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

186/186 - 27s - loss: 0.8401 - accuracy: 0.6337
47/47 - 2s - loss: 0.7581 - accuracy: 0.6579
186/186 - 27s - loss: 0.8484 - accuracy: 0.6324
47/47 - 1s - loss: 0.7906 - accuracy: 0.6686
186/186 - 28s - loss: 0.8433 - accuracy: 0.6347
47/47 - 1s - loss: 0.7749 - accuracy: 0.6708
186/186 - 28s - loss: 0.8425 - accuracy: 0.6367
47/47 - 1s - loss: 0.7655 - accuracy: 0.6706
186/186 - 28s - loss: 0.8533 - accuracy: 0.6330
47/47 - 1s - loss: 0.7760 - accuracy: 0.6674
Epoch 1/2
186/186 - 28s - loss: 0.8499 - accuracy: 0.6338
Epoch 2/2
186/186 - 24s - loss: 0.6994 - accuracy: 0.7031
47/47 - 1s - loss: 0.7240 - accuracy: 0.6912
Epoch 1/2
186/186 - 27s - loss: 0.8434 - accuracy: 0.6381
Epoch 2/2
186/186 - 24s - loss: 0.6961 - accuracy: 0.7016
47/47 - 1s - loss: 0.7430 - accuracy: 0.6703
Epoch 1/2
186/186 - 27s - loss: 0.8498 - accuracy: 0.6326
Epoch 2/2
186/186 - 25s - loss: 0.6891 - accuracy: 0.7073
47/47 - 1s - loss: 0.7448 - accuracy: 0.6896
Epoch 1/2
186/186 - 27s - loss: 0.8598 - accuracy: 0

In [28]:
# 3. Text Classification On spam.csv

# Read the spam CSV (ISO-8859-1 encoded) and keep only the necessary columns:
# 'v1' and 'v2'.
data1 = pd.read_csv(
    '/Users/Administrator/PycharmProjects/pythonProject2/ML5/code/spam.csv',
    encoding="ISO-8859-1",
)[['v1', 'v2']]

In [30]:
# Normalise the message text: lower-case it, then drop every character that is
# not a lower-case letter, a digit or whitespace.
# The previous character class used the range `A-z`, which also spans the
# ASCII punctuation between 'Z' and 'a' ([ \ ] ^ _ `), so those characters
# slipped through. The pattern is now a raw string so `\s` is not treated as
# a (deprecated) string escape.
data1['v2'] = (
    data1['v2']
    .str.lower()
    .str.replace(r'[^a-z0-9\s]', '', regex=True)
)

In [31]:
# Remove the standalone token "rt" from the message text.
# The earlier loop had two problems: `row[0]` addressed the first column
# ('v1', the label) rather than the text in 'v2', and it assigned into the row
# object yielded by `iterrows()`, which pandas documents may be a copy, so
# nothing was reliably written back to `data1`.
# NOTE(review): "rt" is a Twitter retweet marker; this step looks carried over
# from the tweet pipeline. Confirm it is wanted for this dataset at all.
data1['v2'] = data1['v2'].str.replace(r'\brt\b', ' ', regex=True)


In [32]:

# Vocabulary size handed to the tokenizer (and later to the Embedding layer).
max_fatures = 2000

# Fit a fresh whitespace-splitting tokenizer on the cleaned messages, encode
# every message as a sequence of word indices and pad them into one matrix.
# Note: this rebinds `tokenizer` and `X` from the sentiment section.
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data1['v2'].values)
X = pad_sequences(tokenizer.texts_to_sequences(data1['v2'].values))

In [33]:
# Turn the 'v1' labels into integer class ids, then one-hot encode them for
# the softmax output layer.
labelencoder = LabelEncoder().fit(data1['v1'])
integer_encoded = labelencoder.transform(data1['v1'])
y = to_categorical(integer_encoded)

# Hold out 33% of the rows for evaluation; the seed is fixed so the split is
# repeatable.
(X_train, X_test,
 Y_train, Y_test) = train_test_split(X, y, test_size=0.33, random_state=42)

In [34]:
def createmodel1(vocab_size=None, input_length=None, num_classes=2,
                 embed_dim=128, lstm_out=196):
  """Build and compile the Embedding -> LSTM -> softmax classifier.

  Called with no arguments it behaves exactly as before: the vocabulary size
  comes from the global `max_fatures`, the sequence length from the global
  `X`, and the output layer has two units.

  Args:
    vocab_size: first argument of the Embedding layer; defaults to the
      global `max_fatures`.
    input_length: length of the padded input sequences; defaults to
      `X.shape[1]` of the global `X` at call time.
    num_classes: number of units in the softmax output layer.
    embed_dim: Embedding output dimension.
    lstm_out: number of LSTM units.

  Returns:
    A compiled `Sequential` model (categorical cross-entropy loss, Adam
    optimizer, accuracy metric).
  """
  # Resolve the defaults lazily so the zero-argument call keeps reading the
  # notebook globals at call time, as the original did.
  if vocab_size is None:
    vocab_size = max_fatures
  if input_length is None:
    input_length = X.shape[1]

  model = Sequential()
  model.add(Embedding(vocab_size, embed_dim, input_length=input_length))
  model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
  model.add(Dense(num_classes, activation='softmax'))
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

In [35]:
# Train the spam classifier for 5 epochs on the training split.
batch_size = 32
model1 = createmodel1()
model1.fit(X_train, Y_train, epochs=5, batch_size=batch_size, verbose=2)

# Evaluate on the held-out split and report loss, accuracy and metric names.
score, acc = model1.evaluate(X_test, Y_test, verbose=2, batch_size=batch_size)
print(score, acc, model1.metrics_names, sep='\n')

Epoch 1/5
117/117 - 81s - loss: 0.1712 - accuracy: 0.9424
Epoch 2/5
117/117 - 82s - loss: 0.0380 - accuracy: 0.9882
Epoch 3/5
117/117 - 122s - loss: 0.0191 - accuracy: 0.9946
Epoch 4/5
117/117 - 125s - loss: 0.0109 - accuracy: 0.9973
Epoch 5/5
117/117 - 143s - loss: 0.0048 - accuracy: 0.9984
58/58 - 9s - loss: 0.1027 - accuracy: 0.9837
0.10268498212099075
0.9836868047714233
['loss', 'accuracy']
