https://towardsdatascience.com/basics-of-countvectorizer-e26677900f9c


Sentiment analysis of movie reviews in Tanglish text (tokenizer)

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping




data = pd.read_csv('/content/Tamil_sentiments.csv', encoding='utf-8')


num_categories = 5


X_train, X_test, y_train, y_test = train_test_split(data['text'], data['category'], test_size=0.2, random_state=42)


tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()


y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)


max_len = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)



y_train_cat = to_categorical(y_train, num_classes=num_categories)
y_test_cat = to_categorical(y_test, num_classes=num_categories)


model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=32, input_length=max_len))
model.add(LSTM(units=64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(units=num_categories, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])






In [None]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
model.fit(X_train_pad, y_train_cat, epochs=10, batch_size=32, validation_data=(X_test_pad, y_test_cat), callbacks=[early_stopping])


model.save('tanglish_sentiment_lstm.h5')


loss, accuracy = model.evaluate(X_test_pad, y_test_cat, verbose=0)
print('Accuracy:', accuracy)

Epoch 1/10
Epoch 2/10

In [None]:
from keras.models import load_model


model = load_model('tanglish_sentiment_lstm.h5')


test_text = ['Enna kathai la irunthathu theriyala  ']
test_seq = tokenizer.texts_to_sequences(test_text)
test_pad = pad_sequences(test_seq, maxlen=max_len)
pred = model.predict(test_pad)


for p in pred:
    print(np.argmax(p))




1


In [None]:

predicted_category = encoder.inverse_transform([np.argmax(pred)])
print(predicted_category)


['Negative']


Sentiment analysis of movie reviews in Tanglish text (countvectorizer)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping


data = pd.read_csv('/content/Tamil_sentiments.csv', encoding='utf-8')


num_categories = 5


X_train, X_test, y_train, y_test = train_test_split(data['text'], data['category'], test_size=0.2, random_state=42)


vectorizer = CountVectorizer(max_features=5000)

X_train_seq = vectorizer.fit_transform(X_train).toarray()
X_test_seq = vectorizer.transform(X_test).toarray()

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()


y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)


max_len = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)


y_train_cat = to_categorical(y_train, num_classes=num_categories)
y_test_cat = to_categorical(y_test, num_classes=num_categories)


model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=32, input_length=max_len))
model.add(LSTM(units=64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(units=num_categories, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])




In [None]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
model.fit(X_train_pad, y_train_cat, epochs=10, batch_size=32, validation_data=(X_test_pad, y_test_cat), callbacks=[early_stopping])

# Save the model
model.save('tanglish_sentiment_lstm_countvectorizer.h5')

# Evaluate the performance of the model
loss, accuracy = model.evaluate(X_test_pad, y_test_cat, verbose=0)
print('Accuracy:', accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.6557637453079224


In [None]:
from keras.models import load_model


model = load_model('/content/tanglish_sentiment_lstm_countvectorizer.h5')


test_text = [' நான் இன்னும் கூட உங்கள் புதிய படத்தை பார்க்க மிகவும் ஆசையாக இருக்கிறேன்']
test_seq = tokenizer.texts_to_sequences(test_text)
test_pad = pad_sequences(test_seq, maxlen=max_len)
pred = model.predict(test_pad)


for p in pred:
    print(np.argmax(p))



2


In [None]:

predicted_category = encoder.inverse_transform([np.argmax(pred)])
print(predicted_category)

['Positive']
