<a href="https://colab.research.google.com/github/SiddanshChawla/Sentiment-Analysis/blob/main/nlp_neural_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the labelled sentences. Column names are supplied explicitly, so pandas
# treats the first line of the file as data; the file is read as ISO-8859-1.
data = pd.read_csv(
    'all-data.csv',
    names=['sentiment', 'text'],
    encoding='iso-8859-1',
)

# Peek at the first rows.
data.head()

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# Fetch the NLTK resources used below: the stop-word lists and the Punkt
# tokenizer data that word_tokenize relies on.
nltk.download("stopwords")
nltk.download('punkt')
# Recent NLTK releases (3.9+) load Punkt from the 'punkt_tab' package instead of
# 'punkt'; without it word_tokenize raises LookupError. Older releases just
# download it and never read it, so requesting both is safe.
nltk.download('punkt_tab')

# NOTE: this rebinds the imported `stopwords` corpus reader to a plain set of
# English stop words. `stopwords.words(...)` only works again after the
# `from nltk.corpus import stopwords` import has been re-executed.
stopwords = set(stopwords.words('english'))

def process_text(text):
    """Tokenise ``text`` into lowercase, letters-only, non-stop-word tokens.

    Each token produced by ``word_tokenize`` is lowercased, stripped of every
    non-letter character, and dropped if it is a stop word or if nothing is
    left of it (i.e. it was pure punctuation or digits).

    Args:
        text: The raw sentence to process.

    Returns:
        A list of cleaned token strings, in their original order.
    """
    pattern = '[^a-zA-Z]+'
    tokens = []
    for raw_token in word_tokenize(text):
        # The stop-word set holds lowercase words, so compare case-insensitively;
        # otherwise capitalised stop words such as "The" slip through.
        lowered = raw_token.lower()
        if lowered in stopwords:
            continue
        cleaned = re.sub(pattern, '', lowered)
        # Skip tokens that were only punctuation/digits (now empty strings) and
        # words that only turn into stop words once stray characters are gone.
        if cleaned and cleaned not in stopwords:
            tokens.append(cleaned)
    return tokens

# Tokenise every sentence and store the token lists next to the raw text.
data['tokens'] = data['text'].map(process_text)

data.head()

# Column dtypes and non-null counts, now including `tokens`.
data.info()

from sklearn.model_selection import train_test_split
import random

# Hold out 20% of the tokenised sentences; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    data['tokens'],
    data['sentiment'],
    test_size=0.2,
    random_state=42,
)

def document_features(document, word_features):
    """Build a bag-of-words presence map for one document.

    Args:
        document: Iterable of tokens making up the document.
        word_features: Iterable of vocabulary words to test for.

    Returns:
        A dict with one ``'contains(<word>)'`` key per vocabulary word, whose
        value is True when that word occurs in ``document`` and False otherwise.
    """
    # A set gives constant-time membership checks for each vocabulary word.
    present = set(document)
    return {
        'contains({})'.format(candidate): candidate in present
        for candidate in word_features
    }

# Vocabulary for the bag-of-words features: the 1000 most frequent training
# tokens. FreqDist.keys() iterates in first-seen order, so slicing it would pick
# whichever 1000 words happened to appear first rather than the most common
# ones; most_common() ranks by count.
all_words = nltk.FreqDist(word for doc in x_train for word in doc)
word_features = [word for word, _ in all_words.most_common(1000)]

# Pair each document's feature dict with its sentiment label
# (presumably the input for an NLTK classifier; none is trained in this section).
train_features = [(document_features(d, word_features), s) for d, s in zip(x_train, y_train)]
test_features = [(document_features(d, word_features), s) for d, s in zip(x_test, y_test)]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4846 entries, 0 to 4845
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  4846 non-null   object
 1   text       4846 non-null   object
 2   tokens     4846 non-null   object
dtypes: object(3)
memory usage: 113.7+ KB


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Preprocess the data
# Fit a case-folding word index on the raw sentences (num_words=5000 caps the
# vocabulary used when encoding), then turn each sentence into word ids.
tokenizer = Tokenizer(lower=True, num_words=5000)
tokenizer.fit_on_texts(data['text'])
sequences = tokenizer.texts_to_sequences(data['text'])

# Pad/truncate every sequence to 100 ids so they stack into one 2-D array.
padded_sequences = pad_sequences(sequences, maxlen=100)

In [None]:
import numpy as np

In [None]:
# Integer code for each sentiment class.
# NOTE(review): the codes are signed (-1/0/1); anything downstream that expects
# 0-based class indices or 0/1 targets has to remap them first.
label_map = {
    'positive': 1,
    'negative': -1,
    'neutral': 0,
}

# Look up the code for every row (a label missing from label_map raises
# KeyError) and store the result as a NumPy array.
numeric_labels = [label_map[sentiment] for sentiment in data['sentiment']]
list_input = np.asarray(numeric_labels)

In [None]:
# Split the padded sequences and their numeric labels into training and testing
# sets: 20% held out, fixed seed for reproducibility.
X_train_rnn, X_test_rnn, y_train_rnn, y_test_rnn = train_test_split(
    padded_sequences,
    list_input,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Define the model architecture
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=64, input_length=100))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the model
model.fit(X_train_rnn, y_train_rnn, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fea769b80a0>

In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [None]:

# Split data into train and test sets
# The original call split `X` and `y`, which are never defined in this notebook
# and raise NameError on a fresh kernel. The padded sequences and numeric labels
# are used instead; with the same test_size and random_state as the earlier
# sequence split, the partition is identical to X_*_rnn / y_*_rnn.
# NOTE: this rebinds y_train / y_test, which previously held the string labels
# from the token-level split.
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, list_input, test_size=0.2, random_state=42)


In [None]:

# Define CNN model architecture
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=32, input_length=100))
model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation='sigmoid'))

In [None]:

# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
model.fit(X_train_rnn, y_train_rnn, batch_size=32, epochs=10, validation_data=(X_test_rnn, y_test_rnn))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fea7658b130>