<a href="https://colab.research.google.com/github/SIRREG001/Sentiment-Analysis/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Sentiment Analysis Procedure

In [2]:
#Import important libraries from NLP toolkit
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

#Download necessary nltk data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Data Preprocessing


In [3]:
#Tokenize texts
text = "I love the movie called Extraction!"
tokens = word_tokenize(text)
print(tokens)

['I', 'love', 'the', 'movie', 'called', 'Extraction', '!']


In [4]:
#convert tokens to lowercase
lowercase_tokens= [token.lower() for token in tokens]
print(lowercase_tokens)


['i', 'love', 'the', 'movie', 'called', 'extraction', '!']


In [5]:
#remove stopwords: words that do not have much meaning in the text
stopwords = set(stopwords.words('english'))
filtered_tokens = [token for token in lowercase_tokens if token not in stopwords]
print(filtered_tokens)
print(stopwords)

['love', 'movie', 'called', 'extraction', '!']
{'did', 'that', 'of', 'is', 'when', 're', 'other', "doesn't", 'from', 'each', 'ourselves', 'the', "isn't", 'at', 'yourselves', 'hasn', 've', 'yours', 'will', 'an', 'she', 'where', "it's", 'ours', 'doesn', 'him', 'under', "mightn't", "needn't", 'them', "she's", "haven't", 'i', 'his', 'hers', 'mustn', 'with', 'so', 'being', 'before', 'while', 'whom', 'we', 'wouldn', 'but', 'herself', 'in', 'their', 'ma', "wasn't", 'some', "you're", 'theirs', 'how', 'all', 'myself', "that'll", "hasn't", 'needn', "weren't", 'he', 'm', 'aren', "didn't", 'any', 'its', "hadn't", 'these', 'through', 'now', 'why', "you've", 'shouldn', 'then', 'who', 'your', 'both', 'too', 'o', "don't", 'up', 'over', "wouldn't", 'to', 'not', 'should', 'what', 'and', 'such', "mustn't", 'because', 'hadn', 'for', "aren't", 'are', 'once', 'here', 'me', 'during', 'only', "you'd", 'as', 'have', 'couldn', 'if', 'out', 'll', 'does', 'on', 'be', 's', 't', 'has', 'weren', "should've", 'shan',

In [6]:
#remove punctuations
import re
cleaned_tokens = [re.sub(r'[^\w\s]', '', token) for token in filtered_tokens]
print(cleaned_tokens)

['love', 'movie', 'called', 'extraction', '']


In [7]:
#stemming and lemmatizing
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in cleaned_tokens]
print(stemmed_tokens)

['love', 'movi', 'call', 'extract', '']


Feature Extraction

In [8]:
!pip install scikit-learn




In [9]:
# feature extraction is used to convert text into numerical values so that the algorithm can work on it
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [11]:
corpus = ["I love the movie called Extraction!", "This movie is great.", "I don't like Money heist"]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())
print(X.toarray())

['called' 'don' 'extraction' 'great' 'heist' 'is' 'like' 'love' 'money'
 'movie' 'the' 'this']
[[1 0 1 0 0 0 0 1 0 1 1 0]
 [0 0 0 1 0 1 0 0 0 1 0 1]
 [0 1 0 0 1 0 1 0 1 0 0 0]]


In [12]:
vectorizer2 = TfidfVectorizer()
Y = vectorizer2.fit_transform(corpus)
print(vectorizer2.get_feature_names_out())
print(Y.toarray())

['called' 'don' 'extraction' 'great' 'heist' 'is' 'like' 'love' 'money'
 'movie' 'the' 'this']
[[0.46735098 0.         0.46735098 0.         0.         0.
  0.         0.46735098 0.         0.35543247 0.46735098 0.        ]
 [0.         0.         0.         0.52863461 0.         0.52863461
  0.         0.         0.         0.40204024 0.         0.52863461]
 [0.         0.5        0.         0.         0.5        0.
  0.5        0.         0.5        0.         0.         0.        ]]


Working on real data: Financial Sentiment data from Kaggle.com

Using SVM, a ML algorithm for Sentiment Analysis:

In [18]:
#Importing important packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.metrics import accuracy_score

In [19]:
#load the dataset from csv

data = pd.read_csv('data.csv')
X = data['Sentence']
y = data['Sentiment']

In [20]:
print(X)
print(y)

0       The GeoSolutions technology will leverage Bene...
1       $ESI on lows, down $1.50 to $2.50 BK a real po...
2       For the last quarter of 2010 , Componenta 's n...
3       According to the Finnish-Russian Chamber of Co...
4       The Swedish buyout firm has sold its remaining...
                              ...                        
5837    RISING costs have forced packaging producer Hu...
5838    Nordic Walking was first used as a summer trai...
5839    According shipping company Viking Line , the E...
5840    In the building and home improvement trade , s...
5841    HELSINKI AFX - KCI Konecranes said it has won ...
Name: Sentence, Length: 5842, dtype: object
0       positive
1       negative
2       positive
3        neutral
4        neutral
          ...   
5837    negative
5838     neutral
5839     neutral
5840     neutral
5841    positive
Name: Sentiment, Length: 5842, dtype: object


In [22]:
#split the dataset into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [23]:
#Feature extraction
vectorizer = CountVectorizer()
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

In [24]:
#Train the SVM classifier: Which is a ML algorithm used for the Sentiment analysis
clf = svm.SVC()
clf.fit(X_train_features, y_train)

In [25]:
#Make predictions
y_predict = clf.predict(X_test_features)

#Evaluate the model
accuracy = accuracy_score(y_test, y_predict)
print("accuracy:", accuracy)

accuracy: 0.6920444824636441


Using Keras, a ML algorithm for Sentiment Analysis

In [26]:
#import important modules
from tensorflow import keras
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd

In [27]:
#split the dataset into features(x) and labels(y)
sentences = data['Sentence'].values
#set positive sentiment to the value of 1 and negative sentiment to 0
labels = data['Sentiment'].apply(lambda x: 1 if x == 'positive' else 0).values

In [28]:
#Tokenize the sentences
tokenizer = Tokenizer(num_words=5000, oov_token='<00V>')
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

In [29]:
#pad the sequences
padded_sequences = pad_sequences(sequences, padding='post')

#split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)


In [30]:
#define the neural network architecture
vocab_size = len(tokenizer.word_index) + 1 # Adding 1 because of reserved 0 index for padding
embedding_dim = 100 #any size can be chosen here for the embedding_dim
max_length = len(max(sequences, key=len))

model = keras.Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(LSTM(units=120))
model.add(Dense(units=1, activation='sigmoid'))

#compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [31]:
#Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32)

#Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print("Loss:", loss)
print("Accuracy:", accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss: 0.6264046430587769
Accuracy: 0.6817793250083923
