<a href="https://colab.research.google.com/github/OziomaEunice/Sentiment_GPT/blob/develop/SVM_SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **SVM for Sentiment Analysis**

In [None]:
# install the necessary libraries
! pip install numpy
! pip install pandas
! pip install scikit-learn
! pip install nltk
! pip install openpyxl # for reading excel files

In [2]:
# import the necessary libraries
import numpy as np
import pandas as pd
import nltk
import re
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [None]:
# Download the NLTK resources needed by the cleaning functions below.
# Stop-word lists
nltk.download('stopwords')

# WordNet data used by the lemmatizer
nltk.download('wordnet')

# Tokenizer models for nltk.word_tokenize. Recent NLTK releases load the
# 'punkt_tab' resource instead of the pickled 'punkt' one, so both are requested
# to keep the notebook working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

# Get the set of English stop words (a set gives fast membership tests)
stop_words = set(stopwords.words('english'))

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

In [4]:
# The datasets were uploaded to this Google Colab session, where they are only
# stored temporarily, so they have to be uploaded again for a new session.
# NOTE(review): the paths are absolute Colab paths - adjust them when running elsewhere.
# Read the Excel files into dataframes.
df = pd.read_excel('/content/Tweets.xlsx') # Twitter dataset
df2 = pd.read_excel('/content/IMDB_Dataset.xlsx') # IMDb Movie Review dataset

In [None]:
# (rows, columns) of the Twitter dataframe
df.shape

In [None]:
# (rows, columns) of the IMDb dataframe
df2.shape

In [None]:
# preview the first 10 rows of the Twitter dataframe
df.head(10)

In [None]:
# preview the first 10 rows of the IMDb dataframe
df2.head(10)

In [None]:
# Column dtypes and non-null counts for both dataframes,
# with a separator line printed between the two summaries
for position, frame in enumerate((df, df2)):
  if position > 0:
    print('\n***========================***\n')
  frame.info()

In [None]:
# number of missing values per column in the Twitter dataframe
df.isnull().sum()

In [None]:
# number of missing values per column in the IMDb dataframe
df2.isnull().sum()

## **Preprocessing Dataset**

### **Handling Missing Values**

In [12]:
# Drop the Twitter columns that are not needed for processing the data.
# errors='ignore' keeps this cell safe to re-run: without it a second execution
# raises KeyError because the columns have already been removed from df.
df = df.drop(
    columns=[
        'tweet_id',
        'airline_sentiment_gold',
        'negativereason',
        'negativereason_confidence',
        'negativereason_gold',
        'retweet_count',
        'tweet_coord',
        'tweet_created',
        'tweet_location',
        'user_timezone',
    ],
    errors='ignore',
)

In [None]:
# missing values per column after dropping the unused columns
df.isnull().sum()

In [None]:
# preview the Twitter dataframe after dropping the unused columns
df.head(10)

### **Cleaning Dataset**

In [15]:
# clean (preprocess) the Twitter dataset
def cleanData1(text, min_word_length = 3):
  """Normalise one piece of raw text for TF-IDF vectorisation.

  Steps, in order: remove retweet markers, lower-case, remove @mentions,
  '#' signs and http(s) links, drop stop words and words shorter than
  ``min_word_length``, strip punctuation, then lemmatize every token.

  Relies on the notebook-level ``stop_words`` set and ``lemmatizer`` object.

  Args:
    text: the string to clean.
    min_word_length: whitespace-separated words shorter than this are dropped.

  Returns:
    The cleaned text as a single space-joined string.
  """
  # The retweet marker is upper-case, so it has to be removed BEFORE lower-casing;
  # the word boundary stops it from eating the tail of words such as "SMART ".
  text = re.sub(r'\bRT\s+', "", text)
  text = text.lower()
  # \w also covers underscores, so a handle like @some_user is removed whole
  text = re.sub(r'@\w+', "", text) # removing mentions
  text = re.sub(r'#', "", text) # removing # (the hashtag word itself is kept)
  text = re.sub(r'https?:\/\/\S+', "", text) # removing links
  # Remove short words and stop words
  text = ' '.join(word for word in text.split() if len(word) >= min_word_length and word not in stop_words)

  # Removing punctuations in a single pass
  punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
  text = text.translate(str.maketrans('', '', punctuations))

  # Lemmatize the words
  words = nltk.word_tokenize(text)
  lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
  text = ' '.join(lemmatized_words)

  return text

In [16]:
# clean (preprocess) the IMDb Movie Review dataset
def cleanData2(review, min_word_length = 3):
  """Clean one IMDb review with the same pipeline used for the tweets.

  The review is passed straight to ``cleanData1`` so that both datasets are
  normalised by a single implementation; the classifier is trained on one
  dataset and evaluated on the other, so their preprocessing must not drift apart.

  Args:
    review: the review text to clean.
    min_word_length: whitespace-separated words shorter than this are dropped.

  Returns:
    The cleaned review as a single space-joined string.
  """
  return cleanData1(review, min_word_length)

In [17]:
# Clean every cell of the Twitter dataframe with cleanData1.
# Only string values are cleaned; any non-string value is passed through unchanged.
def _clean_tweet_cell(value):
  if isinstance(value, str):
    return cleanData1(value)
  return value

df = df.applymap(_clean_tweet_cell)

In [18]:
# Clean every cell of the IMDb dataframe with cleanData2.
# Only string values are cleaned; any non-string value is passed through unchanged.
def _clean_review_cell(value):
  if isinstance(value, str):
    return cleanData2(value)
  return value

df2 = df2.applymap(_clean_review_cell)

### **Converting labels into integers or numeric values for ML**
**Machine-learning models operate on numeric values and cannot work with text labels directly, so each sentiment label is converted to an integer.**

In [19]:
# Add a numeric 'label' column derived from the sentiment column of each dataset:
# 1 = positive, 0 = neutral, -1 = every other value (i.e. negative)
def _sentiment_to_label(sentiment):
  """Return 1 for 'positive', 0 for 'neutral' and -1 for any other value."""
  if sentiment == 'positive':
    return 1
  if sentiment == 'neutral':
    return 0
  return -1

# One shared helper keeps the encoding identical for both datasets
df['label'] = df['airline_sentiment'].apply(_sentiment_to_label)
df2['label'] = df2['sentiment'].apply(_sentiment_to_label)

In [None]:
# compare the original sentiment, the cleaned text and the numeric label side by side
df[['airline_sentiment', 'text', 'label']].head(20)

## **Feature Extraction: TF-IDF Vectorisation**

### **Split the Dataset into Training and Testing**

In [21]:
# Hold out 20% of the tweets for testing. The split is stratified on the label
# and uses a fixed random_state so it is the same on every run.
text_train, text_test, label_train, label_test = train_test_split(
    df['text'].values,
    df['label'].values,
    stratify=df['label'].values,
    test_size=0.2,
    random_state=42,
)

In [None]:
# sizes of the four arrays produced by the split: train texts, test texts, train labels, test labels
print(*(len(part) for part in (text_train, text_test, label_train, label_test)))

### **TF-IDF**

In [23]:
# # initialise the tf-idf vectoriser (or model) and store it in a variable
# tfidf_vector = TfidfVectorizer()

# # Fit and transform the entire dataset
# tfidf_matrix = tfidf_vector.fit_transform(df['text'])

# # Display the TF-IDF vocabulary
# print("Vocabulary:", tfidf_vector.vocabulary_)

# # Place the TF-IDF matrix into a DataFrame
# tfidf_df = pd.DataFrame(tfidf_matrix.todense(), columns=sorted(tfidf_vector.vocabulary_))

# # Display the TF-IDF DataFrame
# print(tfidf_df)

# # Transform the training and testing sets
# tfidf_train = tfidf_vector.transform(text_train)
# # tfidf_test = tfidf_vector.transform(text_test)
# tfidf_test = tfidf_vector.transform(df2['review'].values)

In [None]:
# Initialize the TF-IDF vectorizer
tfidf_vector = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the TRAINING split only, so that no
# information from the held-out tweets leaks into the features, then transform
# both splits with the fitted vectorizer.
tfidf_train = tfidf_vector.fit_transform(text_train)
tfidf_test = tfidf_vector.transform(text_test)

# TF-IDF representation of the whole Twitter dataset, for inspection only
tfidf_matrix = tfidf_vector.transform(df['text'])

# Wrap the matrix in a sparse-backed DataFrame instead of calling todense():
# a dense copy allocates rows x vocabulary floats just to look at a few rows.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix)

# Display a subset of the TF-IDF DataFrame
print(tfidf_df.head())

## **Training SVM**

In [None]:
# Fit a linear-kernel support vector classifier on the TF-IDF training features
svm = SVC(kernel='linear').fit(tfidf_train, label_train)

# show the fitted estimator as the cell output
svm

## **Testing SVM**

In [None]:
# Test the SVM model: vectorise the IMDb reviews with the vectorizer fitted on
# the tweets and predict their sentiment with the Twitter-trained classifier
imdb_labels = df2['label'].values
tfidf_test_imdb = tfidf_vector.transform(df2['review'].values)

predictions_imdb = svm.predict(tfidf_test_imdb)

print('==============================================')
print('\nClassification Report\n')
print('==============================================')

accuracy = accuracy_score(imdb_labels, predictions_imdb)
accuracy_perc = accuracy * 100

print(f"Accuracy on IMDb dataset: {accuracy_perc:.2f}%")
print('---------------------------\n')

# zero_division=0: a precision/recall that is undefined (a class with no true
# samples or no predictions) is reported as 0. With zero_division=1 such a class
# would be scored as a perfect 1.0 and inflate the averaged metrics.
print(classification_report(imdb_labels, predictions_imdb, zero_division=0))

In [None]:
# preview the first 20 rows of the cleaned and labelled IMDb dataframe
df2.head(20)