<a href="https://colab.research.google.com/github/OziomaEunice/Sentiment_GPT/blob/develop/SVM_SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **SVM for Sentiment Analysis**

In [None]:
# install the necessary libraries
! pip install numpy
! pip install pandas
! pip install scikit-learn
! pip install nltk
! pip install openpyxl # for reading excel files

In [2]:
# import the necessary libraries
import numpy as np
import pandas as pd
import nltk
import re
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
# Download the NLTK resources used by the preprocessing step:
#   stopwords - the English stop-word list
#   wordnet   - the lexical database behind WordNetLemmatizer
#   punkt     - tokenizer models used by word_tokenize
#   punkt_tab - tokenizer tables that recent NLTK releases load instead of
#               punkt; without it word_tokenize raises LookupError there
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Get the set of English stop words (a set gives fast membership tests)
stop_words = set(stopwords.words('english'))

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

In [4]:
# The dataset was uploaded to this Google Colab session, where it is stored only temporarily.
# Read the Excel workbook into a dataframe.
df = pd.read_excel('/content/Tweets.xlsx')

In [None]:
# Preview the first 10 rows of the dataframe
df.head(10)

In [None]:
# Summarise the dataframe: column names, non-null counts and dtypes
df.info()

## **Preprocessing Dataset**

In [7]:
# clean (preprocess) the dataset
def cleanData(text, min_word_length = 3):
  """Normalise a raw tweet into a space-separated string of lemmatized tokens.

  Steps, in order: drop retweet markers, lowercase, drop links, @mentions and
  '#' signs, strip punctuation, drop stop words and short tokens, lemmatize.

  Relies on the module-level `stop_words` set and `lemmatizer` instance.

  Args:
    text: Raw tweet text (str).
    min_word_length: Minimum number of characters a token must have, after
      punctuation has been stripped, to be kept.

  Returns:
    The cleaned text as a single string.
  """
  # Retweet markers are upper-case, so they must be removed before lowercasing
  text = re.sub(r'\bRT\s+', "", text) # removing retweets
  text = text.lower()
  text = re.sub(r'https?:\/\/\S+', "", text) # removing links
  text = re.sub(r'@\w+', "", text) # removing mentions (\w also covers '_' in handles)
  text = re.sub(r'#', "", text) # removing #

  punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
  strip_punctuation = str.maketrans('', '', punctuations)

  kept = []
  for raw_token in text.split():
    # Check the raw token first so contractions such as "don't" are still
    # recognised as stop words before their apostrophe is stripped
    if raw_token in stop_words:
      continue
    token = raw_token.translate(strip_punctuation) # Removing punctuations
    # Filter on the stripped token so "the." or "ok!" cannot slip through
    if len(token) >= min_word_length and token not in stop_words:
      kept.append(token)
  text = ' '.join(kept)

  # Lemmatize the words
  words = nltk.word_tokenize(text)
  lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
  text = ' '.join(lemmatized_words)

  return text

In [8]:
# Clean only the tweet text. Running cleanData over every string column would
# also strip stop words and short words from the label and metadata columns.
# Non-string cells (e.g. missing values) are passed through unchanged.
df['text'] = df['text'].apply(lambda x: cleanData(x) if isinstance(x, str) else x)

In [None]:
# Inspect the first 20 rows of the id, sentiment and text columns after cleaning
df[['tweet_id', 'airline_sentiment', 'text']].head(20)

## **Feature Extraction: TF-IDF Vectorisation**

In [None]:
# Build a TF-IDF vectoriser with scikit-learn's default settings
tfidf_vector = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the tweet text, then print the
# resulting sparse document-term matrix (call .todense() on it for a dense view).
# NOTE(review): this fit uses every row of df['text'], i.e. it happens before
# the train/test split further down - confirm that is intended for modelling.
tfidf_matrix = tfidf_vector.fit_transform(df['text'])
print(tfidf_matrix)

In [None]:
# Inspect the fitted TF-IDF vocabulary: a mapping from each term to its feature (column) index
tfidf_vector.vocabulary_

In [None]:
# Place the whole TF-IDF matrix into a dataframe: one row per tweet, one column per term.
# .toarray() yields a plain ndarray (.todense() returns the discouraged np.matrix type).
tfidf_df = pd.DataFrame(tfidf_vector.fit_transform(df['text']).toarray())

# Ask the fitted vectoriser for the term behind each column instead of assuming
# that sorting the vocabulary reproduces the column order.
tfidf_df.columns = tfidf_vector.get_feature_names_out()

tfidf_df # show output of this new dataframe

### **Converting labels into integers or numeric values for ML**
**Most machine-learning tooling expects numeric targets, so the text sentiment labels are mapped to integers: positive → 1, neutral → 0, negative → -1.**

In [13]:
# Create a new 'label' column derived from 'airline_sentiment':
# 1 = positive, 0 = neutral, -1 = negative.
SENTIMENT_TO_LABEL = {'positive': 1, 'neutral': 0, 'negative': -1}

# Normalise case and surrounding whitespace so the lookup does not depend on
# how the sentiment strings were typed or preprocessed.
sentiment_normalised = df['airline_sentiment'].astype(str).str.strip().str.lower()
label_encoded = sentiment_normalised.map(SENTIMENT_TO_LABEL)

# Fail loudly on anything unexpected (missing values, typos) rather than
# silently counting it as negative.
unexpected_sentiments = sentiment_normalised[label_encoded.isna()].unique()
if len(unexpected_sentiments) > 0:
    raise ValueError(
        "Unexpected values in 'airline_sentiment': " + repr(sorted(unexpected_sentiments))
    )

df['label'] = label_encoded.astype(int)

In [None]:
# Inspect the first 20 rows, now including the numeric 'label' column
df[['tweet_id', 'airline_sentiment', 'text', 'label']].head(20)

### **Split the Dataset into Training and Testing**

In [None]:
# Hold out 20% of the tweets for testing. Stratifying on the label keeps the
# class proportions the same in both splits; the fixed random_state makes the
# split reproducible.
text_train, text_test, label_train, label_test = train_test_split(
    df['text'].values,
    df['label'].values,
    test_size = 0.2,
    random_state = 42,
    stratify = df['label'].values,
)

# Sizes of the four resulting arrays
print(*map(len, (text_train, text_test, label_train, label_test)))

## **Training SVM**

In [16]:
# Create a support vector classifier with a linear kernel.
# NOTE: this cell only instantiates the model; it does not call fit() on it.
svm = SVC(kernel='linear')


## **Testing SVM**

In [17]:
#