<a href="https://colab.research.google.com/github/OziomaEunice/Sentiment_GPT/blob/develop/SVM_SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**SVM for Sentiment Analysis**

In [1]:
# install the necessary libraries
! pip install numpy
! pip install pandas
! pip install scikit-learn
! pip install nltk
! pip install openpyxl # for reading excel files



In [2]:
# import the necessary libraries
import numpy as np
import pandas as pd
import nltk
import re
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [10]:
# Download the stopwords dataset
nltk.download('stopwords')

# Download wordnet dataset
nltk.download('wordnet')

# Download punkt dataset
nltk.download('punkt')

# Get the set of English stop words
stop_words = set(stopwords.words('english'))

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [11]:
# since dataset is imported to my Google Colab (which will remain in here temporary)
# read csv file
df = pd.read_excel('/content/Tweets.xlsx')

In [12]:
# show dataframe
df.head(10)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760512,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:00,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:00,,Pacific Time (US & Canada)
2,570301083672813568,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:00,Lets Play,Central Time (US & Canada)
3,570301031407624192,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:00,,Pacific Time (US & Canada)
4,570300817074462720,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:00,,Pacific Time (US & Canada)
5,570300767074181120,negative,1.0,Can't Tell,0.6842,Virgin America,,jnardino,,0,@VirginAmerica seriously would pay $30 a fligh...,,2015-02-24 11:14:00,,Pacific Time (US & Canada)
6,570300616901320704,positive,0.6745,,0.0,Virgin America,,cjmcginnis,,0,"@VirginAmerica yes, nearly every time I fly VX...",,2015-02-24 11:13:00,San Francisco CA,Pacific Time (US & Canada)
7,570300248553349120,neutral,0.634,,,Virgin America,,pilot,,0,@VirginAmerica Really missed a prime opportuni...,,2015-02-24 11:12:00,Los Angeles,Pacific Time (US & Canada)
8,570299953286942720,positive,0.6559,,,Virgin America,,dhepburn,,0,"@virginamerica Well, I didn't…but NOW I DO! :-D",,2015-02-24 11:11:00,San Diego,Pacific Time (US & Canada)
9,570295459631263744,positive,1.0,,,Virgin America,,YupitsTate,,0,"@VirginAmerica it was amazing, and arrived an ...",,2015-02-24 10:53:00,Los Angeles,Eastern Time (US & Canada)


In [13]:
# datatype info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   tweet_id                      14640 non-null  int64         
 1   airline_sentiment             14640 non-null  object        
 2   airline_sentiment_confidence  14640 non-null  float64       
 3   negativereason                9178 non-null   object        
 4   negativereason_confidence     10522 non-null  float64       
 5   airline                       14640 non-null  object        
 6   airline_sentiment_gold        40 non-null     object        
 7   name                          14640 non-null  object        
 8   negativereason_gold           32 non-null     object        
 9   retweet_count                 14640 non-null  int64         
 10  text                          14640 non-null  object        
 11  tweet_coord                 

## **Preprocessing Dataset**

In [14]:
# clean (preprocess) the dataset
def cleanData(text, min_word_length = 3):
  text = text.lower()
  text = re.sub(r'@[A-Za-z0-9]+', "", text) # this informs Python the the mentions in text must be substituted with an empty string
  text = re.sub(r'#', "", text) # removing #
  text = re.sub(r'RT[\s]+', "", text) # removing retweets
  text = re.sub(r'https?:\/\/\S+', "", text) # removing links
  text = ' '.join(word for word in text.split() if len(word) >= min_word_length and word not in stop_words) # Remove short words

  punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
  for p in punctuations:
      text = text.replace(p,'') #Removing punctuations

  # Lemmatize the words
  words = nltk.word_tokenize(text)
  lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
  text = ' '.join(lemmatized_words)

  return text

In [15]:
# df['text'] = df['text'].apply(cleanData)

# Apply the cleanData function to all text columns in the dataframe
df = df.applymap(lambda x: cleanData(x) if isinstance(x, str) else x)

In [16]:
df[['tweet_id', 'airline_sentiment', 'text']].head(20)

Unnamed: 0,tweet_id,airline_sentiment,text
0,570306133677760512,neutral,said
1,570301130888122368,positive,plus added commercial experience tacky
2,570301083672813568,neutral,today must mean need take another trip
3,570301031407624192,negative,really aggressive blast obnoxious `` entertain...
4,570300817074462720,negative,really big bad thing
5,570300767074181120,negative,seriously would pay 30 flight seat playing rea...
6,570300616901320704,positive,"yes , nearly every time fly “ ear worm ” won ’..."
7,570300248553349120,neutral,really missed prime opportunity men without ha...
8,570299953286942720,positive,"well , didnt…but do d"
9,570295459631263744,positive,"amazing , arrived hour early good me"


## **Feature Extraction: TF-IDF Vectorisation**

In [13]:
# consider individual words as tokens

## **Training SVM**

In [None]:
#

## **Testing SVM**

In [None]:
#