<a href="https://colab.research.google.com/github/OziomaEunice/Sentiment_GPT/blob/develop/SVM_SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**SVM for Sentiment Analysis**

In [1]:
# install the necessary libraries
! pip install numpy
! pip install pandas
! pip install scikit-learn
! pip install nltk
! pip install openpyxl # for reading excel files



In [2]:
# import the necessary libraries
import numpy as np
import pandas as pd
import nltk
import re
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
# Download the stopwords dataset
nltk.download('stopwords')

# Download wordnet dataset
nltk.download('wordnet')

# Download punkt dataset
nltk.download('punkt')

# Get the set of English stop words
stop_words = set(stopwords.words('english'))

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
# since dataset is imported to my Google Colab (which will remain in here temporary)
# read csv file
df = pd.read_excel('/content/Tweets.xlsx')

In [5]:
# show dataframe
df.head(10)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760512,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:00,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:00,,Pacific Time (US & Canada)
2,570301083672813568,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:00,Lets Play,Central Time (US & Canada)
3,570301031407624192,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:00,,Pacific Time (US & Canada)
4,570300817074462720,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:00,,Pacific Time (US & Canada)
5,570300767074181120,negative,1.0,Can't Tell,0.6842,Virgin America,,jnardino,,0,@VirginAmerica seriously would pay $30 a fligh...,,2015-02-24 11:14:00,,Pacific Time (US & Canada)
6,570300616901320704,positive,0.6745,,0.0,Virgin America,,cjmcginnis,,0,"@VirginAmerica yes, nearly every time I fly VX...",,2015-02-24 11:13:00,San Francisco CA,Pacific Time (US & Canada)
7,570300248553349120,neutral,0.634,,,Virgin America,,pilot,,0,@VirginAmerica Really missed a prime opportuni...,,2015-02-24 11:12:00,Los Angeles,Pacific Time (US & Canada)
8,570299953286942720,positive,0.6559,,,Virgin America,,dhepburn,,0,"@virginamerica Well, I didn't…but NOW I DO! :-D",,2015-02-24 11:11:00,San Diego,Pacific Time (US & Canada)
9,570295459631263744,positive,1.0,,,Virgin America,,YupitsTate,,0,"@VirginAmerica it was amazing, and arrived an ...",,2015-02-24 10:53:00,Los Angeles,Eastern Time (US & Canada)


In [6]:
# datatype info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   tweet_id                      14640 non-null  int64         
 1   airline_sentiment             14640 non-null  object        
 2   airline_sentiment_confidence  14640 non-null  float64       
 3   negativereason                9178 non-null   object        
 4   negativereason_confidence     10522 non-null  float64       
 5   airline                       14640 non-null  object        
 6   airline_sentiment_gold        40 non-null     object        
 7   name                          14640 non-null  object        
 8   negativereason_gold           32 non-null     object        
 9   retweet_count                 14640 non-null  int64         
 10  text                          14640 non-null  object        
 11  tweet_coord                 

## **Preprocessing Dataset**

In [7]:
# clean (preprocess) the dataset
def cleanData(text, min_word_length = 3):
  text = text.lower()
  text = re.sub(r'@[A-Za-z0-9]+', "", text) # this informs Python the the mentions in text must be substituted with an empty string
  text = re.sub(r'#', "", text) # removing #
  text = re.sub(r'RT[\s]+', "", text) # removing retweets
  text = re.sub(r'https?:\/\/\S+', "", text) # removing links
  text = ' '.join(word for word in text.split() if len(word) >= min_word_length and word not in stop_words) # Remove short words

  punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
  for p in punctuations:
      text = text.replace(p,'') #Removing punctuations

  # Lemmatize the words
  words = nltk.word_tokenize(text)
  lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
  text = ' '.join(lemmatized_words)

  return text

In [8]:
# df['text'] = df['text'].apply(cleanData)

# Apply the cleanData function to all text columns in the dataframe
df = df.applymap(lambda x: cleanData(x) if isinstance(x, str) else x)

In [9]:
df[['tweet_id', 'airline_sentiment', 'text']].head(20)

Unnamed: 0,tweet_id,airline_sentiment,text
0,570306133677760512,neutral,said
1,570301130888122368,positive,plus added commercial experience tacky
2,570301083672813568,neutral,today must mean need take another trip
3,570301031407624192,negative,really aggressive blast obnoxious `` entertain...
4,570300817074462720,negative,really big bad thing
5,570300767074181120,negative,seriously would pay 30 flight seat playing rea...
6,570300616901320704,positive,"yes , nearly every time fly “ ear worm ” won ’..."
7,570300248553349120,neutral,really missed prime opportunity men without ha...
8,570299953286942720,positive,"well , didnt…but do d"
9,570295459631263744,positive,"amazing , arrived hour early good me"


## **Feature Extraction: TF-IDF Vectorisation**

In [10]:
# initialise the tf-idf vectoriser (or model) and store it in a variable
tfidf_vector = TfidfVectorizer()

# print
print(tfidf_vector.fit_transform(df['text'])) # or tfidf_vector.fit_transform(df['text']).todense()  to make show more of a concise format

  (0, 10537)	1.0
  (1, 11679)	0.587226324074011
  (1, 5246)	0.3108474814709016
  (1, 3731)	0.4568755914870982
  (1, 1893)	0.44773057542842387
  (1, 9477)	0.3864467793986239
  (2, 12202)	0.3668452157938143
  (2, 2226)	0.35348883403300063
  (2, 11691)	0.34716649846261827
  (2, 8493)	0.29524407238130057
  (2, 8026)	0.4287492702497299
  (2, 8392)	0.4950254109755879
  (2, 12039)	0.32233600121746286
  (3, 10022)	0.3559086433379896
  (3, 7661)	0.2577184051556976
  (3, 2161)	0.17109355018711245
  (3, 5307)	0.31884927949014985
  (3, 6246)	0.33950961359090287
  (3, 5048)	0.2966026642795287
  (3, 8811)	0.3839429698578427
  (3, 2884)	0.3839429698578427
  (3, 1977)	0.37230767308507634
  (3, 9964)	0.19822982992323318
  (4, 11896)	0.5110998629027443
  (4, 2603)	0.48173574286467014
  :	:
  (14636, 5547)	0.20003777734326794
  (14637, 2869)	0.6686166614800932
  (14637, 2141)	0.42115141808130785
  (14637, 3065)	0.4553684748540195
  (14637, 2019)	0.28995750223081523
  (14637, 9446)	0.2900817851029409
  (1

In [11]:
# see TF-IDF vocabulary
tfidf_vector.vocabulary_

{'said': 10537,
 'plus': 9477,
 'added': 1893,
 'commercial': 3731,
 'experience': 5246,
 'tacky': 11679,
 'today': 12039,
 'must': 8392,
 'mean': 8026,
 'need': 8493,
 'take': 11691,
 'another': 2226,
 'trip': 12202,
 'really': 9964,
 'aggressive': 1977,
 'blast': 2884,
 'obnoxious': 8811,
 'entertainment': 5048,
 'guest': 6246,
 'face': 5307,
 'amp': 2161,
 'little': 7661,
 'recourse': 10022,
 'big': 2832,
 'bad': 2603,
 'thing': 11896,
 'seriously': 10776,
 'would': 13345,
 'pay': 9234,
 '30': 803,
 'flight': 5547,
 'seat': 10687,
 'playing': 9439,
 'flying': 5683,
 'yes': 13434,
 'nearly': 8490,
 'every': 5149,
 'time': 11986,
 'fly': 5672,
 'ear': 4856,
 'worm': 13315,
 'won': 13276,
 'away': 2531,
 'missed': 8230,
 'prime': 9675,
 'opportunity': 8958,
 'men': 8078,
 'without': 13246,
 'hat': 6351,
 'parody': 9173,
 'there': 11871,
 'well': 13075,
 'didnt': 4509,
 'but': 3182,
 'do': 4680,
 'amazing': 2130,
 'arrived': 2363,
 'hour': 6613,
 'early': 4859,
 'good': 6099,
 'me': 802

In [12]:
# place the whole of TF-IDF into a dataframe
tfidf_df = pd.DataFrame(tfidf_vector.fit_transform(df['text']).todense())
tfidf_df.columns = sorted(tfidf_vector.vocabulary_)

tfidf_df # show output of this new dataframe

Unnamed: 0,00,000,000ft,000lbs,0011,0016,006,0162389030167,0162424965446,0162431184663,...,zigzagging,zip,zipper,zombie,zone,zoom,zrh,zukes,zurich,zurichnew
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14635,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14636,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14637,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


### **Converting labels into integers or numeric values for ML**
**The purpose of this conversion is so that machines can understand the labels, since they aren't able to understand text data**

In [13]:
# create a new column called data and link the airline_sentiment to it
# (this 'label' column will have numbers indicating 1, 0, -1
# (positive, neutral, negative)
df['label'] = df['airline_sentiment'].apply(lambda x: int(1) if str(x) == 'positive' else (0 if x == 'neutral' else -1) )

In [17]:
df[['tweet_id', 'airline_sentiment', 'text', 'label']].head(20)

Unnamed: 0,tweet_id,airline_sentiment,text,label
0,570306133677760512,neutral,said,0
1,570301130888122368,positive,plus added commercial experience tacky,1
2,570301083672813568,neutral,today must mean need take another trip,0
3,570301031407624192,negative,really aggressive blast obnoxious `` entertain...,-1
4,570300817074462720,negative,really big bad thing,-1
5,570300767074181120,negative,seriously would pay 30 flight seat playing rea...,-1
6,570300616901320704,positive,"yes , nearly every time fly “ ear worm ” won ’...",1
7,570300248553349120,neutral,really missed prime opportunity men without ha...,0
8,570299953286942720,positive,"well , didnt…but do d",1
9,570295459631263744,positive,"amazing , arrived hour early good me",1


## **Training SVM**

In [15]:
#

## **Testing SVM**

In [16]:
#