# **Samuel Vasco Gonzalez CC.1152223665**

### Comparison of different NLP models: TextBlob, Microsoft Azure Text Analytics, my own scikit-learn model, and the Hugging Face Transformers RoBERTa model.

In [None]:
!pip install azure-ai-textanalytics==5.2.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting azure-ai-textanalytics==5.2.0
  Downloading azure_ai_textanalytics-5.2.0-py3-none-any.whl (239 kB)
[K     |████████████████████████████████| 239 kB 9.1 MB/s 
Collecting azure-core<2.0.0,>=1.24.0
  Downloading azure_core-1.26.0-py3-none-any.whl (178 kB)
[K     |████████████████████████████████| 178 kB 33.5 MB/s 
[?25hCollecting azure-common~=1.1
  Downloading azure_common-1.1.28-py2.py3-none-any.whl (14 kB)
Collecting msrest>=0.7.0
  Downloading msrest-0.7.1-py3-none-any.whl (85 kB)
[K     |████████████████████████████████| 85 kB 2.6 MB/s 
Collecting isodate>=0.6.0
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[K     |████████████████████████████████| 41 kB 676 kB/s 
Installing collected packages: isodate, azure-core, msrest, azure-common, azure-ai-textanalytics
Successfully installed azure-ai-textanalytics-5.2.0 azure-common-1.1.28 azure-core-1.26.0 isodate-0.6.

In [None]:
import pandas as pd
import numpy as np
import re
import spacy 
from nltk.corpus import stopwords
from nltk.corpus import stopwords
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Airline Tweets

In [None]:
# Load the airline tweets and keep only the tweet text and its sentiment label.
data_source_url = "https://raw.githubusercontent.com/mhemmg/datasets/master/nlp/airline_tweets.csv"
airline_tweets = pd.read_csv(data_source_url)
# .copy() turns the two-column selection into an independent frame, so the
# in-place edits made to it further down (dropping rows, adding a column) do
# not trigger pandas' SettingWithCopyWarning.
airline_tweets = airline_tweets[["text", "airline_sentiment"]].copy()
airline_tweets

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative
...,...,...
14635,@AmericanAir thank you we got on a different f...,positive
14636,@AmericanAir leaving over 20 minutes Late Flig...,negative
14637,@AmericanAir Please bring American Airlines to...,neutral
14638,"@AmericanAir you have my money, you change my ...",negative


* Se cuentan las entradas de valores NaN y se eliminan las filas con NaN.

In [None]:
# Report whether the frame contains any missing value, and how many in total.
nan_mask = airline_tweets.isna()
check_for_any_nan = nan_mask.any().any()
total_nan_values = nan_mask.sum().sum()

print(f"Presencia NaN: {check_for_any_nan}")
print(f"Total de valores NaN: {total_nan_values}")

Presencia NaN: False
Total de valores NaN: 0


* Se cuentan las filas repetidas y se eliminan.

In [None]:
airline_tweets.duplicated().sum() #numero de filas repetidas

188

In [None]:
airline_tweets.drop_duplicates(inplace=True) #eliminacion de filas repetidas

In [None]:
airline_tweets.shape

(14452, 2)

# Identifying columns that will constitute features and data (features and labels)

In [None]:
features = airline_tweets["text"].values
features[:20]

array(['@VirginAmerica What @dhepburn said.',
       "@VirginAmerica plus you've added commercials to the experience... tacky.",
       "@VirginAmerica I didn't today... Must mean I need to take another trip!",
       '@VirginAmerica it\'s really aggressive to blast obnoxious "entertainment" in your guests\' faces &amp; they have little recourse',
       "@VirginAmerica and it's a really big bad thing about it",
       "@VirginAmerica seriously would pay $30 a flight for seats that didn't have this playing.\nit's really the only bad thing about flying VA",
       '@VirginAmerica yes, nearly every time I fly VX this “ear worm” won’t go away :)',
       '@VirginAmerica Really missed a prime opportunity for Men Without Hats parody, there. https://t.co/mWpG7grEZP',
       "@virginamerica Well, I didn't…but NOW I DO! :-D",
       "@VirginAmerica it was amazing, and arrived an hour early. You're too good to me.",
       '@VirginAmerica did you know that suicide is the second leading cause of

In [None]:
labels = airline_tweets["airline_sentiment"].values
labels

array(['neutral', 'positive', 'neutral', ..., 'neutral', 'negative',
       'neutral'], dtype=object)

# Cleaning and pre-processing tweets.

### English Stop Words

In [None]:
nlp = spacy.load("en_core_web_sm")
spacy_stopwords_en = spacy.lang.en.stop_words.STOP_WORDS
len(spacy_stopwords_en), spacy_stopwords_en

(326,
 {"'d",
  "'ll",
  "'m",
  "'re",
  "'s",
  "'ve",
  'a',
  'about',
  'above',
  'across',
  'after',
  'afterwards',
  'again',
  'against',
  'all',
  'almost',
  'alone',
  'along',
  'already',
  'also',
  'although',
  'always',
  'am',
  'among',
  'amongst',
  'amount',
  'an',
  'and',
  'another',
  'any',
  'anyhow',
  'anyone',
  'anything',
  'anyway',
  'anywhere',
  'are',
  'around',
  'as',
  'at',
  'back',
  'be',
  'became',
  'because',
  'become',
  'becomes',
  'becoming',
  'been',
  'before',
  'beforehand',
  'behind',
  'being',
  'below',
  'beside',
  'besides',
  'between',
  'beyond',
  'both',
  'bottom',
  'but',
  'by',
  'ca',
  'call',
  'can',
  'cannot',
  'could',
  'did',
  'do',
  'does',
  'doing',
  'done',
  'down',
  'due',
  'during',
  'each',
  'eight',
  'either',
  'eleven',
  'else',
  'elsewhere',
  'empty',
  'enough',
  'even',
  'ever',
  'every',
  'everyone',
  'everything',
  'everywhere',
  'except',
  'few',
  'fifteen',

In [None]:

text_doc=nlp("@virginamerica Well, I didn't…but NOW I DO! :-D")
new_text=""
for token in text_doc:
     if not token.is_stop:
       print(token)
       new_text=new_text+" "+str(token)
print(new_text)

@virginamerica
,
didn't
…
!
:-D
 @virginamerica , didn't … ! :-D


## Function for cleaning tweets

In [None]:
def limpiar_texto(texto, nlp_model=None):
    """Clean a collection of tweets for sentiment modelling.

    Each tweet is lower-cased, tokenized, stripped of stop words and then
    passed through a fixed sequence of regex substitutions (retweet markers,
    mentions, links, numbers, special characters, single letters, repeated
    whitespace).

    Parameters
    ----------
    texto : iterable of str
        Raw tweets.
    nlp_model : callable, optional
        Tokenizer called as ``nlp_model(text)``; it must return an iterable of
        tokens exposing ``is_stop`` and a string form. Defaults to the
        notebook-level spaCy pipeline ``nlp``.

    Returns
    -------
    list of str
        One cleaned string per input tweet. Kept tokens are joined with a
        leading space, so a tweet whose kept tokens are all removed by the
        regexes comes back as ``' '`` and a tweet made only of stop words
        comes back as ``''``.
    """
    tokenize = nlp if nlp_model is None else nlp_model

    # Applied in order; every match is replaced by a single space.
    substitutions = (
        # Retweet marker. It must run BEFORE mention removal, otherwise the
        # "@user" it needs is already gone. The optional whitespace before
        # ":" covers a tokenizer that splits the colon off.
        r"\brt\s+@\S+\s*:?",
        r"@\S+",            # mentions
        r"http\S+",         # links
        r"\d+",             # numbers
        r"[^A-Za-z0-9]",    # special characters
        # Single-letter words. Word boundaries do not consume the neighbouring
        # whitespace, so runs such as "x y z" are removed completely and the
        # start/end of the string need no separate rule.
        r"\b[A-Za-z]\b",
        r"\s+",             # collapse repeated whitespace
    )

    texto_clear = []
    for text in texto:
        kept_tokens = [str(token) for token in tokenize(text.lower()) if not token.is_stop]
        nuevo_texto = "".join(" " + token for token in kept_tokens)
        for pattern in substitutions:
            nuevo_texto = re.sub(pattern, " ", nuevo_texto)
        texto_clear.append(nuevo_texto)

    return texto_clear

features_clear=limpiar_texto(features)
features_clear[:20]

[' said ',
 ' plus added commercials experience tacky ',
 ' today mean need trip ',
 ' aggressive blast obnoxious entertainment guests faces amp little recourse',
 ' big bad thing',
 ' seriously pay flight seats playing bad thing flying va',
 ' yes nearly time fly vx ear worm wo away ',
 ' missed prime opportunity men hats parody ',
 ' didn ',
 ' amazing arrived hour early good ',
 ' know suicide second leading cause death teens ',
 ' lt pretty graphics better minimal iconography ',
 ' great deal thinking nd trip amp gone st trip ',
 ' flying fabulous seductive skies stress away travel ',
 ' thanks ',
 ' sfo pdx schedule mia ',
 ' excited cross country flight lax mco heard great things virgin america daystogo',
 ' flew nyc sfo week fully sit seat large gentleman help ',
 ' flying ',
 ' know amazingly awesome bos fll want fly ']

In [None]:
airline_tweets['processed_Text']=features_clear
airline_tweets

Unnamed: 0,text,airline_sentiment,processed_Text
0,@VirginAmerica What @dhepburn said.,neutral,said
1,@VirginAmerica plus you've added commercials t...,positive,plus added commercials experience tacky
2,@VirginAmerica I didn't today... Must mean I n...,neutral,today mean need trip
3,@VirginAmerica it's really aggressive to blast...,negative,aggressive blast obnoxious entertainment gues...
4,@VirginAmerica and it's a really big bad thing...,negative,big bad thing
...,...,...,...
14635,@AmericanAir thank you we got on a different f...,positive,thank got different flight chicago
14636,@AmericanAir leaving over 20 minutes Late Flig...,negative,leaving minutes late flight warnings communic...
14637,@AmericanAir Please bring American Airlines to...,neutral,bring american airlines blackberry
14638,"@AmericanAir you have my money, you change my ...",negative,money change flight answer phones suggestions...


Inspecting the tweets after text preprocessing shows that some of them were left empty (" ").

In [None]:
airline_tweets[airline_tweets.processed_Text==' ']

Unnamed: 0,text,airline_sentiment,processed_Text
269,@VirginAmerica I see what you did there ;),positive,
669,@united Done and done,neutral,
781,@united you too!,positive,
988,@united done,neutral,
1256,@united I already did.,neutral,
1644,@united now this http://t.co/uygeW2Nosr,negative,
2258,@united never again.,negative,
2603,@united @SCVPools \nPlease call me at 310-795-...,neutral,
2607,@united no u don't,negative,
3509,@united I have.,neutral,


 In total there were 60 such tweets.

In [None]:
# Tweets whose cleaned text collapsed to a single space, as a count and as a
# share of the whole frame.
empty_mask = airline_tweets.processed_Text == ' '
n_empty = int(empty_mask.sum())
print("Empty tweets: ", n_empty)
print("Empty tweets represents:", n_empty*100/len(airline_tweets), "% of the dataset")

Empty tweets:  60
Empty tweets represents: 0.41516745087185164 % of the dataset


Thus we remove these 60 empty tweets, which represent less than 1% of the data.

In [None]:
airline_tweets.drop(airline_tweets[airline_tweets.processed_Text==' '].index, inplace = True)

In [None]:
airline_tweets

Unnamed: 0,text,airline_sentiment,processed_Text
0,@VirginAmerica What @dhepburn said.,neutral,said
1,@VirginAmerica plus you've added commercials t...,positive,plus added commercials experience tacky
2,@VirginAmerica I didn't today... Must mean I n...,neutral,today mean need trip
3,@VirginAmerica it's really aggressive to blast...,negative,aggressive blast obnoxious entertainment gues...
4,@VirginAmerica and it's a really big bad thing...,negative,big bad thing
...,...,...,...
14635,@AmericanAir thank you we got on a different f...,positive,thank got different flight chicago
14636,@AmericanAir leaving over 20 minutes Late Flig...,negative,leaving minutes late flight warnings communic...
14637,@AmericanAir Please bring American Airlines to...,neutral,bring american airlines blackberry
14638,"@AmericanAir you have my money, you change my ...",negative,money change flight answer phones suggestions...


Variables after all tweets have been preprocessed.

In [None]:
features = airline_tweets["text"].values
features_clear = airline_tweets["processed_Text"].values
labels = airline_tweets["airline_sentiment"].values

# Model NLP TextBlob 


In [None]:
def Text_Blob(tweets):
    """Label each tweet as negative / neutral / positive using TextBlob.

    Parameters
    ----------
    tweets : iterable of str
        Texts to score.

    Returns
    -------
    numpy.ndarray
        One label per tweet: "negative" when TextBlob's polarity is below 0,
        "positive" when it is above 0, and "neutral" when it is exactly 0.
        An empty input yields an empty array.
    """
    polarity = np.array([TextBlob(text).sentiment.polarity for text in tweets], dtype=float)
    # np.where works on empty arrays, whereas np.vectorize raises on size-0
    # input when no output type is declared.
    return np.where(polarity < 0, "negative",
                    np.where(polarity > 0, "positive", "neutral"))

## TextBlob Without cleaning tweets

In [None]:
TextBlob_Classifier = Text_Blob(features)

print(accuracy_score(labels, TextBlob_Classifier))

0.46352140077821014


## TextBlob With cleaning tweets

In [None]:
TextBlob_Classifier = Text_Blob(features_clear)

print(accuracy_score(labels, TextBlob_Classifier))

0.44385769872151193


# Model NLP Microsoft Azure

### Authentication

In [None]:
import os
from getpass import getpass

from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential

# The subscription key is a secret: take it from the environment, or prompt
# for it, instead of storing it in the notebook.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be rotated in the Azure portal.
key = os.environ.get("AZURE_LANGUAGE_KEY") or getpass("Azure Text Analytics key: ")
endpoint = os.environ.get("AZURE_LANGUAGE_ENDPOINT", "https://samuvg.cognitiveservices.azure.com/")


def authenticate_client():
    """Return a TextAnalyticsClient built from the module-level `key` and `endpoint`."""
    ta_credential = AzureKeyCredential(key)
    text_analytics_client = TextAnalyticsClient(
            endpoint=endpoint,
            credential=ta_credential)
    return text_analytics_client

client = authenticate_client()

In [None]:
def Sentiment_Azure(tweet):
  """Return the Azure sentiment label of every text in `tweet`.

  The texts are sent one at a time (as a length-1 slice) through the
  notebook-level `client`, with opinion mining switched off; the `sentiment`
  attribute of the single returned document is collected for each.
  """
  return [
      client.analyze_sentiment(tweet[pos:pos+1], show_opinion_mining=False)[0].sentiment
      for pos in range(len(tweet))
  ]

#### We will use just 500 random tweets to test the Microsoft Azure Text Analytics sentiment model, because the pricing tier used is the free one, which is limited to 5000 calls.  

In [None]:
# Draw 500 distinct tweet positions with a fixed seed: the Azure sample is then
# reproducible across runs and no tweet is sent to the service twice.
azure_sample_rng = np.random.default_rng(0)
Random_Tweets = azure_sample_rng.choice(len(features_clear), size=500, replace=False)
len(features_clear[Random_Tweets]), len(labels[Random_Tweets])

(500, 500)

## Sentiment Analysis Microsoft Azure without cleaning tweets

In [None]:
SentimentAzure = Sentiment_Azure(features[Random_Tweets])

print(accuracy_score(labels[Random_Tweets], SentimentAzure))

0.582


## Sentiment Analysis Microsoft Azure with cleaning tweets

In [None]:
SentimentAzure = Sentiment_Azure(features_clear[Random_Tweets])

print(accuracy_score(labels[Random_Tweets], SentimentAzure))

0.518


# Training my own NLP Model with Sklearn algorithm

### Vectorization

We convert the data into the numeric form.

In [None]:
# One vectorizer per text variant. Refitting a single instance would discard
# the raw-text vocabulary that X1 was built with, leaving no way to transform
# new raw text for a model trained on X1.
# `vectorizer` stays the one fitted on the cleaned tweets.
vectorizer_raw = TfidfVectorizer()
vectorizer = TfidfVectorizer() #max_features=2500, min_df=7, max_df=0.8,stop_words=stopwords.words('english')

#without cleaning tweets
X1 = vectorizer_raw.fit_transform(features).toarray()

#with cleaning tweets
X2 = vectorizer.fit_transform(features_clear).toarray()

### Dividing Data into Training and Test Sets

The training set will be used to train the algorithm while the test set will be used to evaluate the performance of the machine learning model.

In [None]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, labels, test_size=0.2, random_state=0)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, labels, test_size=0.2, random_state=0)

## Random Forest Classifier without cleaning tweets

In [None]:
text_classifier1 = RandomForestClassifier(n_estimators=200, random_state=0)
text_classifier1.fit(X_train1, y_train1)
predictions = text_classifier1.predict(X_test1)
print(accuracy_score(y_test1, predictions))

0.7526919069121223


## Random Forest Classifier with cleaning tweets

In [None]:
text_classifier2 = RandomForestClassifier(n_estimators=200, random_state=0)
text_classifier2.fit(X_train2, y_train2)
predictions = text_classifier2.predict(X_test2)
print(accuracy_score(y_test2, predictions))

0.7572073636679403


The model trained on cleaned tweets performs better, so the following models are trained on cleaned tweets.

## Multi Layer Perceptron Classifier

In [None]:
# random_state fixes the network's random initialisation so the reported
# accuracy is reproducible, like the other seeded classifiers in this notebook.
text_classifier3 = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', alpha=0.0001, random_state=0)
text_classifier3.fit(X_train2, y_train2)
predictions = text_classifier3.predict(X_test2)
print(accuracy_score(y_test2, predictions))

0.7127474817645015


## Linear Support Vector Classifier

In [None]:
text_classifier4 = LinearSVC(random_state=0, tol=1e-6)
text_classifier4.fit(X_train2, y_train2)
predictions = text_classifier4.predict(X_test2)
print(accuracy_score(y_test2, predictions))

0.7669329628343174


## AdaBoost Classifier

In [None]:
text_classifier5 = AdaBoostClassifier(n_estimators=100, random_state=0)
text_classifier5.fit(X_train2, y_train2)
predictions = text_classifier5.predict(X_test2)
print(accuracy_score(y_test2, predictions))

0.7266411948593261


### Testing the model

In [None]:
test_features = limpiar_texto(['The trip was normal regular and common'])
test_feature_vec = vectorizer.transform(test_features).toarray()
predictions = text_classifier4.predict(test_feature_vec)
predictions[0]

'neutral'

In [None]:
test_features = limpiar_texto(['it wrongly loaded my return date'])
test_feature_vec = vectorizer.transform(test_features).toarray()
predictions = text_classifier4.predict(test_feature_vec)
predictions[0]

'negative'

In [None]:
test_features = limpiar_texto(['The trip was very great'])
test_feature_vec = vectorizer.transform(test_features).toarray()
predictions = text_classifier4.predict(test_feature_vec)
predictions[0]

'positive'

# Model NLP Hugging Face Transformers roberta

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 8.9 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 59.7 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 35.0 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1


In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
from scipy.special import softmax
import csv
import urllib.request

In [None]:
def preprocess(text):
    """Swap usernames and links in `text` for fixed placeholders.

    The text is split on single spaces. A piece that starts with '@' and is
    longer than one character becomes '@user'; otherwise a piece that starts
    with 'http' becomes 'http'; anything else is kept unchanged. The pieces
    are joined back with single spaces.
    """
    def placeholder(piece):
        if piece.startswith('@') and len(piece) > 1:
            return '@user'
        if piece.startswith('http'):
            return 'http'
        return piece

    return " ".join(placeholder(piece) for piece in text.split(" "))

# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Download the class-index -> label-name mapping of the task.
# It is stored as `roberta_labels` so that it does not overwrite `labels`,
# which holds the ground-truth sentiment of every tweet and is what the
# accuracy computations compare against.
# NOTE(review): code that turns a model output index into a name must read
# `roberta_labels`, not `labels`.
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
# Each non-empty mapping row is "<index>\t<name>"; keep the name column.
roberta_labels = [row[1] for row in csvreader if len(row) > 1]



Downloading:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [None]:
def Tweeter_Roberta(tweets, model, label_names=("negative", "neutral", "positive")):
  """Predict a sentiment label for every tweet with a sequence-classification model.

  Parameters
  ----------
  tweets : iterable of str
      Texts to classify; each one goes through `preprocess` and the
      notebook-level `tokenizer` before being fed to the model.
  model : callable
      Called as ``model(**encoded_input)``; element ``[0][0]`` of its result
      is read as the per-class scores of the tweet.
  label_names : sequence of str, optional
      Name of each class index. The default is the order of the TweetEval
      "sentiment" task (0 negative, 1 neutral, 2 positive); pass the matching
      names when another task/model is used.

  Returns
  -------
  numpy.ndarray
      The predicted label name of each tweet, in input order.
  """
  # The names deliberately do NOT come from the global `labels`: in this
  # notebook that variable holds the ground-truth sentiment of every tweet, so
  # indexing it with a class index would return unrelated tweets' labels.
  sentiments = []
  for tweet in tweets:
    encoded_input = tokenizer(preprocess(tweet), return_tensors='pt')
    scores = model(**encoded_input)[0][0].detach().numpy()
    # softmax is monotonic, so the winning class can be read from the raw scores.
    sentiments.append(label_names[int(np.argmax(scores))])
  return np.array(sentiments)

## Sentiment Analysis Hugging Face Tweeter Roberta without cleaning tweets

In [None]:
# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)

SentimentRoberta = Tweeter_Roberta(features, model)

print(accuracy_score(labels, SentimentRoberta))

Downloading:   0%|          | 0.00/499M [00:00<?, ?B/s]

0.07851584213451918


## Sentiment Analysis Hugging Face Tweeter Roberta with cleaning tweets

In [None]:
# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)

SentimentRoberta = Tweeter_Roberta(features_clear, model)

print(accuracy_score(labels, SentimentRoberta))

0.07698721511951084


# Conclusion 

After looking at the accuracy_score results of the different NLP models (TextBlob, Microsoft Azure Text Analytics, my own scikit-learn models and the Hugging Face Transformers RoBERTa model), the best model was the Linear Support Vector Classifier with $Accuracy\_Score=0.77$  