### Text Vectorization

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
## Tokenization
text = "I am not a sentimental person but I believe in the utility of sentiment analysis"
tokens = word_tokenize(text)
print(tokens)

['I', 'am', 'not', 'a', 'sentimental', 'person', 'but', 'I', 'believe', 'in', 'the', 'utility', 'of', 'sentiment', 'analysis']


In [4]:
## Lemmatization
lemmatizer = WordNetLemmatizer()
tokens=[lemmatizer.lemmatize(word) for word in tokens]
tokens

['I',
 'am',
 'not',
 'a',
 'sentimental',
 'person',
 'but',
 'I',
 'believe',
 'in',
 'the',
 'utility',
 'of',
 'sentiment',
 'analysis']

In [5]:
## Stemming
okens=word_tokenize(text.lower())
ps = PorterStemmer()
tokens=[ps.stem(word) for word in tokens]
print(tokens)

['I', 'am', 'not', 'a', 'sentiment', 'person', 'but', 'I', 'believ', 'in', 'the', 'util', 'of', 'sentiment', 'analysi']


In [7]:
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
tokens_new = [j for j in tokens if j not in stopwords]
tokens_new

['I', 'sentiment', 'person', 'I', 'believ', 'util', 'sentiment', 'analysi']

### Lexicon Based Approach
* Relies on an underlying sentiment (or opinion) lexion
* A sentiment lexicon is a list of lexical features (e.g. words) which are generally labeled according to their semantic orientation as either positive or negative
* Manually creating and validating such lists of opinion-bearing features, while being among the most robust methods, is also one of the most time-consuming
* Much of the applied research leveraging sentiment analysis relies heavily on preexisting manually constructed lexicons
    * LICW
    * ANEW
    * SentiWordNet
    * SenticNet
    * VADER
* `Drawback` of using Lexicons to perform sentiment analysis includes:
    * Tend to suffer from inability to process acronyms, initialism, emotioncons, slangs etc. and therefore perform poorly on social media text data
    * They are unable to account for sentiment intensity
    * Unable to process sarcasm

### VADER (Valence Aware Dictionary and sEntiment Reasoner)
`VADER is a lexison developed by Georgia Tech CS Department, which addresses some of the above shortcomings`
* Has incorporated popular slangs (LOL, OMG, ROFL, Nah, Meh, etc.) and Emoticons (🙂😐🙁, etc.)
* Features are rates on a scale of -4 (extremely negative) to +4 (extremely positive) thereby factoring in sentiment intensity
* Has had very successful tests on social media data

#### `Criteria`
1. positive sentiment: compound score >= 0.5
2. neutral sentiment: -0.5 < compound score < 0.5
3. negative sentiment: compound score <= -0.5

In [11]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [13]:
analyser = SentimentIntensityAnalyzer()
analyser.polarity_scores("This is a good course")

{'neg': 0.0, 'neu': 0.58, 'pos': 0.42, 'compound': 0.4404}

In [14]:
analyser.polarity_scores("This is an awesome course") # degree modifier

{'neg': 0.0, 'neu': 0.494, 'pos': 0.506, 'compound': 0.6249}

In [15]:
analyser.polarity_scores("The instructor is so cool")

{'neg': 0.0, 'neu': 0.572, 'pos': 0.428, 'compound': 0.4572}

In [16]:
analyser.polarity_scores("The instructor is so cool!!") # exclaimataion changes score
analyser.polarity_scores("The instructor is so COOL!!") # Capitalization changes score

{'neg': 0.0, 'neu': 0.471, 'pos': 0.529, 'compound': 0.6696}

In [17]:
analyser.polarity_scores("Machine learning makes me :)") #emoticons

{'neg': 0.0, 'neu': 0.571, 'pos': 0.429, 'compound': 0.4588}

In [18]:
analyser.polarity_scores("His antics had me ROFL")
analyser.polarity_scores("The movie SUX") #Slangs

{'neg': 0.618, 'neu': 0.382, 'pos': 0.0, 'compound': -0.4995}

### TextBlob
`Using pre-built lexicon WordNet`

In [22]:
from textblob import TextBlob

In [23]:
TextBlob("His").sentiment

Sentiment(polarity=0.0, subjectivity=0.0)

In [24]:
TextBlob("remarkable").sentiment

Sentiment(polarity=0.75, subjectivity=0.75)

In [25]:
TextBlob("His remarkable work ethic impressed me").sentiment

Sentiment(polarity=0.875, subjectivity=0.875)

### Building a Sentiment Analyzer using VADER

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [22]:
url_list = []
date_time = []
news_text = []
headlines = [] 

for i in range(1,3): #parameters of range function correspond to page numbers in the website with news listings
    #get the list of unique urls in the page
    url = 'https://oilprice.com/Energy/Crude-Oil/Page-{}.html'.format(i)
    request = requests.get(url)
    soup = BeautifulSoup(request.text, "html.parser")
    for links in soup.find_all('div', {'class': 'categoryArticle'}):
        for info in links.find_all('a'):
            if info.get('href') not in url_list:
                url_list.append(info.get('href'))

In [11]:
url_list[0]

'https://oilprice.com/Energy/Crude-Oil/US-Oil-Defies-Odds-Races-Towards-Annual-Production-Record.html'

In [23]:
for www in url_list:
    #access each url
    headlines.append(www.split("/")[-1].replace('-',' ').replace('.html', ''))
    request = requests.get(www)
    soup = BeautifulSoup(request.text, "html.parser")
    
    #store date and time of publication of the article
    for dates in soup.find_all('span', {'class': 'article_byline'}):
        date_time.append(dates.text.split('-')[-1])
    
    #store the text of the news
    temp = []
    for news in soup.find_all('p'):
            temp.append(news.text)
    
    #identify the last line of the news article
    for last_sentence in reversed(temp):
        if last_sentence.split(" ")[0]=="By" and last_sentence.split(" ")[-1]=="Oilprice.com":
            break
        elif last_sentence.split(" ")[0]=="By":
            break
    
    #prune non news related text from the scraped data to create the news text
    joined_text = ' '.join(temp[temp.index("More Info")+1:temp.index(last_sentence)])
    news_text.append(joined_text)

In [24]:
# save news text along with the news headline in a dataframe      
news_df = pd.DataFrame({ 'Date' : date_time,
                         'Headline': headlines,
                         'News': news_text,
                       })

In [25]:
# use VADER to perform sentiment analysis on stored news articles
analyser = SentimentIntensityAnalyzer()

def comp_score(text):
    return analyser.polarity_scores(text)["compound"]   
  
news_df["sentiment"] = news_df["News"].apply(comp_score)

In [26]:
news_df.head()

Unnamed: 0,Date,Headline,News,sentiment
0,"Oct 13, 2023, 6:00 PM CDT",Kuwait And Saudi Arabia Team Up For Massive Ga...,As other Middle Eastern states look to diversi...,0.9971
1,"Oct 12, 2023, 3:00 PM CDT",US Oil Defies Odds Races Towards Annual Produc...,One of my 2023 energy sector predictions was t...,0.3877
2,"Oct 12, 2023, 10:08 AM CDT",Oil Moves Down On Massive Inventory Build,Crude oil prices saw a small dip on Thursday m...,-0.8868
3,"Oct 11, 2023, 7:00 PM CDT",Shift In US Policy On Iran Oil Could Swing Glo...,"Back in August, we reported that Iran oil expo...",-0.8818
4,"Oct 11, 2023, 6:00 PM CDT",Shale Consolidation Could Put A Permanent Lid ...,"Back in April, the Wall Street Journal reporte...",0.9938


In [28]:
news_df[['Headline', 'News', 'sentiment']].copy().to_csv('news_df.csv', index= False)

### TF-IDF (Term Frequency - Inverse Document Frequency)
Assign weighting factor used to get the important features from the documents

**TF** = $\frac{number\ of\ times\ the\ term\ t\ appear\ in\ the\ doc}{total\ number\ of\ words\ in\ the\ doc}$

**IDF** = $log_{e}\big(\frac{total\ number\ of\ documents}{number\ of\ documents\ with\ the\ term\ t\ in\ it}\big)$

**TF-IDF** = $TF\ \times\ IDF$

* TF-IDF is used to improve the feature set before feeding into ML model
* It penalized common words and reduces their weights in the feature matrix

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import GaussianNB
import pandas as pd
import pickle

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [7]:
data.head()

Unnamed: 0,Headline,News,sentiment
0,Kuwait And Saudi Arabia Team Up For Massive Ga...,As other Middle Eastern states look to diversi...,0.9971
1,US Oil Defies Odds Races Towards Annual Produc...,One of my 2023 energy sector predictions was t...,0.3877
2,Oil Moves Down On Massive Inventory Build,Crude oil prices saw a small dip on Thursday m...,-0.8868
3,Shift In US Policy On Iran Oil Could Swing Glo...,"Back in August, we reported that Iran oil expo...",-0.8818
4,Shale Consolidation Could Put A Permanent Lid ...,"Back in April, the Wall Street Journal reporte...",0.9938


In [8]:
def label_sentiment(x):
    if x > 0.5:
        label = 'Positive'
    elif x < -0.5:
        label = 'Negative'
    else:
        label = 'Neutral'
    return label

In [11]:
data = pd.read_csv("news_df.csv")
data = data[~data['News'].isna()].copy()
data['label'] = data.apply(lambda x: label_sentiment(x['sentiment']), axis = 1)
X = data.iloc[:,1] 
X.head()

0    As other Middle Eastern states look to diversi...
1    One of my 2023 energy sector predictions was t...
2    Crude oil prices saw a small dip on Thursday m...
3    Back in August, we reported that Iran oil expo...
4    Back in April, the Wall Street Journal reporte...
Name: News, dtype: object

In [34]:
vectorizer = CountVectorizer(stop_words = 'english')
X_vec = vectorizer.fit_transform(X)

In [29]:
print(X_vec[0])
## (0, 2290)  2. <-- indicates the word "middle" (2290 is the token for "middle") appearing in article 1 twice

  (0, 2290)	2
  (0, 1268)	1
  (0, 3439)	1
  (0, 2170)	1
  (0, 1195)	1
  (0, 1327)	2
  (0, 2325)	1
  (0, 2059)	20
  (0, 408)	1
  (0, 1668)	1
  (0, 3646)	1
  (0, 2470)	16
  (0, 1631)	13
  (0, 362)	1
  (0, 2060)	1
  (0, 1674)	4
  (0, 387)	1
  (0, 326)	1
  (0, 2204)	2
  (0, 2486)	6
  (0, 2772)	3
  (0, 2242)	1
  (0, 609)	2
  (0, 2777)	11
  (0, 816)	2
  :	:
  (0, 570)	1
  (0, 440)	1
  (0, 2832)	1
  (0, 1117)	1
  (0, 3914)	1
  (0, 3821)	1
  (0, 1758)	1
  (0, 569)	1
  (0, 1960)	1
  (0, 1750)	1
  (0, 1622)	1
  (0, 1899)	2
  (0, 2396)	1
  (0, 1957)	1
  (0, 1193)	1
  (0, 3944)	1
  (0, 2707)	1
  (0, 1419)	1
  (0, 1390)	1
  (0, 564)	1
  (0, 2169)	1
  (0, 2970)	1
  (0, 3591)	1
  (0, 1034)	1
  (0, 294)	2


In [30]:
# print(vectorizer.vocabulary_)

In [35]:
pickle.dump(vectorizer, open("vectorizer_crude_oil", 'wb'))  # Save vectorizor for reuse
X_vec = X_vec.todense() # Convert sparse matrix into dense matrix
X_vec

matrix([[0, 3, 0, ..., 0, 1, 0],
        [0, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 5, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [36]:
X_vec.shape

(39, 3978)

In [37]:
# Transform data by applying term frequency inverse document frequency (TF-IDF) 
tfidf = TfidfTransformer() #by default applies "l2" normalization
X_tfidf = tfidf.fit_transform(X_vec)
X_tfidf = X_tfidf.todense()

In [43]:
# Extract the news body and labels for training the classifier
X_train = X_tfidf[:,:]
Y_train = data.iloc[:, 3]

# Train the NB classifier
clf = GaussianNB().fit(X_train, Y_train) 
pickle.dump(clf, open("nb_clf_crude_oil", 'wb'))

In [45]:
#############Importing trained classifier and fitted vectorizer################
nb_clf = pickle.load(open("nb_clf_crude_oil", 'rb'))
vectorizer = pickle.load(open("vectorizer_crude_oil", 'rb'))

nb_clf

GaussianNB()

In [48]:
##############Predict sentiment using the trained classifier###################
# Import test data set
data_pred = pd.read_csv("news_df.csv")  ## Need a new test set
X_test = data_pred[~data_pred['News'].isna()].iloc[:,1] # extract column with news articl
X_vec_test = vectorizer.transform(X_test) #don't use fit_transform here because the model is already fitted
X_vec_test = X_vec_test.todense() #convert sparse matrix to dense

In [50]:
# Transform data by applying term frequency inverse document frequency (TF-IDF) 
tfidf = TfidfTransformer() #by default applies "l2" normalization
X_tfidf_test = tfidf.fit_transform(X_vec_test)
X_tfidf_test = X_tfidf_test.todense()


# Predict the sentiment values
y_pred = nb_clf.predict(X_tfidf_test)
y_pred

array(['Positive', 'Neutral', 'Negative', 'Negative', 'Positive',
       'Positive', 'Negative', 'Negative', 'Negative', 'Negative',
       'Positive', 'Negative', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Negative',
       'Positive', 'Negative', 'Negative', 'Negative', 'Positive',
       'Neutral', 'Negative', 'Negative', 'Negative', 'Positive',
       'Negative', 'Positive', 'Positive', 'Positive', 'Positive',
       'Negative', 'Negative', 'Positive', 'Negative'], dtype='<U8')