# Model using spaCy
## In conjunction with textBlob, Pandas, and BeautifulSoup

If you are running this for the first time you may need to use the following commands before continuing:

In [20]:
pip3 install pandas==1.3.5
pip3 install spacy==3.2.0
pip3 install spacytextblob
python3 -m spacy download en_core_web_sm

SyntaxError: invalid syntax (145841143.py, line 1)

Spacy: Used for NLP and has the machine learning module
SpacyTextBlob: Used for the sentiment analysis
Pandas: Stores the data as a dataframe table
BeautifulSoup: Used for web scraping
Requests: Makes the connection to the URL

In [13]:
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [14]:
# Loading our two pipelines
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('spacytextblob')

<spacytextblob.spacytextblob.SpacyTextBlob at 0x174f791a290>

In [15]:
# Using panda to load in our .csv file
df = pd.read_csv("urls.csv") # File we are checking
urls = df["Address"].tolist() # Column name we are checking
url_sent_score = []
url_sent_label = []
url_subj_score = []
url_subj_label = []
total_pos = []
total_neg = []

In [16]:
# Loops through our URLS and scraps the data
for count, x in enumerate(urls):
    url = x

    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
    res = requests.get(url,headers=headers)
    html_page = res.text

    soup = BeautifulSoup(html_page, 'html.parser')
    for script in soup(["script", "style","meta","label","header","footer"]):
        script.decompose()
    page_text = (soup.get_text()).lower()
    page_text = page_text.strip().replace("  ","")
    page_text = "".join([s for s in page_text.splitlines(True) if s.strip("\r\n")])
    
    # Start the sentiment analysis now
    doc = nlp(page_text)
    sentiment = doc._.blob.polarity
    sentiment = round(sentiment,2)
    subjectivity = doc._.blob.subjectivity
    subjectivity = round(subjectivity,2)
    
    # Gives positive or negative label
    if sentiment >= -0.1 and sentiment <= 0.1:
        sent_label = "Neutral"
    elif sentiment > 0.1:
        sent_label = "Positive"
    elif sentiment < -0.1:
        sent_label = "Negative"
    
    url_sent_label.append(sent_label)
    url_sent_score.append(sentiment)
    url_subj_score.append(subjectivity)
    
    positive_words = []
    negative_words = []

    for x in doc._.blob.sentiment_assessments.assessments:
      if x[1] > 0:
        positive_words.append(x[0][0])
      elif x[1] < 0:
        negative_words.append(x[0][0])
      else:
        pass

    total_pos.append(', '.join(set(positive_words)))
    total_neg.append(', '.join(set(negative_words)))
    
df["Sentiment Score"] = url_sent_score
df["Sentiment Label"] = url_sent_label
df["Subjectivity Score"] = url_subj_score
df["Positive Words"] = total_pos
df["Negative Words"] = total_neg

#optional export to CSV
df.to_csv("sentiment.csv")
df

Unnamed: 0,Address,Sentiment Score,Sentiment Label,Subjectivity Score,Positive Words,Negative Words
0,https://www.thegatewaypundit.com/2022/05/world...,0.07,Neutral,0.35,"new, elementary, not, supporting, confident, s...","single, needless, uncomfortable, dead, other, ..."
1,https://gettr.com/post/ptt4ta7c84,0.3,Positive,0.38,"real, best",
2,https://komonews.com/news/coronavirus/if-covid...,-0.05,Neutral,0.55,,military
3,https://www.lewrockwell.com/2018/11/no_author/...,0.09,Neutral,0.41,"new, most, large, greatest, many, young, old, ...","grim, military, negative, late, frightening, s..."
4,https://nation.com.pk/08-Apr-2021/russia-offer...,0.09,Neutral,0.26,"new, full, strong, many, economic, special, ed...","foreign, actively, mainly, military, past"
5,https://barrie.ctvnews.ca/cfb-borden-based-mil...,0.02,Neutral,0.4,"new, full, large, overwhelmingly, many, econom...","cold, differently, very, military, angry, wet,..."
6,https://www.reuters.com/world/us/us-army-disch...,0.08,Neutral,0.34,"new, win, first, latest, main, filled, more","center, roughly, other, military, active, least"
7,https://www.dailysabah.com/world/europe/sweden...,0.01,Neutral,0.26,"new, first, directly, direct, light, kind","foreign, previously, orthodox, long, military,..."
8,https://www.military.com/daily-news/2022/11/01...,0.08,Neutral,0.42,"most, new, full, developed, many, latest, larg...","firm, military, complicated, troubled, arrest,..."
9,https://roundingtheearth.substack.com/p/defini...,0.1,Neutral,0.41,"most, full, brave, strong, large, greatest, ma...","previously, hidden, single, very, grey, milita..."


Running into issues when the webpage can't be reached. For example the below url may have been taken down or something and is no longer reachable. Need to find a solution to this:
https://adversereactionreport.com/vaccine-injured/779-athlete-cardiac-arrests-serious-issues-500-dead-after-jab/

Error Here: SSLError: HTTPSConnectionPool(host='adversereactionreport.com', port=443): Max retries exceeded with url: /vaccine-injured/779-athlete-cardiac-arrests-serious-issues-500-dead-after-jab/ (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:997)')))

Code Below Fixed it: Also added a more precise labeling system. The sentiment score is based on the average of THIS data set instead of 0.

In [28]:
# Using panda to load in our .csv file
df = pd.read_csv("urls130.csv") # File we are checking
urls = df["Address"].tolist() # Column name we are checking
url_sent_score = []
url_sent_label = []
url_subj_score = []
url_subj_label = []
total_pos = []
total_neg = []

In [29]:
# Loops through our URLS and scraps the data
for count, x in enumerate(urls):
    if(count % 10 == 0):
        print(str(count))
    try:
        url = x

        headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
        res = requests.get(url,headers=headers)
        html_page = res.text

        soup = BeautifulSoup(html_page, 'html.parser')
        for script in soup(["script", "style","meta","label","header","footer"]):
            script.decompose()
        page_text = (soup.get_text()).lower()
        page_text = page_text.strip().replace("  ","")
        page_text = "".join([s for s in page_text.splitlines(True) if s.strip("\r\n")])

        # Start the sentiment analysis now
        doc = nlp(page_text)
        sentiment = doc._.blob.polarity
        sentiment = round(sentiment,2)
        subjectivity = doc._.blob.subjectivity
        subjectivity = round(subjectivity,2)

        # Gives positive or negative label
        if sentiment >= 0.033 and sentiment <= 0.043:
            sent_label = "Neutral"
        elif sentiment > 0.043 and sentiment < 0.143:
            sent_label = "Neutral Positive"
        elif sentiment > 0.143:
            sent_label = "Positive"
        elif sentiment < 0.033 and sentiment > -0.062:
            sent_label = "Neutral Negative"
        elif sentiment < -0.062:
            sent_label = "Negative"

        url_sent_label.append(sent_label)
        url_sent_score.append(sentiment)
        url_subj_score.append(subjectivity)

        positive_words = []
        negative_words = []

        for x in doc._.blob.sentiment_assessments.assessments:
          if x[1] > 0:
            positive_words.append(x[0][0])
          elif x[1] < 0:
            negative_words.append(x[0][0])
          else:
            pass

        total_pos.append(', '.join(set(positive_words)))
        total_neg.append(', '.join(set(negative_words)))
    except:
        url_sent_label.append("Error")
        url_sent_score.append(0.0)
        url_subj_score.append(0.0)

        positive_words = []
        negative_words = []

        total_pos.append(', '.join(set(positive_words)))
        total_neg.append(', '.join(set(negative_words)))

print("The average sentiment score was: " + str(sum(url_sent_score) / len(url_sent_score)))
print("The average subjectivity score was: " + str(sum(url_subj_score) / len(url_subj_score)))

df["Sentiment Score"] = url_sent_score
df["Sentiment Label"] = url_sent_label
df["Subjectivity Score"] = url_subj_score
df["Positive Words"] = total_pos
df["Negative Words"] = total_neg

#optional export to CSV
df.to_csv("sentiment130.csv")
df

0
10
20
30
40
50
60
70
80
90
100
110
120
The average sentiment score was: 0.03807692307692308
The average subjectivity score was: 0.34446153846153854


Unnamed: 0,Address,Sentiment Score,Sentiment Label,Subjectivity Score,Positive Words,Negative Words
0,https://www.thegatewaypundit.com/2022/05/world...,0.07,Neutral Positive,0.35,"new, elementary, not, supporting, confident, s...","single, needless, uncomfortable, dead, other, ..."
1,https://gettr.com/post/ptt4ta7c84,0.30,Positive,0.38,"real, best",
2,https://komonews.com/news/coronavirus/if-covid...,-0.05,Neutral Negative,0.55,,military
3,https://www.lewrockwell.com/2018/11/no_author/...,0.09,Neutral Positive,0.41,"new, most, large, greatest, many, young, old, ...","grim, military, negative, late, frightening, s..."
4,https://nation.com.pk/08-Apr-2021/russia-offer...,0.09,Neutral Positive,0.26,"new, full, strong, many, economic, special, ed...","foreign, actively, mainly, military, past"
...,...,...,...,...,...,...
125,https://youtu.be/9jMONZMuS2U,0.07,Neutral Positive,0.23,new,
126,https://www.militaryreligiousfreedom.org/2022/...,0.05,Neutral Positive,0.42,"most, new, large, many, developed, very, old, ...","forced, impossible, military, insane, late, no..."
127,https://www.scribd.com/document/575750142/ECF-...,-0.03,Neutral Negative,0.68,"new, :), *), =), useful",":(, =(, not"
128,https://www.theepochtimes.com/investigative-re...,0.02,Neutral Negative,0.25,"new, economic, first, free, real, largely, mor...","failed, military, allegedly"


# Trying to Find a way to only run the first X amount
Trying to find the best way to run sentiment analysis on say the first 100 vs the entire block of URL's. In theory it should be easy, just need to learn abound Pandas more as we are changing to original df.

In [30]:
# Using panda to load in our .csv file
df = pd.read_csv("milVaccine.csv") # File we are checking
urls = df["original_url"].tolist() # Column name we are checking
url_sent_score = []
url_sent_label = []
url_subj_score = []
url_subj_label = []
total_pos = []
total_neg = []

In [31]:
# Loops through our URLS and scraps the data
breakAt = 100
for count, x in enumerate(urls):
    if(count % 10 == 0):
        print(str(count) + "/" + str(breakAt))
    if count != breakAt:
        try:
            url = x

            headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
            res = requests.get(url,headers=headers)
            html_page = res.text

            soup = BeautifulSoup(html_page, 'html.parser')
            for script in soup(["script", "style","meta","label","header","footer"]):
                script.decompose()
            page_text = (soup.get_text()).lower()
            page_text = page_text.strip().replace("  ","")
            page_text = "".join([s for s in page_text.splitlines(True) if s.strip("\r\n")])

            # Start the sentiment analysis now
            doc = nlp(page_text)
            sentiment = doc._.blob.polarity
            sentiment = round(sentiment,2)
            subjectivity = doc._.blob.subjectivity
            subjectivity = round(subjectivity,2)

            # Gives positive or negative label
            if sentiment >= -0.1 and sentiment <= 0.1:
                sent_label = "Neutral"
            elif sentiment > 0.1:
                sent_label = "Positive"
            elif sentiment < -0.1:
                sent_label = "Negative"

            url_sent_label.append(sent_label)
            url_sent_score.append(sentiment)
            url_subj_score.append(subjectivity)

            positive_words = []
            negative_words = []

            for x in doc._.blob.sentiment_assessments.assessments:
              if x[1] > 0:
                positive_words.append(x[0][0])
              elif x[1] < 0:
                negative_words.append(x[0][0])
              else:
                pass

            total_pos.append(', '.join(set(positive_words)))
            total_neg.append(', '.join(set(negative_words)))
        except:
            url_sent_label.append("Error")
            url_sent_score.append(0.0)
            url_subj_score.append(0.0)

            positive_words = []
            negative_words = []

            total_pos.append(', '.join(set(positive_words)))
            total_neg.append(', '.join(set(negative_words)))
    else:
        break
    
df["Sentiment Score"] = url_sent_score
df["Sentiment Label"] = url_sent_label
df["Subjectivity Score"] = url_subj_score
df["Positive Words"] = total_pos
df["Negative Words"] = total_neg

#optional export to CSV
df.to_csv("sentiment.csv")
df

0/100
10/100
20/100
30/100
40/100
50/100
60/100
70/100
80/100
90/100
100/100


ValueError: Length of values (100) does not match length of index (58281)