In [25]:
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException

# Setup Chrome options (optional)
chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")

# Setup driver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

In [26]:
links = pd.read_excel('Input.xlsx')
links.shape

(170, 2)

In [27]:
links.head()

Unnamed: 0,URL_ID,URL
0,1,https://insights.blackcoffer.com/how-is-login-...
1,2,https://insights.blackcoffer.com/how-does-ai-h...
2,3,https://insights.blackcoffer.com/ai-and-its-im...
3,4,https://insights.blackcoffer.com/how-do-deep-l...
4,5,https://insights.blackcoffer.com/how-artificia...


In [28]:
def scrape_data(link):
    global driver
    try:
        driver.get(link)
        
        # Check if page actually loaded (404 pages often lack your target element)
        try:
            title = driver.find_element(By.XPATH, "//div[contains(@class,'td-post-content')]")
            driver.implicitly_wait(10)
            return title.text
        
        except NoSuchElementException:
            print("Skipping (content not found):", link)
            return None

    except (TimeoutException, WebDriverException):
        print("Skipping (page load error):", link)
        return None


In [29]:
## function to save the scrapped files
import os

def save_file(scrapdata):
    # Create folder if it doesn't exist
    os.makedirs("./Articles", exist_ok=True)
    
    for data in scrapdata:
        name = str(data['URL_ID']) + ".txt"
        filepath = os.path.join("./Articles", name)
        
        with open(filepath, 'w+', encoding='utf-8') as f:
            f.write(data['TEXT'])


In [30]:
# Perform scraping
scrapdata = []

for index, row in links.iterrows():
    text = scrape_data(row['URL'])
    
    if text is None:
        continue   # Skip 404 or failed pages
    
    item = {
        'URL_ID': row['URL_ID'],
        'TEXT': text
    }
    scrapdata.append(item)

save_file(scrapdata)

Skipping (content not found): https://insights.blackcoffer.com/how-do-deep-learning-models-predict-old-and-new-drugs-that-are-successfully-treated-in-healthcare/
Skipping (content not found): https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/
Skipping (content not found): https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/
Skipping (content not found): https://insights.blackcoffer.com/ensuring-growth-through-insurance-technology/
Skipping (content not found): https://insights.blackcoffer.com/big-data-analytics-solving-problems-banking-and-finance-industry/


In [31]:
#making a dataframe of scrapped data

from  os import listdir
path = 'file:///c:/Users/SuryaB/Desktop/sentimental-analysis-using-web-scrapped-data/Articles/'
files = listdir('./Articles/')


In [38]:
rows = []

for file in files:
    with open('./Articles/' + file, 'r', encoding='utf-8') as f:
        text = f.read()
    
    # Extract number before ".txt"
    sr = int(os.path.splitext(file)[0])   # <-- the correct way
    
    rows.append({'File': sr, 'text': text})

df = pd.DataFrame(rows)

In [40]:
df = df.sort_values("File")
df
df.to_csv('content.csv', index=None)

In [41]:
df = pd.read_csv('content.csv')
df.head()

Unnamed: 0,File,text
0,1,When people hear AI they often think about sen...
1,2,"With increasing computing power and more data,..."
2,3,If you were a fan of the 90’s film Clueless ba...
3,5,"From the stone age to the modern world, from h..."
4,6,Artificial intelligence (AI) is the developmen...


In [42]:
# Data Preprocessing
df['Number of sentences'] = df['text'].apply(lambda x: len(str(x).split('. ')))

In [43]:
def short_forms():    
    return {
        "cant":"can not",
        "dont":"do not",
        "wont":"will not",
        "ain't":"is not",
        "amn't":"am not",
        "aren't":"are not",
        "can't":"cannot",
        "'cause":"because",
        "couldn't":"could not",
        "couldn't've":"could not have",
        "could've":"could have",
        "daren't":"dare not",
        "daresn't":"dare not",
        "dasn't":"dare not",
        "didn't":"did not",
        "doesn't":"does not",
        "don't":"do not",
        "e'er":"ever",
        "em":"them",
        "everyone's":"everyone is",
        "finna":"fixing to",
        "gimme":"give me",
        "gonna":"going to",
        "gon't":"go not",
        "gotta":"got to",
        "hadn't":"had not",
        "hasn't":"has not",
        "haven't":"have not",
        "he'd":"he would",
        "he'll":"he will",
        "he's":"he is",
        "he've":"he have",
        "how'd":"how would",
        "how'll":"how will",
        "how're":"how are",
        "how's":"how is",
        "I'd":"I would",
        "I'll":"I will",
        "I'm":"I am",
        "I'm'a":"I am about to",
        "I'm'o":"I am going to",
        "isn't":"is not",
        "it'd":"it would",
        "it'll":"it will",
        "it's":"it is",
        "I've":"I have",
        "kinda":"kind of",
        "let's":"let us",
        "mayn't":"may not",
        "may've":"may have",
        "mightn't":"might not",
        "might've":"might have",
        "mustn't":"must not",
        "mustn't've":"must not have",
        "must've":"must have",
        "needn't":"need not",
        "ne'er":"never",
        "o'":"of",
        "o'er":"over",
        "ol'":"old",
        "oughtn't":"ought not",
        "shalln't":"shall not",
        "shan't":"shall not",
        "she'd":"she would",
        "she'll":"she will",
        "she's":"she is",
        "shouldn't":"should not",
        "shouldn't've":"should not have",
        "should've":"should have",
        "somebody's":"somebody is",
        "someone's":"someone is",
        "something's":"something is",
        "that'd":"that would",
        "that'll":"that will",
        "that're":"that are",
        "that's":"that is",
        "there'd":"there would",
        "there'll":"there will",
        "there're":"there are",
        "there's":"there is",
        "these're":"these are",
        "they'd":"they would",
        "they'll":"they will",
        "they're":"they are",
        "they've":"they have",
        "this's":"this is",
        "those're":"those are",
        "'tis":"it is",
        "'twas":"it was",
        "wanna":"want to",
        "wasn't":"was not",
        "we'd":"we would",
        "we'd've":"we would have",
        "we'll":"we will",
        "we're":"we are",
        "weren't":"were not",
        "we've":"we have",
        "what'd":"what did",
        "what'll":"what will",
        "what're":"what are",
        "what's":"what is",
        "what've":"what have",
        "when's":"when is",
        "where'd":"where did",
        "where're":"where are",
        "where's":"where is",
        "where've":"where have",
        "which's":"which is",
        "who'd":"who would",
        "who'd've":"who would have",
        "who'll":"who will",
        "who're":"who are",
        "who's":"who is",
        "who've":"who have",
        "why'd":"why did",
        "why're":"why are",
        "why's":"why is",
        "won't":"will not",
        "wouldn't":"would not",
        "would've":"would have",
        "y'all":"you all",
        "you'd":"you would",
        "you'll":"you will",
        "you're":"you are",
        "you've":"you have",
        "Whatcha":"What are you",
        "luv":"love",
        "sux":"sucks",
        "couldn't":"could not",
        "wouldn't":"would not",
        "shouldn't":"should not",
        "im":"i am"
        }

In [45]:
import re
import string

def normalization(data):
    data = str(data).lower()

    # URL removal – now actually valid
    data = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', data)

    # Hashtag cleanup
    data = re.sub(r'#([^\s]+)', r'\1', data)

    # Remove numbers
    data = ''.join([i for i in data if not i.isdigit()])

    # Replace punctuation with space
    for sym in string.punctuation:
        data = data.replace(sym, " ")

    # Replace fancy apostrophe
    data = data.replace("’", "'")

    # Short forms
    short_form = short_forms()
    words = data.split()
    converted = [short_form[word] if word in short_form else word for word in words]

    return " ".join(converted)


In [46]:
df

Unnamed: 0,File,text,Number of sentences
0,1,When people hear AI they often think about sen...,19
1,2,"With increasing computing power and more data,...",20
2,3,If you were a fan of the 90’s film Clueless ba...,39
3,5,"From the stone age to the modern world, from h...",44
4,6,Artificial intelligence (AI) is the developmen...,37
...,...,...,...
160,167,"CAN ACADEMIA, RESEARCHERS, DECISION MAKERS AND...",42
161,168,Introduction\nInventory planning is a fundamen...,29
162,169,1. The Problem\nInsider threat detection speci...,48
163,170,"If we talk in terms of our general life, Exfil...",22


In [49]:
df['text']= df['text'].apply(normalization)

In [50]:
df

Unnamed: 0,File,text,Number of sentences
0,1,when people hear ai they often think about sen...,19
1,2,with increasing computing power and more data ...,20
2,3,if you were a fan of the s film clueless back ...,39
3,5,from the stone age to the modern world from hu...,44
4,6,artificial intelligence ai is the development ...,37
...,...,...,...
160,167,can academia researchers decision makers and p...,42
161,168,introduction inventory planning is a fundament...,29
162,169,the problem insider threat detection specifica...,48
163,170,if we talk in terms of our general life exfilt...,22


In [51]:
df['text']= df['text'].apply(lambda x: x.lower())

In [52]:
df

Unnamed: 0,File,text,Number of sentences
0,1,when people hear ai they often think about sen...,19
1,2,with increasing computing power and more data ...,20
2,3,if you were a fan of the s film clueless back ...,39
3,5,from the stone age to the modern world from hu...,44
4,6,artificial intelligence ai is the development ...,37
...,...,...,...
160,167,can academia researchers decision makers and p...,42
161,168,introduction inventory planning is a fundament...,29
162,169,the problem insider threat detection specifica...,48
163,170,if we talk in terms of our general life exfilt...,22


In [53]:
df.head()

Unnamed: 0,File,text,Number of sentences
0,1,when people hear ai they often think about sen...,19
1,2,with increasing computing power and more data ...,20
2,3,if you were a fan of the s film clueless back ...,39
3,5,from the stone age to the modern world from hu...,44
4,6,artificial intelligence ai is the development ...,37


In [54]:
# Performing Sentiment Analysis


In [55]:
guide = pd.read_csv('LoughranMcDonald_MasterDictionary_2020.csv')
guide.head()

Unnamed: 0,Word,Seq_num,Word Count,Word Proportion,Average Proportion,Std Dev,Doc Count,Negative,Positive,Uncertainty,Litigious,Strong_Modal,Weak_Modal,Constraining,Complexity,Syllables,Source
0,AARDVARK,1,312,1.42205e-08,1.335201e-08,3.700747e-06,96,0,0,0,0,0,0,0,0,2,12of12inf
1,AARDVARKS,2,3,1.367356e-10,8.882163e-12,9.362849e-09,1,0,0,0,0,0,0,0,0,2,12of12inf
2,ABACI,3,9,4.102067e-10,1.200533e-10,5.359747e-08,7,0,0,0,0,0,0,0,0,3,12of12inf
3,ABACK,4,15,6.836779e-10,4.080549e-10,1.406914e-07,14,0,0,0,0,0,0,0,0,2,12of12inf
4,ABACUS,5,8009,3.650384e-07,3.798698e-07,3.523914e-05,1058,0,0,0,0,0,0,0,0,3,12of12inf


In [56]:
#Assigning +ve and -ve words based on the dictionary

In [57]:
guide.shape

(86531, 17)

In [62]:
positive = []
negative = []
uncertain = []

for index, row in guide.iterrows():
    if row['Positive'] > 0:
        positive.append(row['Word'].lower())
    if row['Negative'] > 0:
        negative.append(row['Word'].lower())
    if row['Uncertainty'] > 0:
        uncertain.append(row['Word'].lower())


In [63]:
df.head()

Unnamed: 0,File,text,Number of sentences
0,1,when people hear ai they often think about sen...,19
1,2,with increasing computing power and more data ...,20
2,3,if you were a fan of the s film clueless back ...,39
3,5,from the stone age to the modern world from hu...,44
4,6,artificial intelligence ai is the development ...,37


In [67]:
print(positive, "\n", negative, "\n", uncertain)


['able', 'abundance', 'abundant', 'acclaimed', 'accomplish', 'accomplished', 'accomplishes', 'accomplishing', 'accomplishment', 'accomplishments', 'achieve', 'achieved', 'achievement', 'achievements', 'achieves', 'achieving', 'adequately', 'advancement', 'advancements', 'advances', 'advancing', 'advantage', 'advantaged', 'advantageous', 'advantageously', 'advantages', 'alliance', 'alliances', 'assure', 'assured', 'assures', 'assuring', 'attain', 'attained', 'attaining', 'attainment', 'attainments', 'attains', 'attractive', 'attractiveness', 'beautiful', 'beautifully', 'beneficially', 'benefited', 'benefiting', 'benefitted', 'benefitting', 'best', 'better', 'bolstered', 'bolstering', 'bolsters', 'boom', 'booming', 'boost', 'boosted', 'breakthrough', 'breakthroughs', 'brilliant', 'charitable', 'collaborate', 'collaborated', 'collaborates', 'collaborating', 'collaboration', 'collaborations', 'collaborative', 'collaborator', 'collaborators', 'compliment', 'complimentary', 'complimented', '

In [68]:
def positivescore(text):
    score = 0
    global positive
    words = text.split()
    score = sum(1 for word in words if word in positive)
    return score

def negativescore(text):
    score = 0
    global negative
    words = text.split()
    score = sum(1 for word in words if word in negative)
    return score

In [69]:
df["Positive Score"] = df['text'].apply(positivescore)
df["Negative Score"] = df['text'].apply(negativescore)

In [70]:
df.head()

Unnamed: 0,File,text,Number of sentences,Positive Score,Negative Score
0,1,when people hear ai they often think about sen...,19,4,7
1,2,with increasing computing power and more data ...,20,8,6
2,3,if you were a fan of the s film clueless back ...,39,34,19
3,5,from the stone age to the modern world from hu...,44,21,17
4,6,artificial intelligence ai is the development ...,37,16,12


In [72]:
df['POLARITY SCORE'] = (df['Positive Score'] - df['Negative Score']) / (
    (df['Positive Score'] + df['Negative Score']) + 0.000001
)

df['WORD COUNT'] = df['text'].apply(lambda x: len(x.split()))

df['SUBJECTIVITY SCORE'] = (df['Positive Score'] + df['Negative Score']) / (
    df['WORD COUNT'] + 0.000001
)

df['AVG SENTENCE LENGTH'] = df['WORD COUNT'] / df['Number of sentences']

df['AVG NUMBER OF WORDS PER SENTENCE'] = df['WORD COUNT'] / df['Number of sentences']


In [73]:
df.head()

Unnamed: 0,File,text,Number of sentences,Positive Score,Negative Score,POLARITY SCORE,WORD COUNT,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,AVG NUMBER OF WORDS PER SENTENCE
0,1,when people hear ai they often think about sen...,19,4,7,-0.272727,726,0.015152,38.210526,38.210526
1,2,with increasing computing power and more data ...,20,8,6,0.142857,639,0.021909,31.95,31.95
2,3,if you were a fan of the s film clueless back ...,39,34,19,0.283019,1860,0.028495,47.692308,47.692308
3,5,from the stone age to the modern world from hu...,44,21,17,0.105263,1251,0.030376,28.431818,28.431818
4,6,artificial intelligence ai is the development ...,37,16,12,0.142857,1134,0.024691,30.648649,30.648649


In [75]:
# for average word length
def avg_word_length(text):
    words = text.split()
    if len(words) == 0:
        return 0
    return sum(len(word) for word in words) / len(words)

In [78]:
## for seeing if the sentence has pronoun
def pronoun(text):
    pronouns = r"(\b(s?i|me|we|my|ours|us|I|Me|We|My|Ours|Us)\b)"
    result = 0

    matches = re.finditer(pronouns,text,re.MULTILINE)
    for nummatch,match in enumerate(matches):
        result+=1
    return result

In [81]:
df['AVG WORD LENGTH'] = df['text'].apply(avg_word_length)
df['AVG SENTENCE LENGTH'] = df['WORD COUNT'] / df['Number of sentences']
df['PERSONAL PRONOUNS'] = df['text'].apply(pronoun)


In [82]:
df['URL']=links['URL']
df.columns

Index(['File', 'text', 'Number of sentences', 'Positive Score',
       'Negative Score', 'POLARITY SCORE', 'WORD COUNT', 'SUBJECTIVITY SCORE',
       'AVG SENTENCE LENGTH', 'AVG NUMBER OF WORDS PER SENTENCE',
       'AVG WORD LENGTH', 'PERSONAL PRONOUNS', 'URL'],
      dtype='object')

In [85]:
df = df[['URL',
         'File',
         'text',
         'Number of sentences',
         'Positive Score',
         'Negative Score',
         'POLARITY SCORE',
         'WORD COUNT',
         'SUBJECTIVITY SCORE',
         'AVG SENTENCE LENGTH',
         'AVG NUMBER OF WORDS PER SENTENCE',
         'AVG WORD LENGTH',
         'PERSONAL PRONOUNS']]


In [86]:
df.head()

Unnamed: 0,URL,File,text,Number of sentences,Positive Score,Negative Score,POLARITY SCORE,WORD COUNT,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,AVG NUMBER OF WORDS PER SENTENCE,AVG WORD LENGTH,PERSONAL PRONOUNS
0,https://insights.blackcoffer.com/how-is-login-...,1,when people hear ai they often think about sen...,19,4,7,-0.272727,726,0.015152,38.210526,38.210526,5.038567,4
1,https://insights.blackcoffer.com/how-does-ai-h...,2,with increasing computing power and more data ...,20,8,6,0.142857,639,0.021909,31.95,31.95,5.250391,2
2,https://insights.blackcoffer.com/ai-and-its-im...,3,if you were a fan of the s film clueless back ...,39,34,19,0.283019,1860,0.028495,47.692308,47.692308,5.18172,13
3,https://insights.blackcoffer.com/how-do-deep-l...,5,from the stone age to the modern world from hu...,44,21,17,0.105263,1251,0.030376,28.431818,28.431818,4.907274,28
4,https://insights.blackcoffer.com/how-artificia...,6,artificial intelligence ai is the development ...,37,16,12,0.142857,1134,0.024691,30.648649,30.648649,5.319224,4
