# DATA CRAWLING

### Required Dependencies

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import math
import os

### Importing Excel file 

In [2]:
df  = pd.read_excel("Input.xlsx")
df

Unnamed: 0,URL_ID,URL
0,37,https://insights.blackcoffer.com/ai-in-healthc...
1,38,https://insights.blackcoffer.com/what-if-the-c...
2,39,https://insights.blackcoffer.com/what-jobs-wil...
3,40,https://insights.blackcoffer.com/will-machine-...
4,41,https://insights.blackcoffer.com/will-ai-repla...
...,...,...
109,146,https://insights.blackcoffer.com/blockchain-fo...
110,147,https://insights.blackcoffer.com/the-future-of...
111,148,https://insights.blackcoffer.com/big-data-anal...
112,149,https://insights.blackcoffer.com/business-anal...


In [3]:
print(df['URL'][0])

https://insights.blackcoffer.com/ai-in-healthcare-to-improve-patient-outcomes/


### Web Scraping

In [4]:
headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win 64 ; x64) Apple WeKit /537.36(KHTML , like Gecko) Chrome/80.0.3987.162 Safari/537.36'}

In [5]:
def urltotext(url):
    paragraphs=[]
    data = requests.get(url,headers=headers).text
    soup = BeautifulSoup(data,'lxml')
    
    title = ''
    if soup.find_all('h1'):
        title=soup.find_all('h1')[0].text.strip()
    
    p=soup.find_all('p')
    for paragraph in p:
        paragraph=paragraph.text.strip()
        paragraphs.append(paragraph)
    text=''.join(paragraphs)
    text=title+" "+text
    return text

df['text']=df['URL'].apply(urltotext)

### Saving Text in Articles Folder

In [6]:
for i in df.index:
    filename=str(df['URL_ID'][i])
    text=str(df['text'][i])
    path='articles/'+filename+'.txt'
    
    if not os.path.exists(path):
        text_file = open(path,'w',encoding='utf-8')
        text_file.write(text)
        text_file.close()

### Extracting Positive and Negative Words from the Text Files

In [7]:
with open('positive-words.txt','r') as f:
    positivewords=f.read()

In [8]:
positive_words=positivewords.split('\n')

In [9]:
positive_words

['a+',
 'abound',
 'abounds',
 'abundance',
 'abundant',
 'accessable',
 'accessible',
 'acclaim',
 'acclaimed',
 'acclamation',
 'accolade',
 'accolades',
 'accommodative',
 'accomodative',
 'accomplish',
 'accomplished',
 'accomplishment',
 'accomplishments',
 'accurate',
 'accurately',
 'achievable',
 'achievement',
 'achievements',
 'achievible',
 'acumen',
 'adaptable',
 'adaptive',
 'adequate',
 'adjustable',
 'admirable',
 'admirably',
 'admiration',
 'admire',
 'admirer',
 'admiring',
 'admiringly',
 'adorable',
 'adore',
 'adored',
 'adorer',
 'adoring',
 'adoringly',
 'adroit',
 'adroitly',
 'adulate',
 'adulation',
 'adulatory',
 'advanced',
 'advantage',
 'advantageous',
 'advantageously',
 'advantages',
 'adventuresome',
 'adventurous',
 'advocate',
 'advocated',
 'advocates',
 'affability',
 'affable',
 'affably',
 'affectation',
 'affection',
 'affectionate',
 'affinity',
 'affirm',
 'affirmation',
 'affirmative',
 'affluence',
 'affluent',
 'afford',
 'affordable',
 'af

In [10]:
with open('negative-words.txt','r') as f1:
    negativewords=f1.read()

In [11]:
negative_words=negativewords.split('\n')

In [12]:
negative_words

['2-faced',
 '2-faces',
 'abnormal',
 'abolish',
 'abominable',
 'abominably',
 'abominate',
 'abomination',
 'abort',
 'aborted',
 'aborts',
 'abrade',
 'abrasive',
 'abrupt',
 'abruptly',
 'abscond',
 'absence',
 'absent-minded',
 'absentee',
 'absurd',
 'absurdity',
 'absurdly',
 'absurdness',
 'abuse',
 'abused',
 'abuses',
 'abusive',
 'abysmal',
 'abysmally',
 'abyss',
 'accidental',
 'accost',
 'accursed',
 'accusation',
 'accusations',
 'accuse',
 'accuses',
 'accusing',
 'accusingly',
 'acerbate',
 'acerbic',
 'acerbically',
 'ache',
 'ached',
 'aches',
 'achey',
 'aching',
 'acrid',
 'acridly',
 'acridness',
 'acrimonious',
 'acrimoniously',
 'acrimony',
 'adamant',
 'adamantly',
 'addict',
 'addicted',
 'addicting',
 'addicts',
 'admonish',
 'admonisher',
 'admonishingly',
 'admonishment',
 'admonition',
 'adulterate',
 'adulterated',
 'adulteration',
 'adulterier',
 'adversarial',
 'adversary',
 'adverse',
 'adversity',
 'afflict',
 'affliction',
 'afflictive',
 'affront',


### Extracting Stop Words from the Text File

In [13]:
with open("StopWords_DatesandNumbers.txt",'r') as f:
    stopwordsdateandnumbers=f.read()
    stop_words_datendnumbers_=stopwordsdateandnumbers.split('\n')

In [14]:
stop_words_dateandnumbers=[]
for i in stop_words_datendnumbers_:
    if "|" in i:
           stop_words_dateandnumbers.append(i.split('|')[0].strip())
    else:
        stop_words_dateandnumbers.append(i)
        

In [15]:
stop_words = stop_words_dateandnumbers
stop_words

['HUNDRED',
 'THOUSAND',
 'MILLION',
 'BILLION',
 'TRILLION',
 'DATE',
 'ANNUAL',
 'ANNUALLY',
 'ANNUM',
 'YEAR',
 'YEARLY',
 'QUARTER',
 'QUARTERLY',
 'QTR',
 'MONTH',
 'MONTHLY',
 'WEEK',
 'WEEKLY',
 'DAY',
 'DAILY',
 'JANUARY',
 'FEBRUARY',
 'MARCH',
 'APRIL',
 'MAY',
 'JUNE',
 'JULY',
 'AUGUST',
 'SEPTEMBER',
 'OCTOBER',
 'NOVEMBER',
 'DECEMBER',
 'JAN',
 'FEB',
 'MAR',
 'APR',
 'MAY',
 'JUN',
 'JUL',
 'AUG',
 'SEP',
 'SEPT',
 'OCT',
 'NOV',
 'DEC',
 'MONDAY',
 'TUESDAY',
 'WEDNESDAY',
 'THURSDAY',
 'FRIDAY',
 'SATURDAY',
 'SUNDAY',
 'ONE',
 'TWO',
 'THREE',
 'FOUR',
 'FIVE',
 'SIX',
 'SEVEN',
 'EIGHT',
 'NINE',
 'TEN',
 'ELEVEN',
 'TWELVE',
 'THIRTEEN',
 'FOURTEEN',
 'FIFTEEN',
 'SIXTEEN',
 'SEVENTEEN',
 'EIGHTEEN',
 'NINETEEN',
 'TWENTY',
 'THIRTY',
 'FORTY',
 'FIFTY',
 'SIXTY',
 'SEVENTY',
 'EIGHTY',
 'NINETY',
 'FIRST',
 'SECOND',
 'THIRD',
 'FOURTH',
 'FIFTH',
 'SIXTH',
 'SEVENTH',
 'EIGHTH',
 'NINTH',
 'TENTH',
 'I',
 'II',
 'III',
 'IV',
 'V',
 'VI',
 'VII',
 'VIII',
 'IX',


### Calculating the Positive Score:-

In [16]:
def positive_score(text):
    score=0
    for i in text.split():
        if i.upper() in stop_words:#since all values in stop_words are in upper case
            continue
        if i.lower() in positive_words: #since all values in positive_words list are in lower case
            score+=1
    return score

df['POSITIVE SCORE']=df.text.apply(positive_score)

### Calculating the Negative Score:-

In [17]:
def negative_score(text):
    score=0
    
    for i in text.split():
        if i.upper() in stop_words:#since all values in stop_words are in upper case
            continue
        if i.lower() in negative_words: #since all values in negative_words list are in lower case
            score+=1
    return score

df['NEGATIVE SCORE']=df.text.apply(negative_score)

### Calculating the Polarity Score:-

In [18]:
df['POLARITY SCORE']=(df['POSITIVE SCORE']-df['NEGATIVE SCORE'])/(df['POSITIVE SCORE']+df['NEGATIVE SCORE']+(0.00001))
    

In [19]:
def total_words_after_cleaning(text):
    score=0
    for i in text.split():
        if i.upper() in stop_words:#since all values in stop_words are in upper case
            continue
        score+=1
    
    return score
    
df['WORD COUNT']=df.text.apply(total_words_after_cleaning)

### Calculating the Average Sentence Length:-

In [20]:
df['SUBJECTIVITY SCORE']=(df['POSITIVE SCORE']+df['NEGATIVE SCORE'])/(df['WORD COUNT']+0.000001)

In [21]:
def avsentlen(text):
    asl=len(text.split())/len(text.split("."))
    return math.floor(asl)

df['AVERAGE SENTENCE LENGTH']=df.text.apply(avsentlen)

In [22]:
def syllable_count(word):
    sc=0
    for w in word:
        if(w=='a' or w=='e' or w=='i' or w=='o' or w=='u' or w=='A' or w=='E' or w=='I' or w=='O' or w=='U'):
            sc+=1
    if word[-2:]=='es' and  word[-2:]=='ES' and  word[-2:]=='ed' and  word[-2:]=='ED':
        sc=sc-2
    return sc



### Calculating the Percentage Complex Word Count:-

In [23]:
def complex_word_count(text):
    complex_word_count=0
    for word in text.split(" "):
        sc=syllable_count(word)
        if sc>=2:
            complex_word_count+=1
    return complex_word_count

df['COMPLEX_WORD_COUNT']=df.text.apply(complex_word_count)    

In [24]:
df['PERCENTAGE OF COMPLEX WORDS']=(df['COMPLEX_WORD_COUNT']/df['WORD COUNT'])*100

### Calculating the Average Syllables Count Per Words:-

In [25]:
def syllable_count_perword(text):
    syllable_count_list=[]
    for word in text.split(' '):
        if word in stop_words:
            continue
        else:
            sc=syllable_count(word)
            syllable_count_list.append(sc)
        
    scpw=np.mean(syllable_count_list)
    return scpw

df['AVERAGE SYLLABLE COUNT PER WORD']=df.text.apply(syllable_count_perword)

### Calculating the Personal Pronouns:-

In [26]:
perpronoun=['I',"we","my","ours","us"]

In [27]:
def perpronoun_count(text):
    pprc=0
    for word in text.split(' '):
        if word.lower() in perpronoun or word in perpronoun:
            if word=='US':
                continue
            else:
                pprc+=1
    return pprc

df['PERSONAL PRONOUNS']=df.text.apply(perpronoun_count)

### Calculating the Average Word Length:-

In [28]:
def wordlengthaverage(text):
    word_len=[]
    for word in text.split(' '):
        word_len.append(len(word))
    wla=np.mean(word_len)
    return wla

df['AVG WORD LENGTH']=df.text.apply(wordlengthaverage)

### Calculating the Fog Index:-

In [29]:
df['FOG INDEX']=0.4*(df['AVERAGE SENTENCE LENGTH']+df['PERCENTAGE OF COMPLEX WORDS'])

### Calculating the Average Number of Words Per Sentence:-

In [30]:
df['AVG NUMBER OF WORDS PER SENTENCE']=df['AVERAGE SENTENCE LENGTH']

In [31]:
df.columns

Index(['URL_ID', 'URL', 'text', 'POSITIVE SCORE', 'NEGATIVE SCORE',
       'POLARITY SCORE', 'WORD COUNT', 'SUBJECTIVITY SCORE',
       'AVERAGE SENTENCE LENGTH', 'COMPLEX_WORD_COUNT',
       'PERCENTAGE OF COMPLEX WORDS', 'AVERAGE SYLLABLE COUNT PER WORD',
       'PERSONAL PRONOUNS', 'AVG WORD LENGTH', 'FOG INDEX',
       'AVG NUMBER OF WORDS PER SENTENCE'],
      dtype='object')

In [51]:
output=df.drop(['text'],axis=1)

In [52]:
output.columns

Index(['URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
       'WORD COUNT', 'SUBJECTIVITY SCORE', 'AVERAGE SENTENCE LENGTH',
       'COMPLEX_WORD_COUNT', 'PERCENTAGE OF COMPLEX WORDS',
       'AVERAGE SYLLABLE COUNT PER WORD', 'PERSONAL PRONOUNS',
       'AVG WORD LENGTH', 'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE'],
      dtype='object')

### To align with the current output file, the output structure is being loaded. 

In [53]:
given_output_file=pd.read_excel('Output Data Structure.xlsx') 

In [54]:
given_output_file.columns

Index(['URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
       'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH',
       'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
       'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT',
       'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH'],
      dtype='object')

In [55]:
final_output=output[['URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
       'SUBJECTIVITY SCORE', 'AVERAGE SENTENCE LENGTH',
       'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
       'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX_WORD_COUNT', 'WORD COUNT',
       'AVERAGE SYLLABLE COUNT PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH']]

In [56]:
final_output.columns=['URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
       'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH',
       'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
       'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT',
       'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH']

In [57]:
final_output

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37,https://insights.blackcoffer.com/ai-in-healthc...,76,26,0.490196,0.053600,23,61.324225,33.729690,23,1167,1903,2.243074,0,5.963408
1,38,https://insights.blackcoffer.com/what-if-the-c...,60,28,0.363636,0.056701,21,51.224227,28.889691,21,795,1552,1.953846,6,5.210256
2,39,https://insights.blackcoffer.com/what-jobs-wil...,69,32,0.366337,0.055617,20,58.039648,31.215859,20,1054,1816,2.165848,2,5.697218
3,40,https://insights.blackcoffer.com/will-machine-...,63,14,0.636364,0.044924,19,53.617270,29.046908,19,919,1714,1.965418,17,5.149856
4,41,https://insights.blackcoffer.com/will-ai-repla...,62,19,0.530864,0.044653,21,53.638368,29.855347,21,973,1814,1.998364,13,5.419302
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,146,https://insights.blackcoffer.com/blockchain-fo...,29,23,0.115385,0.051030,17,57.016683,29.606673,17,581,1019,2.117647,9,5.844055
110,147,https://insights.blackcoffer.com/the-future-of...,37,9,0.608696,0.037217,21,56.067961,30.827184,21,693,1236,2.077662,2,5.526821
111,148,https://insights.blackcoffer.com/big-data-anal...,33,36,-0.043478,0.054545,16,53.280632,27.712253,16,674,1265,2.065303,2,5.461841
112,149,https://insights.blackcoffer.com/business-anal...,31,2,0.878788,0.046025,23,56.345886,31.738354,23,404,717,2.275714,0,6.177143


### Saving the Output Excel File

In [58]:
file_name = 'Output.xlsx'
final_output.to_excel(file_name)