## Data Extraction and Text Analysis (Subhradeep Pal)

In [1]:
import numpy as np
import pandas as pd
import requests
import time
from bs4 import BeautifulSoup
import spacy
from spacy_syllables import SpacySyllables
import nltk
from nltk.corpus import stopwords

In [2]:
#Reading the input file

df_url = pd.read_excel('C:\\Users\\subhr\\Desktop\\Internship\\Black Coffer\\Input.xlsx')

In [3]:
df_url

Unnamed: 0,URL_ID,URL
0,37,https://insights.blackcoffer.com/ai-in-healthc...
1,38,https://insights.blackcoffer.com/what-if-the-c...
2,39,https://insights.blackcoffer.com/what-jobs-wil...
3,40,https://insights.blackcoffer.com/will-machine-...
4,41,https://insights.blackcoffer.com/will-ai-repla...
...,...,...
109,146,https://insights.blackcoffer.com/blockchain-fo...
110,147,https://insights.blackcoffer.com/the-future-of...
111,148,https://insights.blackcoffer.com/big-data-anal...
112,149,https://insights.blackcoffer.com/business-anal...


### Data Extraction

In [4]:
#Web scraping the text data from each and every sites mentioned in the input file using beautifulsoup and requests library

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
for i,j in enumerate(list(df_url['URL'])):
    f=open("C:\\Users\\subhr\\Desktop\\Internship\\Black Coffer\\files\\"+str(df_url.iloc[i:i+1,:1]['URL_ID'][i])+".txt","a")
    page = requests.get(j,headers=headers)
    page_soup = BeautifulSoup(page.text,'lxml')
    try:
        title = page_soup.find('h1',class_='entry-title')
        f.write(title.text)
        body = page_soup.find('div',class_='td-post-content')
        page_content = body.find_all('p')
        for p in page_content:
            f.write(p.text)
        f.close()
    except:
        f.write('No text could be found.')
        continue

### Reading stopwords, positive and negative words from the text files

In [5]:
#Reading the stopwords from the files provided

stopword_files = ['Auditor','Currencies','DatesandNumbers','Generic','GenericLong','Geographic','Names']
stopword_list =[]
for file in stopword_files:
    f = open('C:\\Users\\subhr\\Desktop\\Internship\\Black Coffer\\StopWords-20221016T122610Z-001\\StopWords\\StopWords_'+file+'.txt','r')
    stopword_list.extend(f.readlines())

In [6]:
#Removing the newline characters from the stopwords using replace function

char_to_replace = {'\n':''}
for key,value in char_to_replace.items():
    for i in range(len(stopword_list)):
        stopword_list[i] = stopword_list[i].lower()
        stopword_list[i] = stopword_list[i].replace(key,value)

In [7]:
#Reading the list of positive and negative words provided

pos_words=[]
neg_words=[]
word_files=['positive-words','negative-words']
for file in word_files:
    fop = open('C:\\Users\\subhr\\Desktop\\Internship\\Black Coffer\\MasterDictionary-20221016T122559Z-001\\MasterDictionary\\'+file+'.txt','r')
    if file == 'positive-words':
        pos_words.extend(fop.readlines())
    else:
        neg_words.extend(fop.readlines())

In [8]:
#Removing newline characters

for i in range(len(pos_words)):
    pos_words[i]= pos_words[i].replace('\n','')
    
for i in range(len(neg_words)):
    neg_words[i]= neg_words[i].replace('\n','')

### Data Analysis

In [9]:
#Declaring empty lists

positive_score =[]
negative_score =[]
polarity_score = []
subjectivity_score =[]
avg_len_sent = []
complex_words = []
percent_of_cw =[]
fog_index = []
avg_syllable_count = []
pper_count = []
word_count = []
avg_word_len = []

#Opening each and every text file created previously in read mode

for i in range(df_url.shape[0]):
    file = open('C:\\Users\\subhr\\Desktop\\Internship\\Black Coffer\\files\\'+str(df_url.iloc[i:i+1,:1]['URL_ID'][i])+'.txt','r')
    lines = file.readlines()

#Removing unnecessary punctuations and strings

    punc_to_replace = {',':' ','?':' ','(':' ',')':' ','’':' ','“':' ','”':' ','\xa0':' ','  ':' '}
    for key, value in punc_to_replace.items():
        lines[0]=lines[0].lower()
        lines[0]=lines[0].replace(key,value)

#loading spacy english model and adding syllables component to the pipeline after tagger component 

    nlp = spacy.load("en_core_web_md")
    nlp.add_pipe("syllables",after='tagger')
    doc = nlp(lines[0])

#Determining number of words in the text data after removing the punctuations and removing the stopwords provided from the data

    word_lemmas = []
    for token in doc:
        word_lemmas.append(token.lemma_)
    num_words = len(word_lemmas)
    
    for token in word_lemmas:
        if token in stopword_list:
            word_lemmas.remove(token)
        else:
            continue

#Calculating POSITIVE , NEGATIVE, SUBJECTIVITY ,POLARITY SCORE & AVERAGE SENTENCE LENGTH

    p=0
    n=0
    for word in word_lemmas:
        if word in pos_words:
            p=p+1
        elif word in neg_words:
            n=n+1
        else:
            continue
    #n = n*(-1)
    positive_score.append(p)
    negative_score.append(n)
    
    polarity = (p-n)/((p+n)+0.000001)
    polarity = round(polarity,2)
    polarity_score.append(polarity)
    
    subjectivity = (p+n)/(len(word_lemmas)+0.000001)
    subjectivity = round(subjectivity,2)
    subjectivity_score.append(subjectivity)
    
    Avg_sent_len = num_words/len(list(doc.sents))
    Avg_sent_len = round(Avg_sent_len,2)
    avg_len_sent.append(Avg_sent_len)
    
#Calculating PERCENTAGE OF COMPLEX WORDS & COMPLEX WORD COUNT in the text data
    
    cw=0
    for token in doc:
        syllable_count = token._.syllables_count
        if syllable_count != None and syllable_count >= 3:
            cw = cw +1
        else:
            continue
    complex_words.append(cw)        
    
    percentage_cw = (cw/num_words)*100
    percentage_cw = round(percentage_cw,2)
    percent_of_cw.append(percentage_cw)
    
#Determing FOG INDEX
    
    fog_i = 0.4*(Avg_sent_len+cw)
    fog_i = round(fog_i,2)
    fog_index.append(fog_i)

#Determning the average syllable count in the text data    
    
    count = 0
    for token in doc:
        syllable_c = token._.syllables_count
        if syllable_c != None:
            count= syllable_c + count
        else:
            continue
    
    avg_syllable_count_word = count/num_words
    avg_syllable_count_word = round(avg_syllable_count_word,2)
    avg_syllable_count.append(avg_syllable_count_word)

#Determing the number of PERSONAL PRONOUNS in the text data    
    
    prp = 0
    for token in doc:
        if token.tag_ == 'PRP':
            prp = prp +1
        else:
            continue
    pper_count.append(prp)

#Calculating the WORD COUNT after removing stopwords using nltk stopwords   
    
    word_list = []
    for token in doc:
        word_list.append(token.text)
    
    for word in word_list:
        if word in stopwords.words('english'):
            word_list.remove(word)
        else:
            continue
    word_count.append(len(word_list))
 
#Calculating AVERAGE WORD LENTGH in the text data

    char_list=[]
    for token in doc: 
        char_list.extend(list(token.text))
    char_count = len(char_list)
    char_per_word = char_count/num_words
    char_per_word = round(char_per_word,2)
    avg_word_len.append(char_per_word)
        

In [10]:
url = []
for i in range(df_url.shape[0]):
    url.append(df_url.iloc[i:i+1,1:2]['URL'][i])

In [11]:
url_id = range(37,151)
columns = ['URL_ID','URL','POSITIVE SCORE','NEGATIVE SCORE','POLARITY SCORE','SUBJECTIVITY SCORE','AVG SENTENCE LENGTH','PERCENTAGE OF COMPLEX WORDS',
           'FOG INDEX','AVG NUMBER OF WORDS PER SENTENCE','COMPLEX WORD COUNT','WORD COUNT','SYLLABLE PER WORD','PERSONAL PRONOUNS','AVERAGE WORD LENGTH']



df_new = pd.DataFrame(list(zip(url_id,url,positive_score,negative_score,polarity_score,subjectivity_score,avg_len_sent,
                              percent_of_cw,fog_index,avg_len_sent,complex_words,word_count,avg_syllable_count,pper_count,avg_word_len))
                      ,columns=columns)




In [12]:
df_new

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVERAGE WORD LENGTH
0,37,https://insights.blackcoffer.com/ai-in-healthc...,76,35,0.37,0.09,34.13,18.59,153.25,34.13,349,1357,1.64,21,5.43
1,38,https://insights.blackcoffer.com/what-if-the-c...,74,39,0.31,0.12,24.75,11.59,79.90,24.75,175,1021,1.40,53,4.57
2,39,https://insights.blackcoffer.com/what-jobs-wil...,68,40,0.26,0.09,26.48,18.55,142.19,26.48,329,1238,1.62,29,5.21
3,40,https://insights.blackcoffer.com/will-machine-...,62,29,0.36,0.09,24.22,13.05,96.89,24.22,218,1141,1.42,53,4.64
4,41,https://insights.blackcoffer.com/will-ai-repla...,62,28,0.38,0.08,25.82,14.08,113.53,25.82,258,1270,1.47,56,4.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,146,https://insights.blackcoffer.com/blockchain-fo...,9,21,-0.40,0.09,24.48,10.12,30.59,24.48,52,367,1.34,18,4.52
110,147,https://insights.blackcoffer.com/the-future-of...,39,15,0.44,0.07,30.33,14.03,78.53,30.33,166,844,1.46,20,4.86
111,148,https://insights.blackcoffer.com/big-data-anal...,7,3,0.40,0.11,16.88,15.56,15.15,16.88,21,92,1.46,1,4.61
112,149,https://insights.blackcoffer.com/business-anal...,8,1,0.78,0.09,36.50,21.23,27.00,36.50,31,100,1.74,1,5.58


In [13]:
df_new.to_csv('C:\\Users\\subhr\\Desktop\\Internship\\Black Coffer\\Output_SubhradeepPal.csv')