In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen

In [2]:
df = pd.read_excel('Output Data Structure1.xlsx')

In [3]:
df.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37,https://insights.blackcoffer.com/ai-in-healthc...,,,,,,,,,,,,,
1,38,https://insights.blackcoffer.com/what-if-the-c...,,,,,,,,,,,,,
2,39,https://insights.blackcoffer.com/what-jobs-wil...,,,,,,,,,,,,,
3,40,https://insights.blackcoffer.com/will-machine-...,,,,,,,,,,,,,
4,41,https://insights.blackcoffer.com/will-ai-repla...,,,,,,,,,,,,,


In [6]:
df.columns

Index(['URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
       'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH',
       'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
       'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT',
       'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH'],
      dtype='object')

In [7]:
df.shape

(114, 15)

## 1. Sentimental Analysis

In [8]:
import glob
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

def sentimental_analysis():
    read_files = glob.glob("StopWords\\*.txt")
    with open("stopwords.txt", "wb") as outfile:
        for f in read_files:
            with open(f, "rb") as infile:
                outfile.write(infile.read())
            
    # creating stop words list
    stop_word_list = open('stopwords.txt').read().split('\n')
    stop_word_list.sort()
    positive_words = {}
    negative_words = {}

    with open("MasterDictionary/negative-words.txt", "r") as f:
        negative_words = {word.strip(): 1 for word in f.readlines() if word.strip() not in stop_word_list}
    
    with open("MasterDictionary/positive-words.txt", "r") as f:
        positive_words = {word.strip(): 1 for word in f.readlines() if word.strip() not in stop_word_list}
    
    text_file = open(f'{a1}.txt', "r")
    data = text_file.read()
    text_file.close()
    #print(data)
    words = word_tokenize(data)
    #print(words)
    
    # Count the number of positive words in the text
    num_positive = sum([1 for token in words if token.lower() in positive_words])

    # Count the number of negative words in the text
    num_negative = sum([1 for token in words if token.lower() in negative_words])

    # Count the polarity score in the text
    polarity = (num_positive - num_negative)/ ((num_positive + num_negative) + 0.000001)
    
    # Count the number of non-stopwords in the text
    num_non_stopwords = sum([1 for token in words if token.lower() not in stop_word_list])

    # Count the subjective score in the text
    # subjectivity = (Positive Score + Negative Score)/ ((Total Words after cleaning) + 0.000001)
    subjectivity = (num_positive + num_negative)/ ((num_non_stopwords) + 0.000001)
    
    return num_positive, num_negative, polarity, subjectivity

## 2. Analysis of Readability

In [9]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from textstat import syllable_count

def analysis_of_readability():
    # Tokenize the text into sentences
    sentences = nltk.sent_tokenize(data)

    # Tokenize the text into words
    words = nltk.word_tokenize(data)

    # Calculate the average sentence length
    num_sentences = len(sentences)
    num_words = len(words)
    avg_sentence_length = num_words / num_sentences

    # Tokenize the text into words using nltk and remove stopwords
    stop_words = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\w+')
    words = [word.lower() for word in tokenizer.tokenize(data) if word.lower() not in stop_words]

    # Find the complex words
    complex_words = set(word for word in words if syllable_count(word) > 2)

    # Calculate the percentage of complex words
    num_complex_words = sum([1 for word in words if word.lower() in complex_words])
    pct_complex_words = (num_complex_words / num_words) * 100

    # Calculate the Fog Index
    fog_index = 0.4 * (avg_sentence_length + pct_complex_words)
    
    return avg_sentence_length, pct_complex_words, fog_index

## 3. Average Number of Words Per Sentence

In [10]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

def average_number_of_words_per_sentence():
    # Tokenize the text into sentences
    sentences = nltk.sent_tokenize(data)

    # Tokenize the text into words
    words = nltk.word_tokenize(data)

    # Calculate the average sentence length
    num_sentences = len(sentences)
    num_words = len(words)
    # Calculate the average number of words per sentence
    avg_num_words_per_sentence = num_words / num_sentences
    
    return avg_num_words_per_sentence

## 4. Complex Word Count

In [11]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from textstat import syllable_count

def complex_word_count():
    # Tokenize the text into words using nltk and remove stopwords
    stop_words = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\w+')
    words = [word.lower() for word in tokenizer.tokenize(data) if word.lower() not in stop_words]

    # Find the complex words
    complex_words = set(word for word in words if syllable_count(word) > 2)

    # Count the complex word count from the text
    num_complex_words = sum([1 for word in words if word.lower() in complex_words])
    
    return num_complex_words

## 5. Word Count

In [12]:
import nltk
import string

def words_count():
    words = nltk.word_tokenize(data)

    # Remove stop words and punctuation marks from the words
    stop_words = set(nltk.corpus.stopwords.words('english'))
    cleaned_words = [word.lower() for word in words if word.lower() not in stop_words and word not in string.punctuation]

    # Count the remaining cleaned words
    num_cleaned_words = len(cleaned_words)
    
    return num_cleaned_words

## 6. Syllable Count Per Word

In [13]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from textstat import syllable_count

def syllable_count_per_word():
    # Tokenize the text into words using nltk and remove stopwords
    stop_words = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\w+')
    words = [word.lower() for word in tokenizer.tokenize(data) if word.lower() not in stop_words]

    # function to count the number of syllables in a word
    def count_syllables(word):
        # Remove trailing "es" or "ed"
        if word.endswith("es") or word.endswith("ed"):
            word = word[:-2]
        # Count the number of vowels
        count = 0
        for char in word:
            if char in "aeiouy":
                count += 1
        return count

    # Count the syllables in each word
    syllable_counts = len([count_syllables(word) for word in words])
    
    return syllable_counts

## 7. Personal Pronouns

In [14]:
import re

def personal_pronouns():
    # Define the regular expression pattern to match the personal pronouns
    pattern = r'\b(I|we|my|ours|us)\b'

    # Use the regular expression pattern to find the counts of the personal pronouns
    matches = re.findall(pattern, data)

    # Remove any matches that are the country name "US"
    num_pronouns = len([match for match in matches if match != 'US'])
    
    return num_pronouns

## 8. Average Word Length

In [15]:
import string
from nltk.tokenize import word_tokenize

def average_word_length(data):
    # Tokenize the text into words
    words = word_tokenize(data)

    # Remove any punctuation from the words
    words = [word.strip(string.punctuation) for word in words]

    # Calculate the total number of characters in each word
    total_chars = sum(len(word) for word in words)

    # Calculate the total number of words
    total_words = len(words)

    # Calculate the average word length
    avg_word_length = total_chars / total_words
    
    return float(avg_word_length)

In [17]:
for i in range(114):
    try:
        url = urlopen(df['URL'][i])
        page = url.read()
        html = bs(page.decode('utf-8', 'ignore'), 'html.parser')
        title = html.find("h1" ,{"class":"entry-title"}).text
        content = html.find("div",{"class":"td-post-content"})
        para = content.find_all("p")
        a1 = df['URL_ID'][i]
        f = open(f'{a1}.txt',"w")
        f.write(title)
        f.write('\n')
        for _ in range(len(para)):
            f.write(para[_].text)
            f.write('\n')
        f.close()
    
        text_file = open(f'{a1}.txt', "r")
        data = text_file.read()
        text_file.close()
    
        num_positive, num_negative, polarity, subjectivity = sentimental_analysis()
        df.loc[i,'POSITIVE SCORE'] = num_positive
        df.loc[i,'NEGATIVE SCORE'] = num_negative
        df.loc[i,'POLARITY SCORE'] = polarity
        df.loc[i,'SUBJECTIVITY SCORE'] = subjectivity
    
        avg_sentence_length, pct_complex_words, fog_index = analysis_of_readability()
        df.loc[i,'AVG SENTENCE LENGTH'] = avg_sentence_length
        df.loc[i,'PERCENTAGE OF COMPLEX WORDS'] = pct_complex_words
        df.loc[i,'FOG INDEX'] = fog_index
    
        avg_num_words_per_sentence = average_number_of_words_per_sentence()
        df.loc[i,'AVG NUMBER OF WORDS PER SENTENCE'] = avg_num_words_per_sentence
    
        num_complex_words = complex_word_count()
        df.loc[i,'COMPLEX WORD COUNT'] = num_complex_words
    
        num_cleaned_words = words_count()
        df.loc[i,'WORD COUNT'] = num_cleaned_words
    
        syllable_counts = syllable_count_per_word()
        df.loc[i,'SYLLABLE PER WORD'] = syllable_counts
    
        num_pronouns = personal_pronouns()
        df.loc[i,'PERSONAL PRONOUNS'] = num_pronouns
    
        avg_word_length = average_word_length(data)
        df.loc[i,'AVG WORD LENGTH'] = avg_word_length
        
    except Exception as e:
        print(f"Error opening URL {df['URL_ID'][i]} {df['URL'][i]}: {e}")

Error opening URL 44 https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/: HTTP Error 404: Not Found
Error opening URL 51 https://insights.blackcoffer.com/future-of-work-how-ai-has-entered-the-workplace/: 'NoneType' object has no attribute 'text'
Error opening URL 57 https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/: HTTP Error 404: Not Found
Error opening URL 84 https://insights.blackcoffer.com/how-small-business-can-survive-the-coronavirus-crisis/: 'charmap' codec can't encode character '\u20b9' in position 41: character maps to <undefined>
Error opening URL 87 https://insights.blackcoffer.com/impact-of-covid-19-pandemic-on-tourism-aviation-industries/: 'charmap' codec can't encode character '\u20b9' in position 212: character maps to <undefined>
Error opening URL 91 https://insights.blackcoffer.com/human-rights-outlook/: 'NoneType' object has no attribute 'text'
Error opening URL 92 https://insights.blackcof

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 15 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   URL_ID                            114 non-null    int64  
 1   URL                               114 non-null    object 
 2   POSITIVE SCORE                    97 non-null     float64
 3   NEGATIVE SCORE                    97 non-null     float64
 4   POLARITY SCORE                    97 non-null     float64
 5   SUBJECTIVITY SCORE                97 non-null     float64
 6   AVG SENTENCE LENGTH               97 non-null     float64
 7   PERCENTAGE OF COMPLEX WORDS       97 non-null     float64
 8   FOG INDEX                         97 non-null     float64
 9   AVG NUMBER OF WORDS PER SENTENCE  97 non-null     float64
 10  COMPLEX WORD COUNT                97 non-null     float64
 11  WORD COUNT                        97 non-null     float64
 12  SYLLABLE

In [19]:
df.fillna(0, inplace=True)

In [21]:
df.head(80)

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37,https://insights.blackcoffer.com/ai-in-healthc...,68.0,34.0,0.333333,0.081535,26.640000,18.068068,17.883227,26.640000,361.0,1153.0,1163.0,1.0,5.065065
1,38,https://insights.blackcoffer.com/what-if-the-c...,60.0,38.0,0.224490,0.115023,20.487500,10.982306,12.587923,20.487500,180.0,771.0,758.0,6.0,4.176937
2,39,https://insights.blackcoffer.com/what-jobs-wil...,65.0,38.0,0.262136,0.093297,22.470588,17.591623,16.024885,22.470588,336.0,1001.0,995.0,2.0,4.795288
3,40,https://insights.blackcoffer.com/will-machine-...,64.0,27.0,0.406593,0.101336,19.293478,12.619718,12.765279,19.293478,224.0,887.0,883.0,17.0,4.327887
4,41,https://insights.blackcoffer.com/will-ai-repla...,59.0,23.0,0.439024,0.076779,25.103896,13.605794,15.483876,25.103896,263.0,985.0,979.0,12.0,4.481117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,112,https://insights.blackcoffer.com/how-will-covi...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
76,113,https://insights.blackcoffer.com/coronavirus-i...,17.0,21.0,-0.105263,0.082251,26.088235,13.979707,16.027177,26.088235,124.0,460.0,453.0,7.0,4.536640
77,114,https://insights.blackcoffer.com/estimating-th...,48.0,48.0,0.000000,0.102455,24.971014,12.362159,14.933269,24.971014,213.0,876.0,867.0,2.0,4.403947
78,115,https://insights.blackcoffer.com/covid-19-how-...,21.0,39.0,-0.300000,0.060241,27.245902,15.643803,17.155882,27.245902,260.0,934.0,943.0,2.0,4.619134


In [22]:
df.to_excel("Output Data Structure.xlsx")