Importing Required Libraries.

In [1]:
import pandas as pd 
import requests
import re
import nltk
import numpy as np
import openpyxl
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import seaborn as sns


Read the excel file using pandas

In [2]:
xlsx = pd.read_excel('Input.xlsx')

Here, 
There are Two classes contains DataExtraction from Web page and Text Analysis in Next Class

In [3]:
#Extract Data from Web page contains title and Content
class DataExtract:
    def __init__(self,index):
        self.index = index
    #Extracting text from url with index
    def extract_text_from_url(self):
        url = xlsx['URL'][self.index]
        page = requests.get(url)
        soup = BeautifulSoup(page.content,'html.parser')
        Title = soup.find('h1')
        paragraphs = soup.find_all('p')
        text_only = [p.get_text(strip=True) for p in paragraphs]
        text = text_only[16:-2]
        Raw_text = ''.join(text)
        return Title,Raw_text


#TextData Analsysis Class
class TextAnalysis:
    def __init__(self,StopWords,Postive,Negitive,Raw_text):
        self.StopWords = StopWords
        self.Postive = Postive
        self.Negitive = Negitive
        self.Raw_text  = Raw_text
 
    #Cleaning Data removing .?'..etc and lowering the text
    def Cleaning(self):
        clean_text = re.sub(r'\s+', ' ', self.Raw_text)
        clean_text = re.sub(r'\n+', '\n', clean_text) 
        clean_text = re.sub(r'[^a-zA-Z\s]', '', clean_text.lower())
        return clean_text
    
    #Removal of Stopwords that present in Stopwords folder.
    def StopWords_removal(self):
        #StopWords already extracted and in list of StopWords variable
        clean_text = self.Cleaning()
        Filtered_Words = [word for word in clean_text.split() if word.lower() not in self.StopWords]
        
        return Filtered_Words
    
    #Finding Postive and Negitive Words
    def Words_Behavior(self,Positive,Negitive):
        positive_words = list()
        negitive_words = list()
        filtered_words = self.StopWords_removal()
        for word in filtered_words:
            if word in Positive:
                positive_words.append(word)
            elif word in Negitive:
                negitive_words.append(word)    

        positive_words = nltk.word_tokenize(' '.join(positive_words))
        negitive_words =nltk.word_tokenize(' '.join(negitive_words))
        return positive_words,negitive_words
    
    #Step Two predict Words Score
    def Scores(self):
        positive_words,negitive_words = self.Words_Behavior(self.Postive,self.Negitive)
        filtered_words = self.StopWords_removal()
        postive_score = np.ones((1,len(positive_words))).astype('int')
        negitive_score  = np.ones((1,len(negitive_words))).astype('int') *-1
        positive_df = pd.DataFrame({'positive_words': positive_words, 'Score': postive_score[0]})
        negitive_df = pd.DataFrame({'negitive_words': negitive_words, 'Score': negitive_score[0]})
        Postive_words_Score = positive_df['Score'].sum()
        Negitive_words_Score = negitive_df['Score'].sum()
        Polarity_Score = ((positive_df['Score'].sum())-(-negitive_df['Score'].sum())) /((positive_df['Score'].sum()+negitive_df['Score'].sum()) + 0.000001)
        Subjective_Score =((positive_df['Score'].sum())+(-(negitive_df['Score'].sum()))) / (len(filtered_words) + 0.000001)
        return Postive_words_Score,Negitive_words_Score,Polarity_Score,Subjective_Score
    
    #text data extraction
    def text(self):
        Raw_text =''.join(self.Raw_text)
        sentences = re.split(r'\.+\s*', Raw_text)
        words = [word for sentence in sentences for word in sentence.split()]
        return sentences, words
    
    #Syllable Count
    def syllable_(self):
        sentences,_ = self.text()
        def count_syllables(word):

            # Remove common suffixes that do not contribute to syllable count
            word = re.sub(r'(es|ed|e$)', '', word, flags=re.IGNORECASE)
    
            # Count vowels (excluding consecutive vowels)
            vowels = re.findall(r'[aeiouy]+', word, flags=re.IGNORECASE)
    
            # Adjust for words starting with 'y' as it can be a consonant or a vowel
            if word and word[0].lower() == 'y' and not vowels:
                return 1
            else:
                return len(vowels)

        def count_syllables_per_word(text):
            # Tokenize the text into words
            words = re.findall(r'\b\w+\b', text)
    
            # Count syllables for each word
            syllable_counts = [count_syllables(word) for word in words]
    
            return syllable_counts
        text = ''.join(sentences)
        syllable_counts = count_syllables_per_word(text)

        # Create a dataframe from the word and syllable count lists
        data = {'Word': re.findall(r'\b\w+\b', text), 'Syllable Value': np.array(syllable_counts)}
        syllables_df = pd.DataFrame(data)
        return syllables_df,data
    

    # Analysis Readability Index
    def Analysis_of_Readability(self):
        sentences,words = self.text()
        syllables_df,_ = self.syllable_()
        Average_sentence_length = (len(words))/(len(sentences))
        Percentage_of_Complex_Words = ((len(syllables_df[syllables_df['Syllable Value']>2]))/(len(words))) *100
        Fog_Index = 0.4 * (Average_sentence_length+Percentage_of_Complex_Words)
        return Average_sentence_length,Percentage_of_Complex_Words,Fog_Index
    

    #Average Number of Sentence
    def Avg_sen(self):
        sentences,words = self.text()
        Average_number_words_per_sentence = (len(words))/(len(sentences))
        return Average_number_words_per_sentence
    

    #Complex Word Count
    def Complex_count(self):
        Syllables_df,_ = self.syllable_()
        Df = Syllables_df[Syllables_df['Syllable Value']>2]
        return Df
    

    #Word Count
    def Word_count(self):
        #Get filtered words from stopwords removal fun
        filtered_words = self.StopWords
        return len(filtered_words)
    

    #Syllable Count per word
    def Syllable_Count(self):
        _,data= self.syllable_()
        return data
    

    #Personal Pronouns
    def Personal_Pronouns(self):
        Pronoun_list = ["I","we", "my", "ours","us"]
        sentences,_ = self.text()
        def personal_pronouns(sentences):
            #creating regex 
            reg = re.compile(r'\b(?:'+'|'.join(Pronoun_list) + r')\b')
            #matching words
            match = re.findall(reg, ''.join(sentences))
            #counting all reg
            count = len(match)
            return count
        
        return personal_pronouns(sentences)


    #Average Word Length
    def Wordlen(self):
        _,words = self.text()
        Wordlength_df = pd.DataFrame({"words":words, "length_of_word":[len(word) for word in words]})
        Avg_len = (Wordlength_df['length_of_word'].sum())/(len(Wordlength_df['words']))
        return Avg_len

Opening stopwords folder files with different text baises

In [4]:
with open("StopWords_.txt", 'w') as w:
    with open("StopWords-20240110T154020Z-001/StopWords/StopWords_Currencies.txt",'r',encoding='latin-1') as f1:
        w.write(f1.read())
    with open("StopWords-20240110T154020Z-001/StopWords/StopWords_DatesandNumbers.txt", 'r',) as f2:
        w.write(f2.read())
    with open("StopWords-20240110T154020Z-001/StopWords/StopWords_Geographic.txt", 'r') as f3:
        w.write(f3.read())

In [5]:
with open("StopWords.txt", 'w') as w:
    with open("StopWords-20240110T154020Z-001/StopWords/StopWords_Auditor.txt",'r') as f1:
        w.write(f1.read())
    with open("StopWords-20240110T154020Z-001/StopWords/StopWords_Generic.txt", 'r') as f2:
        w.write(f2.read())
    with open("StopWords-20240110T154020Z-001/StopWords/StopWords_GenericLong.txt", 'r') as f3:
        w.write(f3.read())
    with open('StopWords-20240110T154020Z-001/StopWords/StopWords_Names.txt', 'r') as f4:
        w.write(f4.read())
    

In [6]:
with open('StopWords_.txt','r') as f:
    stopwords_spc = ''.join(f.readlines()).lower()
stopwords_spc_list = [line.split('|')[0].strip().lower() for line in stopwords_spc.split('\n') if line]

with open('StopWords.txt', 'r') as r:
    stop_words_two = [line.strip().lower() for line in r.readlines()]

stopwords = stop_words_two + stopwords_spc_list


From Master Dictionary folder extracting postive and negitive words

In [7]:
with open('MasterDictionary-20240110T154020Z-001/MasterDictionary/positive-words.txt','r') as r:
    positive = [line.strip().lower() for line in r.readlines()]
with open('MasterDictionary-20240110T154020Z-001/MasterDictionary/negative-words.txt','r',encoding='latin-1') as r:
    negitive= [line.strip().lower() for line in r.readlines()]

In [8]:
len(xlsx['URL'])

100

Running all URLS at a time to perform Text-Analysis and Data Extraction 

### IT Takes minimum 25 mins(because of Web Crawling)

In [9]:
Output_list = []
for i in range(0,len(xlsx['URL'])):
        Raw_text = DataExtract(i)
        title,Raw_text = Raw_text.extract_text_from_url()
        text = TextAnalysis(StopWords=stopwords,Postive=positive,Negitive=negitive,Raw_text=Raw_text)
        Postive_Score,Negitive_Score,Polarity_Score,Subjectivity_Score = text.Scores()
        Average_sentence_length,Percentage_of_Complex_Words,Fog_Index = text.Analysis_of_Readability()
        Average_number_words_per_sentence= text.Avg_sen()
        Word_count = text.Word_count()
        Syllable_per_word = [text.Syllable_Count()]
        Personal_Pronouns = text.Personal_Pronouns()
        Avg_word_len = text.Wordlen()
        Output_list.append([Postive_Score,Negitive_Score,Polarity_Score,round(Subjectivity_Score,2),round(Average_sentence_length,2),round(Percentage_of_Complex_Words,2),round(Fog_Index,2),round(Average_number_words_per_sentence,2),Word_count,Syllable_per_word,Personal_Pronouns,round(Avg_word_len)])


Creating DataFrame using pandas

In [10]:
values = ['POSITIVE SCORE','NEGATIVE SCORE','POLARITY SCORE','SUBJECTIVITY SCORE','AVG SENTENCE LENGTH','PERCENTAGE OF COMPLEX WORDS','FOG INDEX','AVG NUMBER OF WORDS PER SENTENCE','WORD COUNT','SYLLABLE PER WORD','PERSONAL PRONOUN','AVG WORD LENGTH']
og = pd.DataFrame(Output_list,columns=values)

In [11]:
og.insert(0,'URL_ID', xlsx['URL_ID'])


In [12]:
og.insert(1,'URL',xlsx['URL'])

In [13]:
print(og)

             URL_ID                                                URL  \
0   blackassign0001  https://insights.blackcoffer.com/rising-it-cit...   
1   blackassign0002  https://insights.blackcoffer.com/rising-it-cit...   
2   blackassign0003  https://insights.blackcoffer.com/internet-dema...   
3   blackassign0004  https://insights.blackcoffer.com/rise-of-cyber...   
4   blackassign0005  https://insights.blackcoffer.com/ott-platform-...   
..              ...                                                ...   
95  blackassign0096  https://insights.blackcoffer.com/what-is-the-r...   
96  blackassign0097  https://insights.blackcoffer.com/impact-of-cov...   
97  blackassign0098  https://insights.blackcoffer.com/contribution-...   
98  blackassign0099  https://insights.blackcoffer.com/how-covid-19-...   
99  blackassign0100  https://insights.blackcoffer.com/how-will-covi...   

    POSITIVE SCORE  NEGATIVE SCORE  POLARITY SCORE  SUBJECTIVITY SCORE  \
0                5              -1   

In [14]:
og.to_excel('Text_Analysis.xlsx')

In [15]:
og.to_csv('Text_Analysis.csv')

In [16]:
og.head(10)

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUN,AVG WORD LENGTH
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,5,-1,1.0,0.04,13.56,18.31,12.74,13.56,14104,"[{'Word': ['We', 'have', 'seen', 'a', 'huge', ...",1,5
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,51,-27,1.0,0.11,17.49,21.59,15.64,17.49,14104,"[{'Word': ['Throughout', 'history', 'from', 't...",3,6
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,39,-23,1.0,0.1,18.71,29.4,19.24,18.71,14104,"[{'Word': ['IntroductionIn', 'the', 'span', 'o...",13,6
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,37,-70,1.0,0.18,20.13,27.18,18.92,20.13,14104,"[{'Word': ['The', 'way', 'we', 'live', 'work',...",4,6
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,20,-8,1.0,0.08,17.32,20.14,14.98,17.32,14104,"[{'Word': ['The', 'year', '2040', 'is', 'poise...",4,6
5,blackassign0006,https://insights.blackcoffer.com/the-rise-of-t...,84,-25,1.0,0.1,20.14,25.15,18.12,20.14,14104,"[{'Word': ['Entertainment', 'is', 'giving', 'p...",6,6
6,blackassign0007,https://insights.blackcoffer.com/rise-of-cyber...,23,-36,1.0,0.12,15.77,20.51,14.51,15.77,14104,"[{'Word': ['Cybercrime', 'is', 'the', 'most', ...",1,5
7,blackassign0008,https://insights.blackcoffer.com/rise-of-inter...,30,-9,1.0,0.08,15.92,31.79,19.09,15.92,14104,"[{'Word': ['Introduction', 'The', 'year', '203...",3,7
8,blackassign0009,https://insights.blackcoffer.com/rise-of-cyber...,39,-48,1.0,0.15,16.55,31.15,19.08,16.55,14104,"[{'Word': ['As', 'technology', 'continues', 't...",2,6
9,blackassign0010,https://insights.blackcoffer.com/rise-of-cyber...,57,-66,1.0,0.17,19.18,22.8,16.79,19.18,14104,"[{'Word': ['Understanding', 'Cybercrime', 'An'...",8,5
