In [7]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Change working directory to the desired path
import os
os.chdir('/content/drive/MyDrive/Text_Extraction_&_Analysis')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
!dir

Input.xlsx


In [11]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

input_data = pd.read_excel('Input.xlsx')

input_data.head()


Unnamed: 0,URL_ID,URL
0,123.0,https://insights.blackcoffer.com/rise-of-telem...
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...
4,432.0,https://insights.blackcoffer.com/rise-of-telem...


In [12]:
# Define the folder path for saving extracted text files
output_folder = '/content/drive/MyDrive/Text_Extraction_&_Analysis/Extracted_Text_files'

# Check if the folder exists, and create it if it doesn't
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

In [13]:
for index, row in input_data.iterrows():
    url_id = row['URL_ID']
    url = row['URL']

    response = requests.get(url)
    html_content = response.text

    # Parse HTML and extract article content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract article title and text here
    article_title = soup.find('h1')
    article_text_elements = soup.find_all("div",class_=["td-post-content", "tagdiv-type"])

    # Extract text from each <div> element while excluding <pre> elements
    article_text = ""
    for element in article_text_elements:
        for pre in element.find_all("pre"):
            pre.decompose()

        article_text += element.get_text() + '\n\n'


    # Save extracted content to a text file in the specified folder
    with open(os.path.join(output_folder, f'{url_id}.txt'), 'w', encoding='utf-8') as file:
        if article_title:
            file.write(article_title.text.strip() + '\n')
        if article_text:
            file.write(article_text.strip())



## Data Analysis

In [14]:
from textblob import TextBlob
import nltk
from nltk.corpus import cmudict
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')
nltk.download('cmudict')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
# Define the folder path for saving extracted text files and the output Excel file
text_files_folder = '/content/drive/MyDrive/Text_Extraction_&_Analysis/Extracted_Text_files'
output_excel_path = '/content/drive/MyDrive/Text_Extraction_&_Analysis/Output Data Structure.xlsx'


In [16]:
# Initialize DataFrame for storing output
output_data = pd.DataFrame(columns=[
    'URL_ID',
    'URL',
    'POSITIVE SCORE',
    'NEGATIVE SCORE',
    'POLARITY SCORE',
    'SUBJECTIVITY SCORE',
    'AVG SENTENCE LENGTH',
    'PERCENTAGE OF COMPLEX WORDS',
    'FOG INDEX',
    'AVG NUMBER OF WORDS PER SENTENCE',
    'COMPLEX WORD COUNT',
    'WORD COUNT',
    'SYLLABLE PER WORD',
    'PERSONAL PRONOUNS',
    'AVG WORD LENGTH'
])

for filename in os.listdir(text_files_folder):
    if filename.endswith('.txt'):
        url_id = filename[:-4]  # Remove the '.txt' extension to get the URL_ID
        url = f'URL placeholder for {url_id}'  # Replace with the actual URL or leave as a placeholder

        # Initialize a CMU pronunciation dictionary
        cmu_dict = cmudict.dict()

        # Load the extracted text
        with open(os.path.join(text_files_folder, filename), 'r', encoding='utf-8') as file:
            article_text = file.read()

        # Perform text analysis using TextBlob
        blob = TextBlob(article_text)

        # Calculate positive and negative scores using TextBlob's sentiment analysis
        positive_score = sum(1 for sentence in blob.sentences if sentence.sentiment.polarity > 0)
        negative_score = sum(1 for sentence in blob.sentences if sentence.sentiment.polarity < 0)

        # Calculate polarity and subjectivity scores using TextBlob
        polarity_score = blob.sentiment.polarity
        subjectivity_score = blob.sentiment.subjectivity

        # Tokenize the text into sentences and words using NLTK
        sentences = sent_tokenize(article_text)
        words = word_tokenize(article_text)

        # Initialize variables to store analysis results
        avg_sentence_length = 0  # Default to 0 in case there are no sentences
        avg_words_per_sentence = 0
        percentage_complex_words = 0
        syllable_per_word = 0
        avg_word_length = 0

        # Check if there are sentences and words to avoid division by zero
        if len(sentences) > 0:
            avg_sentence_length = len(words) / len(sentences)
            avg_words_per_sentence = len(words) / len(sentences)

        if len(words) > 0:
            # Calculate percentage of complex words (words with more than 2 syllables)
            syllable_count = 0
            complex_word_count = 0
            for word in words:
                if word.lower() in cmu_dict:
                    syllables = [len(list(y for y in x if y[-1].isdigit())) for x in cmu_dict[word.lower()]]
                    syllable_count += max(syllables)
                    if max(syllables) > 2:
                        complex_word_count += 1
            percentage_complex_words = (complex_word_count / len(words)) * 100

            # Calculate Fog Index
            fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

            # Calculate syllable count per word
            if len(words) > 0:
                syllable_per_word = syllable_count / len(words)

            # Calculate average word length
            if len(words) > 0:
                avg_word_length = sum(len(word) for word in words) / len(words)

        # Count personal pronouns
        personal_pronouns = sum(1 for word in words if word.lower() in ['i', 'we', 'my', 'ours', 'us'])

        # Append results to the output_data DataFrame
        output_data = output_data.append({
            'URL_ID': url_id,
            'URL': url,
            'POSITIVE SCORE': positive_score,
            'NEGATIVE SCORE': negative_score,
            'POLARITY SCORE': polarity_score,
            'SUBJECTIVITY SCORE': subjectivity_score,
            'AVG SENTENCE LENGTH': avg_sentence_length,
            'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
            'FOG INDEX': fog_index,
            'AVG NUMBER OF WORDS PER SENTENCE': avg_words_per_sentence,
            'COMPLEX WORD COUNT': complex_word_count,
            'WORD COUNT': len(words),
            'SYLLABLE PER WORD': syllable_per_word,
            'PERSONAL PRONOUNS': personal_pronouns,
            'AVG WORD LENGTH': avg_word_length
        }, ignore_index=True)

output_data['URL'] = output_data['URL'].apply(lambda x: f'=HYPERLINK("{x}", "{x}")')
# Save the output_data DataFrame
output_data.to_excel(output_excel_path, index=False)


  output_data = output_data.append({
  output_data = output_data.append({
  output_data = output_data.append({
  output_data = output_data.append({
  output_data = output_data.append({
  output_data = output_data.append({
  output_data = output_data.append({
  output_data = output_data.append({
  output_data = output_data.append({
  output_data = output_data.append({
  output_data = output_data.append({
  output_data = output_data.append({
  output_data = output_data.append({
  output_data = output_data.append({
  output_data = output_data.append({
  output_data = output_data.append({
  output_data = output_data.append({
  output_data = output_data.append({
  output_data = output_data.append({
  output_data = output_data.append({
  output_data = output_data.append({
  output_data = output_data.append({
  output_data = output_data.append({
  output_data = output_data.append({
  output_data = output_data.append({
  output_data = output_data.append({
  output_data = output_data.append({
 