### Importing Libraries and Data Load

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import os
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
# Load the list of articles to process; later cells read its URL_ID and URL columns.
data = pd.read_excel("Input.xlsx")

In [3]:
# Preview the first rows to check that the table loaded as expected.
data.head()

Unnamed: 0,URL_ID,URL
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...


### Text Extraction

In [4]:
output_folder = "Extracted_Text"

# Create the folder that receives one text file per URL_ID.
# exist_ok=True makes the cell safe to re-run and avoids the window between
# "check that it exists" and "create it"; it still raises if the path is
# occupied by a regular file, which surfaces the problem before any write.
os.makedirs(output_folder, exist_ok=True)

In [5]:
# Function to extract article text from URL
def extract_article_text(url, timeout=30):
    """Download a page and return its title and the text of its paragraphs.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    tuple
        ``(title, article_text)`` on success. ``(None, None)`` when the
        download or parsing fails for any reason; the error is printed and
        not re-raised, so a single bad URL does not stop a batch run.
    """
    try:
        # Without a timeout a stalled server would block the run indefinitely.
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures so an error page is never saved
        # as if it were article text.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Title of the page with every hyphen removed
        title = soup.title.text if soup.title else ''
        title = title.replace('-', '')

        # NOTE(review): every <p> element on the page is collected, so text
        # outside the article body (menus, related-post lists) is included too.
        paragraphs = soup.find_all('p')
        article_text = ' '.join(p.get_text() for p in paragraphs)

        # Collapse every run of whitespace into a single space
        article_text = re.sub(r'\s+', ' ', article_text).strip()

        return title, article_text
    except Exception as e:
        # Deliberately broad: extraction is best-effort per URL.
        print(f"Error extracting text from {url}: {str(e)}")
        return None, None

In [6]:
# Main loop for data extraction
# URL_IDs for which nothing could be saved. The analysis step opens
# Extracted_Text/<URL_ID>.txt for every row, so these need attention first.
failed_url_ids = []

for index, row in data.iterrows():
    url_id = row['URL_ID']
    url = row['URL']

    # Extracting article text and title for all the websites
    title, article_text = extract_article_text(url)
    if article_text is None:
        # extract_article_text already printed the reason; remember the ID.
        failed_url_ids.append(url_id)
        continue

    # Saving extracted content to a text file: title on the first line,
    # article text after it.
    file_name = os.path.join(output_folder, f"{url_id}.txt")
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(f"{title}\n")
        file.write(f"{article_text}")

print("Data extraction completed. Text files are saved in the 'Extracted_Text' folder.")
if failed_url_ids:
    print(f"No text file was written for {len(failed_url_ids)} URL(s): {failed_url_ids}")

Data extraction completed. Text files are saved in the 'Extracted_Text' folder.


In [7]:
# `title` is the value left behind by the final iteration of the extraction
# loop (None if that last URL failed), so this only spot-checks one article.
title

'How will COVID19 affect the world of work?  Blackcoffer Insights'

In [8]:
# `article_text` is the value left behind by the final iteration of the
# extraction loop (None if that last URL failed); shown as a spot check.
article_text

'Automate the Data Management Process Realtime Kibana Dashboard for a financial tech firm Data Management, ETL, and Data Automation Data Management – EGEAS How To Secure (SSL) Nginx with Let’s Encrypt on Ubuntu (Cloud VM, GCP, AWS, Azure, Linode) and Add Domain Deploy and view React app(Nextjs) on cloud VM such as GCP, AWS, Azure, Linode Deploy Nodejs app on a cloud VM such as GCP, AWS, Azure, Linode Grafana Dashboard – Oscar Awards Rising IT cities and its impact on the economy, environment, infrastructure, and city life by the year 2040. Rising IT Cities and Their Impact on the Economy, Environment, Infrastructure, and City Life in Future Internet Demand’s Evolution, Communication Impact, and 2035’s Alternative Pathways Rise of Cybercrime and its Effect in upcoming Future AI/ML and Predictive Modeling Solution for Contact Centre Problems How to Setup Custom Domain for Google App Engine Application? Code Review Checklist As business close to help prevent transmission of COVID-19, fina

### Text Analysis

In [9]:
# Downloading nltk resources
# word_tokenize / sent_tokenize rely on the Punkt sentence models. Older NLTK
# releases load them from 'punkt', newer releases from 'punkt_tab', so both
# are checked and fetched only when missing.
for tokenizer_resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{tokenizer_resource}")
    except LookupError:
        nltk.download(tokenizer_resource)

# Positive and negative word files
positive_words_file = "positive_words.txt"
negative_words_file = "negative_words.txt"

# Stop words files
stop_words_files = [
    "StopWords_Auditor.txt",
    "StopWords_Currencies.txt",
    "StopWords_DatesandNumbers.txt",
    "StopWords_Generic.txt",
    "StopWords_GenericLong.txt",
    "StopWords_Geographic.txt",
    "StopWords_Names.txt"
]

In [10]:
# Reading positive and negative words
# Both files are decoded as latin-1 so the result does not depend on the
# platform's default encoding. Entries are stripped and lower-cased because
# the analysis looks up lower-cased tokens; blank lines are dropped.

positive_words = set()
negative_words = set()

with open(positive_words_file, 'r', encoding='latin-1') as file:
    positive_words.update(
        entry.strip().lower() for entry in file.read().splitlines() if entry.strip()
    )

with open(negative_words_file, 'r', encoding='latin-1') as file:
    negative_words.update(
        entry.strip().lower() for entry in file.read().splitlines() if entry.strip()
    )

In [11]:
# Reading stop words
# The analysis filters tokens with `word.lower() not in additional_stop_words`
# after keeping only alphabetic tokens, so an entry can only ever match when it
# is a bare lower-case word. Each line is normalised accordingly.
additional_stop_words = set()
for stop_words_file in stop_words_files:
    with open(stop_words_file, 'r', encoding='latin-1') as file:
        for line in file.read().splitlines():
            # NOTE(review): assumes annotated entries look like "TERM | note";
            # only the part before the first '|' is kept. Confirm against the files.
            term = line.split('|', 1)[0].strip().lower()
            if term:
                additional_stop_words.add(term)

In [12]:
# Empty results table: one column per reported metric, in report order
output_columns = [
    # identification
    "URL_ID",
    "URL",
    # sentiment
    "Positive Score",
    "Negative Score",
    "Polarity Score",
    "Subjectivity Score",
    # readability
    "Average Sentence Length",
    "Percentage of Complex Words",
    "Fog Index",
    "Average Number of Words Per Sentence",
    "Complex Word Count",
    # word-level statistics
    "Word Count",
    "Syllable Per Word",
    "Personal Pronouns",
    "Average Word Length",
]

output_data = pd.DataFrame(columns=output_columns)
output_data.head()

Unnamed: 0,URL_ID,URL,Positive Score,Negative Score,Polarity Score,Subjectivity Score,Average Sentence Length,Percentage of Complex Words,Fog Index,Average Number of Words Per Sentence,Complex Word Count,Word Count,Syllable Per Word,Personal Pronouns,Average Word Length


In [13]:
def syllable_count(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Each run of consecutive vowels (a, e, i, o, u) counts as one syllable.
    A trailing "e" is treated as silent, and so is the "e" of a trailing
    "es"/"ed" when it forms a vowel group of its own.

    Parameters
    ----------
    word : str
        A single token; case and surrounding ``.:;?!`` are ignored.

    Returns
    -------
    int
        At least 1 for any non-empty word, 0 for an empty one.
    """
    vowels = "aeiou"
    word = word.lower().strip(".:;?!")

    # Nothing left after stripping punctuation: no syllables, and no word[0]
    # lookup that could raise IndexError.
    if not word:
        return 0

    # One syllable for every position where a vowel group starts.
    count = sum(
        1
        for index, letter in enumerate(word)
        if letter in vowels and (index == 0 or word[index - 1] not in vowels)
    )

    if word.endswith(("es", "ed")):
        # Discount the suffix "e" only when it was counted as a separate group,
        # i.e. it follows a consonant ("played"), not a vowel ("companies").
        if len(word) > 2 and word[-3] not in vowels:
            count -= 1
    elif word.endswith("e"):
        count -= 1

    # Every real word has at least one syllable.
    return max(count, 1)

In [14]:
# Text Analysis
def analyze_text(row):
    """Compute sentiment and readability metrics for one input row.

    Reads ``Extracted_Text/<URL_ID>.txt``, tokenises it, and derives the
    scores listed under Returns.

    Parameters
    ----------
    row : mapping
        Must provide ``"URL_ID"`` and ``"URL"`` (e.g. a DataFrame row).

    Returns
    -------
    pandas.Series
        One value per output column. Ratios whose denominator would be zero
        (empty text, or no words left after filtering) are reported as 0.0.

    Raises
    ------
    FileNotFoundError
        If no extracted text file exists for the row's URL_ID.
    """
    url_id = row["URL_ID"]
    url = row["URL"]

    output_folder = 'Extracted_Text'
    # Extracted text file path
    text_file_path = os.path.join(output_folder, f"{url_id}.txt")

    # Read the extracted text file
    with open(text_file_path, 'r', encoding='utf-8') as text_file:
        text = text_file.read()

    # Tokenization and Cleaning: alphabetic tokens only, lower-cased, stop words removed
    words = [word.lower() for word in word_tokenize(text) if word.isalpha() and word.lower() not in additional_stop_words]
    sentences = sent_tokenize(text)
    word_count = len(words)
    sentence_count = len(sentences)

    # Sentiment Analysis (the small constant keeps the denominators non-zero)
    positive_score = sum(1 for word in words if word in positive_words)
    negative_score = sum(1 for word in words if word in negative_words)
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)

    # Syllables are needed for two metrics; compute them once per word.
    syllables = [syllable_count(word) for word in words]

    # Readability Analysis, guarded against empty text / no surviving words
    avg_sentence_length = word_count / sentence_count if sentence_count else 0.0
    complex_word_count = sum(1 for syllable_total in syllables if syllable_total > 2)
    percentage_complex_words = complex_word_count / word_count if word_count else 0.0
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    avg_number_of_words_per_sentence = avg_sentence_length

    # Syllable Count Per Word
    syllable_per_word = sum(syllables) / word_count if word_count else 0.0

    # Personal Pronouns: matched case-insensitively, but the all-caps "US"
    # is skipped because it is normally the country abbreviation.
    pronoun_matches = re.findall(r'\b(?:I|we|my|ours|us)\b', text, flags=re.IGNORECASE)
    personal_pronouns_count = sum(1 for match in pronoun_matches if match != "US")

    # Average Word Length
    avg_word_length = sum(len(word) for word in words) / word_count if word_count else 0.0

    return pd.Series({
        "URL_ID": url_id,
        "URL": url,
        "Positive Score": positive_score,
        "Negative Score": negative_score,
        "Polarity Score": polarity_score,
        "Subjectivity Score": subjectivity_score,
        "Average Sentence Length": avg_sentence_length,
        "Percentage of Complex Words": percentage_complex_words,
        "Fog Index": fog_index,
        "Average Number of Words Per Sentence": avg_number_of_words_per_sentence,
        "Complex Word Count": complex_word_count,
        "Word Count": word_count,
        "Syllable Per Word": syllable_per_word,
        "Personal Pronouns": personal_pronouns_count,
        "Average Word Length": avg_word_length
    })

In [15]:
# Destination workbook for the computed metrics
output_file = "output.xlsx"

# Run the analysis once per input row and collect the resulting Series into a frame
output_data = data.apply(analyze_text, axis="columns")

# Persist the table without the positional index column
output_data.to_excel(output_file, index=False)

print(f"Text analysis results saved to {output_file}")

Text analysis results saved to output.xlsx
