In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Read the input Excel file
# The extraction loop below reads the 'URL_ID' and 'URL' columns from this sheet.
input_file_path = 'input.xlsx'
df = pd.read_excel(input_file_path)

# Function to extract article text from a given URL
def extract_article_text(url, timeout=30):
    """Download a page and return its title and paragraph text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would stall the whole run.

    Returns:
        A ``(title, article_text)`` tuple. ``title`` is the stripped text of
        the page's ``<title>`` element and ``article_text`` is the stripped
        text of every ``<p>`` element joined with single spaces. Returns
        ``(None, None)`` if the request fails, the server answers with an
        HTTP error status, or the page has no ``<title>``; in each of those
        cases a message is printed and no exception propagates.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures so that an error page is not
        # saved as if it were the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract title and article text
        if soup.title is None:
            print(f"Error extracting data from {url}: page has no <title> element")
            return None, None
        title = soup.title.text.strip()
        article_text = ' '.join(p.text.strip() for p in soup.find_all('p'))

        return title, article_text
    except Exception as e:
        # Best-effort scraping: report the failure and let the caller skip this URL.
        print(f"Error extracting data from {url}: {str(e)}")
        return None, None

# Iterate through each row in the DataFrame
skipped_urls = []  # URLs for which no text file was written
for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']

    # Extract article text
    title, article_text = extract_article_text(url)

    # An empty/None title or body means there is nothing useful to save.
    # Report it instead of dropping the row silently, because the analysis
    # step later opens '<URL_ID>.txt' for every row.
    if not (title and article_text):
        skipped_urls.append(url)
        print(f"Skipped {url}: no title or article text extracted")
        continue

    # Save the extracted data to a text file
    output_file_path = f'{url_id}.txt'
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write(f'Title: {title}\n\n')
        file.write(f'Article Text: {article_text}')

    print(f"Data extracted from {url} and saved to {output_file_path}")

if skipped_urls:
    print(f"{len(skipped_urls)} URL(s) produced no output file.")
print("Extraction complete.")


Data extracted from https://insights.blackcoffer.com/rising-it-cities-and-its-impact-on-the-economy-environment-infrastructure-and-city-life-by-the-year-2040-2/ and saved to blackassign0001.txt
Data extracted from https://insights.blackcoffer.com/rising-it-cities-and-their-impact-on-the-economy-environment-infrastructure-and-city-life-in-future/ and saved to blackassign0002.txt
Data extracted from https://insights.blackcoffer.com/internet-demands-evolution-communication-impact-and-2035s-alternative-pathways/ and saved to blackassign0003.txt
Data extracted from https://insights.blackcoffer.com/rise-of-cybercrime-and-its-effect-in-upcoming-future/ and saved to blackassign0004.txt
Data extracted from https://insights.blackcoffer.com/ott-platform-and-its-impact-on-the-entertainment-industry-in-future/ and saved to blackassign0005.txt
Data extracted from https://insights.blackcoffer.com/the-rise-of-the-ott-platform-and-its-impact-on-the-entertainment-industry-by-2040/ and saved to blackassi

In [8]:
import re

import nltk
import pandas as pd
import syllables
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from textblob import TextBlob

# Fetch the NLTK data packages needed for tokenizing and stop-word filtering
for resource_name in ('punkt', 'stopwords'):
    nltk.download(resource_name)

# Function to calculate text analysis variables
def calculate_text_analysis(text):
    """Compute sentiment and readability metrics for one document.

    Words are the lower-cased alphabetic tokens of ``text`` that are not
    English stop words; sentences come from ``sent_tokenize``. All ratio
    metrics fall back to 0.0 when their denominator is zero (empty text, or
    text with no surviving words) instead of raising ``ZeroDivisionError``.

    Args:
        text: The raw document text.

    Returns:
        A list of 13 values, in this order:
            0. TextBlob polarity
            1. negated TextBlob polarity
            2. TextBlob polarity (again)
            3. TextBlob subjectivity
            4. average sentence length (filtered words / sentences)
            5. percentage of complex words (more than 2 estimated syllables)
            6. fog index: 0.4 * (average sentence length + percentage complex)
            7. average words per sentence (same value as item 4)
            8. complex word count
            9. word count (after filtering)
            10. average estimated syllables per word
            11. personal pronoun count
            12. average word length in characters
    """
    # Cleaning using Stop Words Lists
    stop_words = set(stopwords.words('english'))
    words = [
        token.lower()
        for token in word_tokenize(text)
        if token.isalpha() and token.lower() not in stop_words
    ]
    sentences = sent_tokenize(text)

    # Word Count / sentence count (denominators for the ratios below)
    word_count = len(words)
    sentence_count = len(sentences)

    # Estimate syllables once per word; both the complex-word count and the
    # syllables-per-word average are derived from these numbers.
    syllable_counts = [syllables.estimate(word) for word in words]

    # Complex Word Count (words with more than two estimated syllables)
    complex_word_count = sum(1 for count in syllable_counts if count > 2)

    # Analysis of Readability
    avg_sentence_length = word_count / sentence_count if sentence_count else 0.0
    percentage_complex_words = (
        (complex_word_count / word_count) * 100 if word_count else 0.0
    )
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    # Average Number of Words Per Sentence (identical to avg_sentence_length;
    # kept as a separate entry so the returned list keeps its 13 positions)
    avg_words_per_sentence = avg_sentence_length

    # Syllable Count Per Word
    syllable_per_word = sum(syllable_counts) / word_count if word_count else 0.0

    # Sentiment Analysis using TextBlob
    blob = TextBlob(text)
    polarity_score = blob.sentiment.polarity
    subjectivity_score = blob.sentiment.subjectivity

    # Personal Pronouns
    # NOTE(review): with IGNORECASE the `us` alternative also matches the
    # upper-case token "US" -- confirm whether that should be excluded.
    personal_pronouns = len(re.findall(r'\b(?:i|we|my|ours|us)\b', text, flags=re.IGNORECASE))

    # Average Word Length
    avg_word_length = sum(len(word) for word in words) / word_count if word_count else 0.0

    # NOTE(review): the first three entries are polarity, -polarity, polarity;
    # presumably they fill separate positive/negative/polarity output
    # columns -- confirm that this is the intended definition.
    return [
        polarity_score,
        -polarity_score,
        polarity_score,
        subjectivity_score,
        avg_sentence_length,
        percentage_complex_words,
        fog_index,
        avg_words_per_sentence,
        complex_word_count,
        word_count,
        syllable_per_word,
        personal_pronouns,
        avg_word_length
    ]

# Read the output structure Excel file
output_structure_file_path = 'Output Data Structure.xlsx'
output_df = pd.read_excel(output_structure_file_path)

# Iterate through each row in the output DataFrame.
# `position` is the 0-based row position required by `.iloc`; the index label
# returned by iterrows() only coincides with it for a default RangeIndex.
for position, (index, row) in enumerate(output_df.iterrows()):
    url_id = row['URL_ID']

    # Read the extracted text from the corresponding text file.
    # The extraction step writes no file for URLs it could not scrape, so a
    # missing file is reported and skipped rather than aborting the whole run.
    text_file_path = f'{url_id}.txt'
    try:
        with open(text_file_path, 'r', encoding='utf-8') as file:
            extracted_text = file.read()
    except FileNotFoundError:
        print(f"No extracted text found for {url_id} ({text_file_path}); leaving its metrics empty.")
        continue

    # Calculate text analysis variables.
    # NOTE: the file content includes the 'Title: ' and 'Article Text: '
    # labels written during extraction, so those labels are analyzed too.
    analysis_results = calculate_text_analysis(extracted_text)

    # Update the output DataFrame with the computed values: every column from
    # the third onwards receives one metric, in the order the function returns them.
    output_df.iloc[position, 2:] = analysis_results

# Save the updated output DataFrame to a new Excel file
output_df.to_excel('TextAnalysisOutput.xlsx', index=False)
print("Textual analysis complete.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\BUTALOP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\BUTALOP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Textual analysis complete.
