In [13]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
import re
from textblob import TextBlob

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Load the input spreadsheet from the /kaggle/input path; the scraping loop
# further down reads its 'URL_ID' and 'URL' columns.
df = pd.read_excel('/kaggle/input/blackcoffer-input/Input.xlsx')

In [5]:
def extract_article_text(url, timeout=30):
    """Download a web page and return its headline and paragraph text.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would stall the whole run.

    Returns:
        A ``(title, article_text)`` tuple. ``title`` is the text of the first
        ``<h1>`` element, or ``'No Title'`` when the page has none.
        ``article_text`` is the text of every ``<p>`` element joined with
        newlines (an empty string when the page has no ``<p>`` elements).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status code.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error responses instead of returning the text of an
    # error page as if it were the article.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Assuming article title is within <h1> tags and article text within <p> tags
    heading = soup.find('h1')  # looked up once and reused below
    title = heading.get_text() if heading is not None else 'No Title'
    article_text = '\n'.join(p.get_text() for p in soup.find_all('p'))

    return title, article_text

In [6]:
# Directory, relative to the current working directory, used for the article
# text files.
output_dir = 'extracted_articles'
# exist_ok=True keeps this cell re-runnable when the directory already exists.
os.makedirs(output_dir, exist_ok=True)

In [8]:
# Download every article listed in the input sheet and store it as
# <URL_ID>.txt inside output_dir. A failure on one URL is reported and the
# loop moves on to the next row.
for _, row in df.iterrows():
    url_id, url = row['URL_ID'], row['URL']

    try:
        heading, body = extract_article_text(url)
        target_path = os.path.join(output_dir, f'{url_id}.txt')
        with open(target_path, 'w', encoding='utf-8') as handle:
            # Title first, then a blank line, then the article body.
            handle.write('\n\n'.join((heading, body)))
        print(f'Extracted and saved article {url_id}')
    except Exception as e:
        print(f'Failed to extract article {url_id}: {e}')

Extracted and saved article bctech2011
Extracted and saved article bctech2012
Extracted and saved article bctech2013
Extracted and saved article bctech2014
Extracted and saved article bctech2015
Extracted and saved article bctech2016
Extracted and saved article bctech2017
Extracted and saved article bctech2018
Extracted and saved article bctech2019
Extracted and saved article bctech2020
Extracted and saved article bctech2021
Extracted and saved article bctech2022
Extracted and saved article bctech2023
Extracted and saved article bctech2024
Extracted and saved article bctech2025
Extracted and saved article bctech2026
Extracted and saved article bctech2027
Extracted and saved article bctech2028
Extracted and saved article bctech2029
Extracted and saved article bctech2030
Extracted and saved article bctech2031
Extracted and saved article bctech2032
Extracted and saved article bctech2033
Extracted and saved article bctech2034
Extracted and saved article bctech2035
Extracted and saved artic

In [9]:
# Initialize stopwords: NLTK's English stopword list, stored as a set so that
# membership tests are constant-time lookups.
stop_words = set(stopwords.words('english'))

In [10]:
# Function to count syllables in a word
_CMUDICT_CACHE = None  # filled on first use by _load_cmudict()


def _load_cmudict():
    """Return the CMU pronouncing dictionary, building it at most once."""
    global _CMUDICT_CACHE
    if _CMUDICT_CACHE is None:
        try:
            _CMUDICT_CACHE = nltk.corpus.cmudict.dict()
        except LookupError:
            # The corpus is not installed in this environment yet: fetch it
            # on demand and retry once.
            nltk.download('cmudict', quiet=True)
            _CMUDICT_CACHE = nltk.corpus.cmudict.dict()
    return _CMUDICT_CACHE


def count_syllables(word, pronunciations=None):
    """Estimate the number of syllables in ``word``.

    Args:
        word: The token to measure.
        pronunciations: Optional mapping from a lower-case word to a list of
            pronunciations, each a sequence of phoneme strings. Defaults to
            the CMU pronouncing dictionary, which is loaded once and cached
            rather than rebuilt on every call.

    Returns:
        For words found in the mapping, the largest count of phonemes ending
        in a digit (the stress marker) across the listed pronunciations.
        For any other word, the number of ``a/e/i/o/u`` characters it
        contains, which is only a rough approximation.
    """
    if pronunciations is None:
        pronunciations = _load_cmudict()

    key = word.lower()
    if key in pronunciations:
        return max(
            sum(1 for phoneme in variant if phoneme[-1].isdigit())
            for variant in pronunciations[key]
        )
    # Fallback for out-of-dictionary words: count vowel letters.
    return sum(1 for char in word if char in 'aeiouAEIOU')

In [11]:
# Function to perform text analysis including additional metrics
def analyze_text_enhanced(text):
    """Compute sentiment and readability metrics for one piece of text.

    Args:
        text: The raw text to analyse.

    Returns:
        A dict with these keys:

        - ``positive_score`` / ``negative_score``: fraction of tokens whose
          own TextBlob polarity is above / below zero.
        - ``polarity_score`` / ``subjectivity_score``: TextBlob sentiment of
          the whole text.
        - ``avg_sentence_length`` / ``avg_words_per_sentence``: tokens per
          sentence (the same value under both keys).
        - ``complex_word_count`` / ``percentage_complex_words``: number and
          fraction of tokens for which ``count_syllables`` returns 3 or more.
        - ``fog_index``: ``0.4 * (avg_sentence_length + 100 * fraction of
          complex words)``.
        - ``word_count``: number of tokens returned by ``word_tokenize``.
        - ``syllables_per_word``: mean ``count_syllables`` value per token.
        - ``personal_pronouns``: number of matches of the pronoun pattern.
        - ``average_word_length``: mean token length in characters.

        Every ratio is ``0.0`` when its denominator is zero, so blank text
        yields a row of zeros instead of raising ``ZeroDivisionError``.
    """
    def ratio(numerator, denominator):
        """Divide, returning 0.0 when the denominator is zero."""
        return numerator / denominator if denominator else 0.0

    words = word_tokenize(text)
    sentences = sent_tokenize(text)

    word_count = len(words)
    sentence_count = len(sentences)
    average_word_length = ratio(sum(len(word) for word in words), word_count)

    # Sentiment analysis using TextBlob (as a placeholder)
    text_blob = TextBlob(text)
    polarity_score = text_blob.sentiment.polarity
    subjectivity_score = text_blob.sentiment.subjectivity

    # Score each token on its own. Testing the whole-text polarity inside the
    # per-word loop would make both scores collapse to exactly 0.0 or 1.0.
    # Each distinct token is scored once and the result reused.
    token_polarity = {word: TextBlob(word).sentiment.polarity for word in set(words)}
    positive_score = ratio(sum(1 for word in words if token_polarity[word] > 0), word_count)
    negative_score = ratio(sum(1 for word in words if token_polarity[word] < 0), word_count)

    # Additional metrics
    avg_sentence_length = ratio(word_count, sentence_count)
    # Count syllables once per token; both metrics below reuse the result.
    syllable_counts = [count_syllables(word) for word in words]
    complex_word_count = sum(1 for count in syllable_counts if count >= 3)
    percentage_complex_words = ratio(complex_word_count, word_count)
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words * 100)
    avg_words_per_sentence = avg_sentence_length
    syllables_per_word = ratio(sum(syllable_counts), word_count)
    # The match is case-insensitive, so upper-case "US" is counted as well.
    personal_pronouns = len(re.findall(r'\b(I|we|my|ours|us)\b', text, re.IGNORECASE))

    return {
        'positive_score': positive_score,
        'negative_score': negative_score,
        'polarity_score': polarity_score,
        'subjectivity_score': subjectivity_score,
        'avg_sentence_length': avg_sentence_length,
        'percentage_complex_words': percentage_complex_words,
        'fog_index': fog_index,
        'avg_words_per_sentence': avg_words_per_sentence,
        'complex_word_count': complex_word_count,
        'word_count': word_count,
        'syllables_per_word': syllables_per_word,
        'personal_pronouns': personal_pronouns,
        'average_word_length': average_word_length,
    }

In [None]:
# Analyze all extracted articles
results = []

# os.listdir returns entries in arbitrary order; sorting makes the order of
# the result rows reproducible from run to run.
for filename in sorted(os.listdir(output_dir)):
    if not filename.endswith('.txt'):
        continue
    with open(os.path.join(output_dir, filename), 'r', encoding='utf-8') as file:
        text = file.read()
    # The file is already closed here; the analysis only needs the text.
    analysis = analyze_text_enhanced(text)
    # splitext removes only the trailing extension, so an id that itself
    # contains ".txt" is left intact.
    analysis['url_id'] = os.path.splitext(filename)[0]
    results.append(analysis)

In [None]:
# Convert enhanced analysis results to DataFrame for easier viewing:
# one row per entry of `results`, one column per key of the analysis dicts.
output_df = pd.DataFrame(results)
# Bare last expression so the notebook displays the first rows.
output_df.head()

In [None]:
# NOTE(review): looks like a leftover scratch / sanity-check cell with no role
# in the analysis — confirm and remove.
print('hii')