In [1]:
pip install requests beautifulsoup4 pandas nltk textstat openpyxl


Collecting textstat
  Obtaining dependency information for textstat from https://files.pythonhosted.org/packages/11/df/bb284dfb23890319ace2a416a5a39e77e29b8f52f5d80bc13b12dc1fc1f5/textstat-0.7.4-py3-none-any.whl.metadata
  Downloading textstat-0.7.4-py3-none-any.whl.metadata (14 kB)
Downloading textstat-0.7.4-py3-none-any.whl (105 kB)
   ---------------------------------------- 0.0/105.1 kB ? eta -:--:--
   --- ------------------------------------ 10.2/105.1 kB ? eta -:--:--
   ---------------------- ---------------- 61.4/105.1 kB 812.7 kB/s eta 0:00:01
   -------------------------------------- 105.1/105.1 kB 862.7 kB/s eta 0:00:00
Installing collected packages: textstat
Successfully installed textstat-0.7.4
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

# Spreadsheet that lists the articles to analyse; the code below reads its 'URL' and 'URL_ID' columns.
input_file = 'Input.xlsx'

# Load the sheet once, then pull out the two columns that the later cells iterate over.
data = pd.read_excel(input_file)
urls, url_ids = data['URL'], data['URL_ID']


In [3]:
import requests
from bs4 import BeautifulSoup

# Seconds to wait on a server before giving up; without a timeout requests.get can block indefinitely.
REQUEST_TIMEOUT = 30

# Download every article and cache it as '<URL_ID>.txt': title on the first line, paragraph text after it.
for url, url_id in zip(urls, url_ids):
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    if not response.ok:
        # The response is still saved so the analysis cell finds a file for this id,
        # but flag it: the stored text comes from an HTTP error page, not the article.
        print(f'Warning: {url_id} ({url}) returned HTTP {response.status_code}')

    soup = BeautifulSoup(response.content, 'html.parser')

    # soup.find returns None when the page has no <title>; fall back to an empty title
    # instead of failing with AttributeError on .get_text().
    title_tag = soup.find('title')  # This might need to change based on actual HTML structure
    title = title_tag.get_text() if title_tag is not None else ''
    article_text = ' '.join(p.get_text() for p in soup.find_all('p'))  # This might need to change

    with open(f'{url_id}.txt', 'w', encoding='utf-8') as file:
        file.write(title + '\n' + article_text)


In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import textstat

# Fetch the NLTK resources this notebook asks for: the stop-word corpus and the 'punkt' tokenizer data.
for nltk_package in ('stopwords', 'punkt'):
    nltk.download(nltk_package)

def compute_positive_score(text):
    """Count the tokens of ``text`` that appear in the positive-word lexicon.

    The lexicon is read from 'positive-words.txt' in the current working
    directory on every call and split on whitespace. The text is lower-cased
    before tokenising, so matching is case-insensitive with respect to the text.

    Args:
        text: The document to score.

    Returns:
        The number of tokens (counting repeats) found in the lexicon.

    Raises:
        FileNotFoundError: If 'positive-words.txt' does not exist.
    """
    # The context manager closes the file handle deterministically instead of
    # leaving it to the garbage collector.
    with open('positive-words.txt') as lexicon_file:  # You need to have this file
        positive_words = set(lexicon_file.read().split())
    words = word_tokenize(text.lower())
    return sum(1 for word in words if word in positive_words)

def compute_negative_score(text):
    """Count the tokens of ``text`` that appear in the negative-word lexicon.

    The lexicon is read from 'negative-words.txt' in the current working
    directory on every call and split on whitespace. The text is lower-cased
    before tokenising, so matching is case-insensitive with respect to the text.

    Args:
        text: The document to score.

    Returns:
        The number of tokens (counting repeats) found in the lexicon.

    Raises:
        FileNotFoundError: If 'negative-words.txt' does not exist.
    """
    # The context manager closes the file handle deterministically instead of
    # leaving it to the garbage collector.
    with open('negative-words.txt') as lexicon_file:  # You need to have this file
        negative_words = set(lexicon_file.read().split())
    words = word_tokenize(text.lower())
    return sum(1 for word in words if word in negative_words)

def compute_polarity_score(positive_score, negative_score):
    """Return the polarity of a document from its positive and negative counts.

    Polarity is (positive - negative) / (positive + negative + 0.000001); the
    small constant keeps the denominator non-zero when both counts are zero.
    """
    difference = positive_score - negative_score
    total = (positive_score + negative_score) + 0.000001
    return difference / total

def compute_subjectivity_score(positive_score, negative_score, word_count):
    """Return the share of sentiment-bearing words among all words.

    Subjectivity is (positive + negative) / (word_count + 0.000001); the small
    constant keeps the denominator non-zero when ``word_count`` is zero.
    """
    sentiment_total = positive_score + negative_score
    denominator = word_count + 0.000001
    return sentiment_total / denominator

def compute_avg_sentence_length(text):
    """Return the average number of tokens per sentence in ``text``.

    Sentences come from ``sent_tokenize`` and tokens from ``word_tokenize``.

    Args:
        text: The document to measure.

    Returns:
        Token count divided by sentence count, or 0.0 when no sentences are
        found (for example, empty text).
    """
    sentences = sent_tokenize(text)
    if not sentences:
        # Empty or whitespace-only text would otherwise raise ZeroDivisionError.
        return 0.0
    words = word_tokenize(text)
    return len(words) / len(sentences)

def compute_percentage_complex_words(text):
    """Return the share of tokens in ``text`` that are complex words.

    A token is complex when ``textstat.syllable_count`` reports more than two
    syllables for it. Despite the name, the result is a fraction between 0 and
    1, not a value scaled to 100.

    Args:
        text: The document to measure.

    Returns:
        Complex-token count divided by total token count, or 0.0 when the text
        yields no tokens.
    """
    words = word_tokenize(text)
    if not words:
        # Empty text would otherwise raise ZeroDivisionError.
        return 0.0
    complex_words = [word for word in words if textstat.syllable_count(word) > 2]
    return len(complex_words) / len(words)

def compute_fog_index(text):
    """Return the Gunning fog readability score that ``textstat.gunning_fog`` reports for ``text``."""
    fog_index = textstat.gunning_fog(text)
    return fog_index

def compute_avg_words_per_sentence(text):
    """Return the average number of tokens per sentence in ``text``.

    Sentences come from ``sent_tokenize`` and tokens from ``word_tokenize``.

    Args:
        text: The document to measure.

    Returns:
        Token count divided by sentence count, or 0.0 when no sentences are
        found (for example, empty text).
    """
    sentences = sent_tokenize(text)
    if not sentences:
        # Empty or whitespace-only text would otherwise raise ZeroDivisionError.
        return 0.0
    words = word_tokenize(text)
    return len(words) / len(sentences)

def compute_complex_word_count(text):
    """Count the tokens of ``text`` for which textstat reports more than two syllables."""
    complex_total = 0
    for token in word_tokenize(text):
        if textstat.syllable_count(token) > 2:
            complex_total += 1
    return complex_total

def compute_word_count(text):
    """Return how many tokens ``word_tokenize`` produces for ``text``."""
    return len(word_tokenize(text))

def compute_syllable_per_word(text):
    """Return the average syllable count per token in ``text``.

    Syllables are counted per token with ``textstat.syllable_count``.

    Args:
        text: The document to measure.

    Returns:
        Total syllables divided by token count, or 0.0 when the text yields no
        tokens.
    """
    words = word_tokenize(text)
    if not words:
        # Empty text would otherwise raise ZeroDivisionError.
        return 0.0
    syllable_count = sum(textstat.syllable_count(word) for word in words)
    return syllable_count / len(words)

def compute_personal_pronouns(text):
    """Count occurrences of the pronouns 'i', 'we', 'my', 'ours' and 'us' in ``text``.

    The text is lower-cased before tokenising, so matching ignores case.
    """
    # NOTE(review): because of the lower-casing, an upper-case "US" token is
    # counted as the pronoun "us" as well.
    pronouns = {'i', 'we', 'my', 'ours', 'us'}
    matches = [token for token in word_tokenize(text.lower()) if token in pronouns]
    return len(matches)

def compute_avg_word_length(text):
    """Return the average number of characters per token in ``text``.

    Args:
        text: The document to measure.

    Returns:
        Total characters across all tokens divided by token count, or 0.0 when
        the text yields no tokens.
    """
    words = word_tokenize(text)
    if not words:
        # Empty text would otherwise raise ZeroDivisionError.
        return 0.0
    return sum(len(word) for word in words) / len(words)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vijay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vijay\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
import os

# One row of metrics per article, in the same order as the output columns below.
results = []

for url_id in url_ids:
    # Each article is read back from the '<URL_ID>.txt' file in the working directory.
    with open(f'{url_id}.txt', 'r', encoding='utf-8') as file:
        text = file.read()

    # Count the words once and reuse the value for both the subjectivity score
    # and the WORD COUNT column, instead of tokenising the text twice.
    word_count = compute_word_count(text)

    positive_score = compute_positive_score(text)
    negative_score = compute_negative_score(text)
    polarity_score = compute_polarity_score(positive_score, negative_score)
    subjectivity_score = compute_subjectivity_score(positive_score, negative_score, word_count)
    avg_sentence_length = compute_avg_sentence_length(text)
    percentage_complex_words = compute_percentage_complex_words(text)
    fog_index = compute_fog_index(text)
    avg_words_per_sentence = compute_avg_words_per_sentence(text)
    complex_word_count = compute_complex_word_count(text)
    syllable_per_word = compute_syllable_per_word(text)
    personal_pronouns = compute_personal_pronouns(text)
    avg_word_length = compute_avg_word_length(text)

    results.append([
        url_id, positive_score, negative_score, polarity_score, subjectivity_score,
        avg_sentence_length, percentage_complex_words, fog_index, avg_words_per_sentence,
        complex_word_count, word_count, syllable_per_word, personal_pronouns, avg_word_length
    ])

# Build the frame in one step from the collected rows; the column order must match the row layout above.
output_df = pd.DataFrame(results, columns=[
    'URL_ID', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE',
    'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE',
    'COMPLEX WORD COUNT', 'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH'
])

# Write the metrics table to Excel without the DataFrame index column.
output_df.to_excel('Output Data Structure.xlsx', index=False)
