In [None]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Scrape the title and article body of every URL listed in Input.xlsx and
# store each page as scraped_txts/<URL_ID>.txt.
OUTPUT_DIR = "scraped_txts"
REQUEST_TIMEOUT_SECONDS = 30  # without a timeout, requests can block forever on a dead host

# read the input file using pandas
df = pd.read_excel("Input.xlsx")

# create the output directory if it doesn't exist (no check-then-create race)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# iterate through each row in the dataframe
for index, row in df.iterrows():
    url = row["URL"]
    url_id = row["URL_ID"]

    try:
        # The request is made inside the try block so that a single
        # unreachable URL is reported and skipped instead of aborting the run.
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        # Treat HTTP error pages (4xx/5xx) as failures rather than parsing them.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        # Extract the title
        heading = soup.find("h1")
        if heading is None:
            raise ValueError("no <h1> element found on the page")
        title = heading.get_text().strip()

        # Extract the article text
        article = soup.find("article")
        if article is None:
            raise ValueError("no <article> element found on the page")
        paragraphs = article.find_all("p")
        article_text = "\n".join(p.get_text().strip() for p in paragraphs)

        # Title, a blank line, then one paragraph per line.
        with open(f"{OUTPUT_DIR}/{url_id}.txt", "w", encoding="utf-8") as file:
            file.write(title + "\n\n" + article_text)

    except Exception as error:
        # Best-effort scraping: log the failure and move on to the next URL.
        print(f"Error occurred for URL: {url}\nError message: {str(error)}")
   
    

In [None]:
# !zip -r ./scraped_txts.zip ./scraped_txts/

In [None]:
# from google.colab import files
# files.download('/content/scraped_txts.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import glob

def load_stop_words(paths, encoding="cp1252"):
    """Build a set of stop words from one-entry-per-line text files.

    Entries are stripped of surrounding whitespace and lower-cased, because
    the filtering step compares them against ``word.lower()``; an entry
    stored in upper case would otherwise never match. Blank lines are skipped.

    Parameters
    ----------
    paths : iterable of str
        Stop-word files to read.
    encoding : str, default "cp1252"
        Text encoding used to decode every file.

    Returns
    -------
    set of str
        The normalised stop words; empty when ``paths`` is empty.
    """
    collected = set()
    for path in paths:
        with open(path, "r", encoding=encoding) as handle:
            for line in handle.read().splitlines():
                entry = line.strip().lower()
                if entry:
                    collected.add(entry)
    return collected


# Getting a list of all txt files in the directory
txt_files = glob.glob("/content/StopWords/*.txt")

# Merge every stop-word file into a single lookup set
stop_words = load_stop_words(txt_files)

In [None]:
import nltk
from textblob import TextBlob

In [None]:
# Download the 'punkt' tokenizer data into the local nltk_data directory.
# Presumably needed by word_tokenize, which the stop-word filtering cell calls.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
directory = "scraped_txts"

# Rewrite every scraped article in place with its stop words removed.
for filename in os.listdir(directory):
    # Only the .txt files are articles; skip anything else in the folder.
    if not filename.endswith(".txt"):
        continue

    file_path = os.path.join(directory, filename)

    with open(file_path, "r", encoding="utf-8") as source:
        text = source.read()

    # Tokenize, then keep only the tokens whose lower-cased form is not a stop word.
    words = word_tokenize(text)
    filtered_words = []
    for word in words:
        if word.lower() in stop_words:
            continue
        filtered_words.append(word)

    # Tokens are re-joined with single spaces, so punctuation ends up as
    # stand-alone tokens in the rewritten file.
    clean_text = " ".join(filtered_words)

    with open(file_path, "w", encoding="utf-8") as sink:
        sink.write(clean_text)

In [None]:
pip install syllables

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting syllables
  Downloading syllables-1.0.7-py3-none-any.whl (15 kB)
Collecting cmudict<2.0.0,>=1.0.11 (from syllables)
  Downloading cmudict-1.0.13-py3-none-any.whl (939 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m939.3/939.3 kB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting importlib-metadata<6.0.0,>=5.1.0 (from syllables)
  Downloading importlib_metadata-5.2.0-py3-none-any.whl (21 kB)
Installing collected packages: importlib-metadata, cmudict, syllables
Successfully installed cmudict-1.0.13 importlib-metadata-5.2.0 syllables-1.0.7


In [None]:
import glob
import os
import re
import string

import pandas as pd
import syllables


def _read_lexicon(path, encoding):
    """Load a one-word-per-line lexicon file as a set of lower-cased entries.

    Surrounding whitespace is stripped and blank lines are dropped so the
    entries compare cleanly against lower-cased tokens.
    """
    with open(path, "r", encoding=encoding) as handle:
        stripped = (line.strip() for line in handle.read().splitlines())
        return {entry.lower() for entry in stripped if entry}


positive_words = _read_lexicon("positive-words.txt", "utf-8")
negative_words = _read_lexicon("negative-words.txt", "cp1252")

# Same relative folder the scraping and stop-word cells write to; sorted so
# the output rows come out in a stable order.
txt_files = sorted(glob.glob("scraped_txts/*.txt"))

OUTPUT_COLUMNS = ["URL_ID", "URL", "POSITIVE SCORE", "NEGATIVE SCORE", "POLARITY SCORE",
                  "SUBJECTIVITY SCORE", "AVG SENTENCE LENGTH", "PERCENTAGE OF COMPLEX WORDS",
                  "FOG INDEX", "AVG NUMBER OF WORDS PER SENTENCE", "COMPLEX WORD COUNT",
                  "WORD COUNT", "SYLLABLE PER WORD", "PERSONAL PRONOUNS", "AVG WORD LENGTH"]

# Loop-invariant lookups, built once.
PUNCTUATION = set(string.punctuation)
# Lower-case on purpose: tokens are lower-cased before the membership test,
# so an upper-case "I" entry could never match.
PERSONAL_PRONOUNS = {"i", "me", "my", "mine", "we", "us", "our", "ours"}

# Map each file stem back to its row of the input sheet. The key is formatted
# exactly like the scraper formats the file name (f"{url_id}.txt"), so every
# output row carries its own URL_ID/URL instead of whatever the scraping loop
# happened to process last. Relies on `df`, the input sheet loaded by the
# scraping cell.
url_lookup = {f"{row['URL_ID']}": (row["URL_ID"], row["URL"]) for _, row in df.iterrows()}

records = []
for file_name in txt_files:
    with open(file_name, "r", encoding="utf-8") as file:
        text = file.read()

    file_stem = os.path.splitext(os.path.basename(file_name))[0]
    # Files without a matching input row keep their stem as the ID and no URL.
    doc_url_id, doc_url = url_lookup.get(file_stem, (file_stem, None))

    words = [word for word in text.split() if word not in PUNCTUATION]
    lowered_words = [word.lower() for word in words]
    word_count = len(words)
    sentence_count = text.count(".") + text.count("!") + text.count("?")

    # Denominators are clamped to 1 so an empty file, or a text without any
    # sentence terminator, yields zeros / a single sentence instead of raising
    # ZeroDivisionError.
    word_divisor = max(word_count, 1)
    sentence_divisor = max(sentence_count, 1)

    avg_sentence_length = word_count / sentence_divisor

    # Lexicon entries are lower-cased, so compare lower-cased tokens.
    positive_score = sum(word in positive_words for word in lowered_words)
    negative_score = sum(word in negative_words for word in lowered_words)
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)

    # Estimate syllables once per word and reuse the result for both metrics.
    syllable_counts = [syllables.estimate(word) for word in words]
    complex_word_count = sum(count > 2 for count in syllable_counts)
    percentage_complex_words = (complex_word_count / word_divisor) * 100
    syllables_per_word = sum(syllable_counts) / word_divisor

    # Split on the same terminators that sentence_count counts, and ignore
    # stand-alone punctuation tokens just like `words` does.
    sentences = [sentence.strip() for sentence in re.split(r"[.!?]", text) if sentence.strip()]
    avg_words_per_sentence = sum(
        sum(token not in PUNCTUATION for token in sentence.split())
        for sentence in sentences
    ) / sentence_divisor

    personal_pronoun_count = sum(word in PERSONAL_PRONOUNS for word in lowered_words)

    avg_word_length = sum(len(word) for word in words) / word_divisor

    records.append({
        "URL_ID": doc_url_id,
        "URL": doc_url,
        "POSITIVE SCORE": positive_score,
        "NEGATIVE SCORE": negative_score,
        "POLARITY SCORE": polarity_score,
        "SUBJECTIVITY SCORE": subjectivity_score,
        "AVG SENTENCE LENGTH": avg_sentence_length,
        "PERCENTAGE OF COMPLEX WORDS": percentage_complex_words,
        "FOG INDEX": 0.4 * (avg_words_per_sentence + percentage_complex_words),
        "AVG NUMBER OF WORDS PER SENTENCE": avg_words_per_sentence,
        "COMPLEX WORD COUNT": complex_word_count,
        "WORD COUNT": word_count,
        "SYLLABLE PER WORD": syllables_per_word,
        "PERSONAL PRONOUNS": personal_pronoun_count,
        "AVG WORD LENGTH": avg_word_length
    })

# Build the frame once from the collected rows: DataFrame.append is deprecated
# (removed in pandas 2.0) and re-copies the whole frame on every call.
output_df = pd.DataFrame(records, columns=OUTPUT_COLUMNS)

output_df.to_excel("output.xlsx", index=False)

  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_d