In [None]:
!pip install goose3


Collecting goose3
  Downloading goose3-3.1.18-py3-none-any.whl (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.7/88.7 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting cssselect (from goose3)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl (18 kB)
Collecting langdetect (from goose3)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyahocorasick (from goose3)
  Downloading pyahocorasick-2.0.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.8/110.8 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: f

In [None]:
#importing all the required libraries
import pandas as pd
import openpyxl
import requests
import nltk
import re
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from goose3 import Goose


In [None]:
# Download NLTK's 'punkt' tokenizer data (presumably what the word_tokenize /
# sent_tokenize functions imported above rely on — confirm for the installed NLTK version).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
#extracting data from the urls
class URLCleaner:
    """Load a spreadsheet of article URLs and drop the rows that cannot be fetched.

    Attributes:
        df: DataFrame read from the Excel file; must contain a ``'URL'`` column.
        invalid_indices: Index labels rejected by the most recent
            ``clean_dataset`` call.
    """

    def __init__(self, dataset_path):
        """Read the Excel workbook at ``dataset_path`` into ``self.df``."""
        self.df = pd.read_excel(dataset_path)
        self.invalid_indices = []

    def url_check(self, url):
        """Return True when Goose can extract ``url``; otherwise log and return False.

        Any exception raised during extraction is reported on stdout and
        converted into a ``False`` result rather than propagated.
        """
        # Blank or NaN spreadsheet cells can never be fetched, so reject them
        # up front instead of spending a network round-trip on them.
        if not isinstance(url, str) or not url.strip():
            print(f"Error extracting content from {url}: empty or non-text URL")
            return False

        try:
            # The context manager closes the extractor once the probe is done.
            with Goose() as g:
                g.extract(url)
            return True
        except Exception as e:
            print(f"Error extracting content from {url}: {str(e)}")
            return False

    def clean_dataset(self):
        """Return a copy of ``self.df`` without the rows whose URL fails ``url_check``.

        ``self.df`` itself is left untouched; the rejected index labels are
        stored in ``self.invalid_indices``.
        """
        # Start from a clean slate so calling this twice does not pile up
        # duplicate index labels from the previous run.
        self.invalid_indices = []

        for index, url in self.df['URL'].items():
            if not self.url_check(url):
                print(f"Skipping invalid or empty URL at index {index}: {url}")
                self.invalid_indices.append(index)

        # Remove rows with invalid URLs from the DataFrame
        return self.df.drop(index=self.invalid_indices)

In [None]:
def extract_content(url):
    """Fetch ``url`` with Goose and return the article title and body as one string.

    Returns:
        ``"<title> <body>"`` when both parts are present, whichever part is
        non-empty when only one is, and the falsy remainder (``None`` or an
        empty string) when the page yielded neither.

    Exceptions raised by Goose while fetching are propagated to the caller.
    """
    # The context manager closes the extractor when the block exits, so
    # processing many URLs in a loop does not leave extractors open.
    with Goose() as g:
        article = g.extract(url)

        # Extracting title
        title = article.title.strip() if article.title else None
        # Extracting article text from paragraphs
        article_text = article.cleaned_text

    if title and article_text:
        return title + ' ' + article_text
    return title or article_text

In [None]:
#cleaning the text by removing the stopwords
def clean_text(article_text):
    """Return ``article_text`` lower-cased with stop words and non-alphabetic tokens removed.

    The text is tokenised with ``word_tokenize``; surviving tokens are joined
    with single spaces. Stop words come from the module-level
    ``stopwords_final`` collection, which must be defined before this is called.
    """
    # Tokens are compared in lower case, so normalise the stop words the same
    # way; a set also replaces a linear scan per token with a hash lookup.
    stopword_lookup = {stopword.lower() for stopword in stopwords_final}

    kept = []
    for token in word_tokenize(article_text):
        lowered = token.lower()
        if token.isalpha() and lowered not in stopword_lookup:
            kept.append(lowered)
    return ' '.join(kept)

In [None]:
#function to create a list of all the stopwords from all the text files
def extract_stopwords(stopwords_file_path, encoding='utf-8'):
    """Read one stop-word file and return its entries as a list of lower-case strings.

    Each line is split on ``'|'`` and every non-empty piece becomes an entry.
    Entries are lower-cased because the rest of the notebook compares them
    against lower-cased tokens; upper-case entries would never match.

    Args:
        stopwords_file_path: Path of the text file to read.
        encoding: Text encoding used to open the file; undecodable bytes are
            dropped (``errors='ignore'``).

    Returns:
        list[str]: Entries in file order (duplicates are kept).
    """
    stopwords_list = []

    with open(stopwords_file_path, 'r', encoding=encoding, errors='ignore') as file:
        for line in file:
            for part in line.split('|'):
                entry = part.strip().lower()
                # Blank lines and empty pieces around '|' carry no stop word.
                if entry:
                    stopwords_list.append(entry)
    return stopwords_list




In [None]:
#creating a dictionary for positive and negative words
def load_sentiment_dictionary(file_path_positive, file_path_negative):
    """Build a ``{word: 'positive' | 'negative'}`` lookup from two word-list files.

    Each file is read one word per line. Words are stripped and lower-cased;
    blank lines and words found in the module-level ``stopwords_final``
    collection are skipped.

    Args:
        file_path_positive: Path of the positive word list.
        file_path_negative: Path of the negative word list.

    Returns:
        dict[str, str]: The sentiment lookup. A word present in both files is
        labelled ``'negative'`` because that file is read last.
    """
    # Words are compared in lower case, so normalise the stop words likewise;
    # the set also avoids scanning the whole stop-word list for every word.
    stopword_lookup = {stopword.lower() for stopword in stopwords_final}
    sentiment_dict = {}

    labelled_sources = (
        (file_path_positive, 'positive'),
        (file_path_negative, 'negative'),
    )
    for path, label in labelled_sources:
        with open(path, 'r', encoding='utf-8', errors='ignore') as file:
            for line in file:
                word = line.strip().lower()
                if word and word not in stopword_lookup:
                    sentiment_dict[word] = label

    return sentiment_dict


In [None]:
#calculating sentimental metrics
def calculate_sentiment_scores(tokens, sentiment_dict):
    """Compute sentiment metrics for a list of tokens.

    Args:
        tokens: List of (already cleaned) word tokens.
        sentiment_dict: Mapping of word -> ``'positive'`` or ``'negative'``.

    Returns:
        tuple: ``(positive_score, negative_score, polarity_score, subjectivity_score)``
        where

        * ``positive_score`` / ``negative_score`` are non-negative counts of
          tokens labelled positive / negative,
        * ``polarity_score = (pos - neg) / (pos + neg)`` lies in ``[-1, 1]``,
        * ``subjectivity_score = (pos + neg) / len(tokens)`` lies in ``[0, 1]``.

        Both ratios are ``0.0`` when their denominator would be zero.
    """
    positive_score = 0
    negative_score = 0
    for token in tokens:
        label = sentiment_dict.get(token)
        if label == 'positive':
            positive_score += 1
        elif label == 'negative':
            # Kept as a positive count: a negated value would turn the
            # polarity numerator into a sum and the denominator into a difference.
            negative_score += 1

    sentiment_total = positive_score + negative_score
    # max(..., 1e-6) only guards against division by zero.
    polarity_score = (positive_score - negative_score) / max(sentiment_total, 1e-6)
    subjectivity_score = sentiment_total / max(len(tokens), 1e-6)

    return positive_score, negative_score, polarity_score, subjectivity_score

In [None]:
#for the analysis part
def analyze_readability(article_text):
    """Compute readability statistics for an article.

    Words are whitespace-separated chunks of ``article_text``; sentences come
    from ``sent_tokenize``. Syllables are counted with ``count_syllables``.

    Args:
        article_text: Raw article text. ``None`` or text without any word
            yields an all-zero result instead of a division error.

    Returns:
        tuple: ``(average_sentence_length, percentage_complex_words, fog_index,
        average_words_per_sentence, complex_word_count, syllable_count_per_word,
        personal_pronouns_count, average_word_length)`` where

        * ``percentage_complex_words`` is a fraction (complex words / words),
        * ``fog_index`` is ``0.4 * (average_sentence_length + percentage_complex_words)``,
        * ``average_words_per_sentence`` equals ``average_sentence_length``,
        * ``syllable_count_per_word`` is the syllable total over all words.
    """
    words = article_text.split() if article_text else []
    if not words:
        # Nothing to measure; every ratio below would divide by zero.
        return (0.0, 0.0, 0.0, 0.0, 0, 0, 0, 0.0)

    sentences = sent_tokenize(article_text)
    total_words = len(words)
    # Guard against a tokenizer that reports no sentence for non-empty text.
    sentence_count = max(len(sentences), 1)

    average_sentence_length = total_words / sentence_count

    # A word is "complex" when it has more than two syllables. Testing the
    # character length instead would flag nearly every word.
    syllables_by_word = [count_syllables(word) for word in words]
    complex_word_count = sum(1 for syllables in syllables_by_word if syllables > 2)
    percentage_complex_words = complex_word_count / total_words

    fog_index = 0.4 * (average_sentence_length + percentage_complex_words)

    average_words_per_sentence = total_words / sentence_count

    syllable_count_per_word = sum(syllables_by_word)

    pronoun_hits = re.findall(r'\b(?:i|we|my|ours|us)\b', article_text, flags=re.IGNORECASE)
    # All-caps "US" is almost always the country, not the pronoun.
    personal_pronouns_count = sum(1 for hit in pronoun_hits if hit != 'US')

    average_word_length = sum(len(word) for word in words) / total_words

    return (average_sentence_length,percentage_complex_words,fog_index,average_words_per_sentence,complex_word_count,syllable_count_per_word,personal_pronouns_count,average_word_length)

In [None]:
#function to count the syllables
def count_syllables(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Rules, applied in order:
      1. Lower-case the word and strip surrounding punctuation, so that
         ``"JUMPED"`` and ``"makes,"`` are treated like ``"jumped"`` / ``"makes"``.
      2. Drop a trailing ``"es"`` or ``"ed"``.
      3. Count each run of consecutive vowels (``a e i o u y``) as one syllable.
      4. Subtract one for a trailing ``"e"`` when more than one syllable was counted.
      5. Never return less than 1.

    Args:
        word: A single whitespace-delimited token.

    Returns:
        int: Estimated syllable count, at least 1.
    """
    vowels = "aeiouy"

    # Normalise first so the suffix checks below agree with the
    # case-insensitive vowel test and are not defeated by trailing punctuation.
    word = word.lower().strip('.,;:!?"\'()[]{}')

    # Remove trailing "es" or "ed" for exceptions
    if word.endswith(("es", "ed")):
        word = word[:-2]

    # Count vowels, but ignore consecutive vowels
    syllable_count = 0
    prev_char_was_vowel = False
    for char in word:
        is_vowel = char in vowels
        if is_vowel and not prev_char_was_vowel:
            syllable_count += 1
        prev_char_was_vowel = is_vowel

    # Adjust for silent 'e' at the end of the word
    if word.endswith("e") and syllable_count > 1:
        syllable_count -= 1

    # Ensure at least one syllable for short words
    return max(syllable_count, 1)


In [None]:
#COMPILING ALL THE CODE
# End-to-end run: validate the URLs, load the word lists, score every article
# and write one row of metrics per article to an Excel workbook.
dataset_path = '/content/Input (1).xlsx'
stopwords_file_path=['/content/StopWords_Auditor.txt','/content/StopWords_Currencies.txt','/content/StopWords_DatesandNumbers.txt','/content/StopWords_Generic.txt','/content/StopWords_GenericLong.txt','/content/StopWords_Geographic.txt','/content/StopWords_Names.txt']
positive_words_file_path='/content/positive-words.txt'
negative_words_file_path='/content/negative-words.txt'

#column order of the output workbook
columns=['URL','Positive score','Negative_score','Polarity score','Subjectivity score',\
         'average sentence length','percentage_complex_words','fog_index','average_words_per_sentence',\
         'complex_word_count','syllable_count_per_word','personal_pronouns_count','average_word_length']


#cleaning the dataset
url_cleaner = URLCleaner(dataset_path)
cleaned_dataframe = url_cleaner.clean_dataset()

#collecting the stopwords from every file; a set gives constant-time
#membership tests for the `in` checks done by the helper functions
stopwords_final=set()
for input_file in stopwords_file_path:
  stopwords_final.update(extract_stopwords(input_file))

#creating dictionaries for positive and negative words not present in stopwords
sentiment_dictionary=load_sentiment_dictionary(positive_words_file_path,negative_words_file_path)

#one dict per article; the DataFrame is built once after the loop because
#DataFrame.append is deprecated (removed in pandas 2.0) and copies the frame on every call
records=[]

#extracting the title and content
for index,row in cleaned_dataframe.iterrows():
  url=row['URL']
  article_text=extract_content(url)

  #a page without title and body cannot be tokenised or scored
  if not article_text:
    print(f"Skipping URL with no extractable text at index {index}: {url}")
    continue

  #cleaning the text and retrieving a string minus the stopwords
  cleaned_text=clean_text(article_text)

  #sentiment scores
  token=word_tokenize(cleaned_text)
  positive_score, negative_score, polarity_score, subjectivity_score=calculate_sentiment_scores(token,sentiment_dictionary)

  #readability analysis (computed on the raw, uncleaned text)
  average_sentence_length, percentage_complex_words, fog_index, average_words_per_sentence, complex_word_count, syllable_count_per_word, personal_pronouns_count, average_word_length = analyze_readability(article_text)

  # Collecting the values for the output file
  records.append({
    'URL': url,
    'Positive score': positive_score,
    'Negative_score': negative_score,
    'Polarity score': polarity_score,
    'Subjectivity score': subjectivity_score,
    'average sentence length': average_sentence_length,
    'percentage_complex_words': percentage_complex_words,
    'fog_index': fog_index,
    'average_words_per_sentence': average_words_per_sentence,
    'complex_word_count': complex_word_count,
    'syllable_count_per_word': syllable_count_per_word,
    'personal_pronouns_count': personal_pronouns_count,
    'average_word_length': average_word_length
  })

#passing `columns` keeps the column order (and the headers even with no rows)
output=pd.DataFrame(records,columns=columns)

new_excel_file='Output.xlsx'
output.to_excel(new_excel_file,index=False)










Error extracting content from https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/: NetworkError: status code: Not Found; reason: 404
Skipping invalid or empty URL at index 35: https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/
Error extracting content from https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/: NetworkError: status code: Not Found; reason: 404
Skipping invalid or empty URL at index 48: https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/


  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
  output = output.append({
 