<a href="https://colab.research.google.com/github/Slayerma/mid-term/blob/main/Untitled13.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install beautifulsoup4
!pip install openpyxl
!pip install textblob
!python -m textblob.download_corpora
!pip install syllables

import requests
from bs4 import BeautifulSoup
from textblob import TextBlob
import syllables  # Install using: pip install syllables
import pandas as pd

# Function to extract article text from a given URL
def extract_text(url, timeout=30):
    """Download a page and return its title and paragraph text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole batch.

    Returns:
        A ``(title, text)`` tuple. ``title`` is the text of the ``<title>``
        tag ('' when the page has none) and ``text`` is the text of every
        ``<p>`` element joined with single spaces. ``('', '')`` is returned
        when anything fails; the error is printed instead of raised.

    NOTE: the HTTP status code is not checked, so the paragraphs of an error
    page are returned as if they were the article.
    """
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract only article title and text
        article_title = soup.title.text if soup.title else ''
        article_text = ' '.join(p.text for p in soup.find_all('p'))

        return article_title, article_text
    except Exception as e:
        # Deliberately broad: one bad URL must not abort the whole run.
        print(f"Error extracting text from {url}: {str(e)}")
        return '', ''

# Function to perform textual analysis and compute variables
def analyze_text(text):
    """Compute sentiment and readability metrics for one article.

    Args:
        text: Raw article text. May be empty, which is what ``extract_text``
            returns when a download fails.

    Returns:
        A 13-item list, in this order: positive score, negative score,
        polarity score, subjectivity score, average sentence length,
        percentage of complex words, fog index, average words per sentence,
        complex word count, word count, syllables per word, personal pronoun
        count and average word length. Every item is zero for blank text.
    """
    # Blank input has no words or sentences; return zeros instead of
    # dividing by zero further down.
    if not text or not text.strip():
        return [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0.0, 0, 0.0]

    blob = TextBlob(text)
    words = blob.words
    sentences = blob.sentences
    word_count = len(words)
    sentence_count = len(sentences)

    def ratio(numerator, denominator):
        # Non-blank text (e.g. only punctuation) can still yield no words.
        return numerator / denominator if denominator else 0.0

    # The positive and negative scores are the same polarity value with
    # opposite signs.
    sentiment = blob.sentiment
    polarity_score = sentiment.polarity
    positive_score = polarity_score
    negative_score = -polarity_score
    subjectivity_score = sentiment.subjectivity

    avg_sentence_length = ratio(
        sum(len(sentence.words) for sentence in sentences), sentence_count
    )

    # Estimate syllables once per word and reuse the counts for both the
    # complex-word metrics and the syllables-per-word average.
    syllable_counts = [syllables.estimate(word) for word in words]
    complex_word_count = sum(1 for count in syllable_counts if count > 2)
    percentage_complex_words = ratio(complex_word_count, word_count) * 100

    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    avg_words_per_sentence = ratio(word_count, sentence_count)
    syllables_per_word = ratio(sum(syllable_counts), word_count)

    # Only first-person singular forms are counted here.
    first_person = {'i', 'me', 'my', 'mine', 'myself'}
    personal_pronouns = sum(1 for word in words if word.lower() in first_person)

    avg_word_length = ratio(sum(len(word) for word in words), word_count)

    return [positive_score, negative_score, polarity_score, subjectivity_score,
            avg_sentence_length, percentage_complex_words, fog_index,
            avg_words_per_sentence, complex_word_count, word_count,
            syllables_per_word, personal_pronouns, avg_word_length]

# Read the spreadsheet that lists the articles to process
input_df = pd.read_excel('/content/Input.xlsx')

# Report header: URL_ID followed by the metrics in the order analyze_text
# returns them
output_columns = [
    'URL_ID', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
    'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS',
    'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT',
    'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH',
]

# Fetch and score every article, collecting one report row per input row
output_data = []
for index, row in input_df.iterrows():
    url_id, url = row['URL_ID'], row['URL']

    # Download the page, then compute its metrics
    article_title, article_text = extract_text(url)
    variables = analyze_text(article_text)

    output_data.append([url_id] + list(variables))

# Assemble the report and write it out as an Excel workbook
output_df = pd.DataFrame(output_data, columns=output_columns)
output_df.to_excel('Output_Data.xlsx', index=False)


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
Finished.
Collecting syllables
  Downloading syllables-1.0.9-py3-none-any.whl (15 kB)
Collecting cmudict<2.0.0,>=1.0.11 (from syllables)
  Downloading cmudict-1.0.16-py3-none-any.whl (939 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

In [7]:

import requests
from bs4 import BeautifulSoup
from textblob import TextBlob
import syllables
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download the NLTK stopwords dataset
import nltk
nltk.download('stopwords')

# Function to extract article text from a given URL
def extract_text(url, timeout=30):
    """Download a page and return its title and paragraph text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole batch.

    Returns:
        A ``(title, text)`` tuple. ``title`` is the text of the ``<title>``
        tag ('' when the page has none) and ``text`` is the text of every
        ``<p>`` element joined with single spaces. ``('', '')`` is returned
        when anything fails; the error is printed instead of raised.

    NOTE: the HTTP status code is not checked, so the paragraphs of an error
    page are returned as if they were the article.
    """
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract only article title and text
        article_title = soup.title.text if soup.title else ''
        article_text = ' '.join(p.text for p in soup.find_all('p'))

        return article_title, article_text
    except Exception as e:
        # Deliberately broad: one bad URL must not abort the whole run.
        print(f"Error extracting text from {url}: {str(e)}")
        return '', ''

# Function to clean text using stop words lists
def clean_text(text):
    """Return ``text`` lowercased with stop words and non-alphanumeric tokens removed.

    The text is split with ``word_tokenize``; a token is kept only when it is
    purely alphanumeric and its lowercase form is not in NLTK's English
    stop-word list. Kept tokens are lowercased and joined with single spaces.
    """
    english_stops = set(stopwords.words('english'))

    kept = []
    for token in word_tokenize(text):
        if not token.isalnum():
            continue
        lowered = token.lower()
        if lowered in english_stops:
            continue
        kept.append(lowered)

    return ' '.join(kept)

# Function to perform textual analysis and compute variables
def analyze_text(text):
    """Compute dictionary-based sentiment and readability metrics for one article.

    Word-level metrics use the output of ``clean_text`` (lowercased, stop
    words and punctuation removed). Sentences and personal pronouns are
    counted on the raw ``text``, because cleaning strips the punctuation that
    marks sentence ends and can drop the pronouns themselves.

    Args:
        text: Raw article text. May be empty, which is what ``extract_text``
            returns when a download fails.

    Returns:
        A 13-item list, in this order: positive score, negative score,
        polarity score, subjectivity score, average sentence length,
        percentage of complex words, fog index, average words per sentence,
        complex word count, word count, syllables per word, personal pronoun
        count and average word length. Every item is zero for blank text.
    """
    # Blank input has no words or sentences; return zeros instead of
    # dividing by zero further down.
    if not text or not text.strip():
        return [0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0.0, 0, 0.0]

    cleaned_text = clean_text(text)
    words = TextBlob(cleaned_text).words
    word_count = len(words)

    # The cleaned text has no sentence punctuation left, so it cannot be
    # split into sentences; count them on the raw text instead.
    sentence_count = len(TextBlob(text).sentences)

    def ratio(numerator, denominator):
        # Cleaning can remove every token (e.g. text made only of stop words).
        return numerator / denominator if denominator else 0.0

    # Sentimental Analysis
    positive_score = sum(1 for word in words if word in positive_words)
    negative_score = sum(1 for word in words if word in negative_words)
    # The small constant keeps the denominators non-zero.
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)

    # Analysis of Readability
    # Cleaned words per raw sentence; both "average" columns share this value.
    avg_sentence_length = ratio(word_count, sentence_count)
    # Estimate syllables once per word and reuse the counts below.
    syllable_counts = [syllables.estimate(word) for word in words]
    complex_word_count = sum(1 for count in syllable_counts if count > 2)
    percentage_complex_words = ratio(complex_word_count, word_count) * 100
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    # Additional Analysis
    avg_words_per_sentence = avg_sentence_length
    syllables_per_word = ratio(sum(syllable_counts), word_count)

    # Search the raw text: clean_text removes any pronoun that is on the
    # stop-word list and lowercases the rest, which would also make the
    # all-caps "US" look like the pronoun "us". "US" is skipped on purpose.
    personal_pronouns = sum(
        1
        for match in re.finditer(r'\b(?:I|we|my|ours|us)\b', text, flags=re.IGNORECASE)
        if match.group() != 'US'
    )
    avg_word_length = ratio(sum(len(word) for word in words), word_count)

    return [positive_score, negative_score, polarity_score, subjectivity_score,
            avg_sentence_length, percentage_complex_words, fog_index,
            avg_words_per_sentence, complex_word_count, word_count,
            syllables_per_word, personal_pronouns, avg_word_length]

# Sentiment lexicons from the master dictionary: one word per line, read as
# the first column of a header-less file and kept as sets for fast lookup
positive_lexicon = pd.read_csv('/content/positive-words.txt', header=None, encoding='latin1')
positive_words = set(positive_lexicon[0])
negative_lexicon = pd.read_csv('/content/negative-words.txt', header=None, encoding='latin1')
negative_words = set(negative_lexicon[0])

# Read the spreadsheet that lists the articles to process
input_df = pd.read_excel('/content/Input.xlsx')

# Report header: URL_ID followed by the metrics in the order analyze_text
# returns them
output_columns = [
    'URL_ID', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
    'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS',
    'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT',
    'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH',
]

# Fetch and score every article, collecting one report row per input row
output_data = []
for index, row in input_df.iterrows():
    url_id, url = row['URL_ID'], row['URL']

    # Download the page, then compute its metrics
    article_title, article_text = extract_text(url)
    variables = analyze_text(article_text)

    output_data.append([url_id] + list(variables))

# Assemble the report and write it out as an Excel workbook
output_df = pd.DataFrame(output_data, columns=output_columns)
output_df.to_excel('/content/Output_Data.xlsx', index=False)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
