MAin


In [1]:
!pip install textstat


Collecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyphen (from textstat)
  Downloading pyphen-0.14.0-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.14.0 textstat-0.7.3


In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from textblob import TextBlob
from textblob import Word
import nltk
import textstat
# Fetch the NLTK resources this notebook relies on (stop-word corpus and the
# punkt tokenizer models).
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)


# Load the spreadsheet of articles to process; later cells read its
# 'URL_ID' and 'URL' columns.
input_data = pd.read_excel('/content/Input.xlsx')

# Function to extract article text from URL
def extract_article_text(url, url_id, timeout=30):
    """Download a page and save its title and paragraph text to ``{url_id}.txt``.

    Args:
        url: Address of the page to fetch.
        url_id: Identifier used as the base name of the output text file.
        timeout: Seconds to wait for the server before giving up, so that a
            single unresponsive host cannot stall the whole run.

    Returns:
        A ``(success, message)`` tuple. ``success`` is True when the text file
        was written; otherwise it is False and ``message`` carries the reason.
        The function never raises: every error is reported through the tuple.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat HTTP error statuses (404, 500, ...) as failures so an error
        # page is never saved and later analysed as if it were the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # The title comes from the <title> tag; the body is the text of every
        # <p> element on the page, one paragraph per line.
        title_tag = soup.find('title')
        title = title_tag.get_text() if title_tag else ''
        article_text = '\n'.join(p.get_text() for p in soup.find_all('p'))

        # Save extracted text into a text file with URL_ID as filename
        with open(f"{url_id}.txt", "w", encoding="utf-8") as file:
            file.write(f"Title: {title}\n\n{article_text}")

        return True, "Extraction successful"
    except Exception as e:
        # Deliberately broad: callers expect a status tuple, not an exception.
        return False, f"Extraction failed: {str(e)}"

# Download every article listed in the input sheet and report, per URL_ID,
# whether the extraction worked.
for url_id, url in zip(input_data['URL_ID'], input_data['URL']):
    success, message = extract_article_text(url, url_id)
    print(f"URL ID: {url_id} - {message}")

# Function for text analysis and computing variables

# Lexicon files: header-less CSVs whose first column holds one entry per row.
_POSITIVE_WORDS_FILE = '/content/positive-words.csv'
_NEGATIVE_WORDS_FILE = '/content/negative-words.csv'
_STOP_WORDS_FILES = (
    '/content/StopWords_Auditor.csv', '/content/StopWords_Currencies.csv', '/content/StopWords_DatesandNumbers.csv',
    '/content/StopWords_Generic.csv', '/content/StopWords_GenericLong.csv', '/content/StopWords_Geographic.csv', '/content/StopWords_Names.csv'
)

# Personal pronouns are matched on the raw text, before stop-word removal.
_PERSONAL_PRONOUNS_RE = re.compile(r'\b(?:i|me|my|mine|we|us|our|ours)\b', re.IGNORECASE)

# Filled on first use so the CSV files are read once, not once per article.
_LEXICON_CACHE = {}


def _read_word_set(path):
    """Return the first column of a header-less CSV as a set of lower-cased words."""
    column = pd.read_csv(path, header=None)[0].dropna()
    # Tokens are lower-cased before matching, so entries are lower-cased too;
    # otherwise an entry containing capital letters could never match.
    return {str(entry).strip().lower() for entry in column}


def _load_lexicons():
    """Return ``(positive_words, negative_words, stop_words)``, loading them once."""
    if not _LEXICON_CACHE:
        stop_words = set(stopwords.words('english'))
        for path in _STOP_WORDS_FILES:
            stop_words |= _read_word_set(path)
        positive_words = _read_word_set(_POSITIVE_WORDS_FILE)
        negative_words = _read_word_set(_NEGATIVE_WORDS_FILE)
        # Populate the cache only after every file loaded successfully.
        _LEXICON_CACHE.update(
            positive=positive_words, negative=negative_words, stop=stop_words
        )
    return _LEXICON_CACHE['positive'], _LEXICON_CACHE['negative'], _LEXICON_CACHE['stop']


def analyze_text(text):
    """Compute sentiment and readability metrics for one article.

    Args:
        text: Full text of the article.

    Returns:
        A list of 13 values, in this order: positive score, negative score,
        polarity score, subjectivity score, average sentence length,
        percentage of complex words, fog index, average number of (cleaned)
        words per sentence, complex word count, (cleaned) word count,
        syllables per word, personal pronoun count, average word length.
        Ratios whose denominator would be zero (no sentences, or no words left
        after stop-word removal) are reported as 0.0 instead of raising.
    """
    positive_words, negative_words, stop_words = _load_lexicons()

    # Tokenize the lower-cased text and drop stop words.
    words = [token for token in re.findall(r'\b\w+\b', text.lower()) if token not in stop_words]
    words_count = len(words)

    # Sentiment scores; the small constant keeps the polarity denominator non-zero.
    positive_score = sum(word in positive_words for word in words)
    negative_score = sum(word in negative_words for word in words)
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)

    blob = TextBlob(text)
    subjectivity_score = blob.sentiment.subjectivity
    sentences = blob.sentences
    sentence_count = len(sentences)

    if sentence_count:
        # Sentence length is measured on the raw sentences (stop words included).
        avg_sentence_length = sum(len(sentence.split()) for sentence in sentences) / sentence_count
        avg_words_per_sentence = words_count / sentence_count
    else:
        avg_sentence_length = 0.0
        avg_words_per_sentence = 0.0

    # A word counts as "complex" when it has more than two syllables.
    syllable_counts = [textstat.syllable_count(word) for word in words]
    complex_word_count = sum(1 for count in syllable_counts if count > 2)

    if words_count:
        percentage_complex_words = (complex_word_count / words_count) * 100
        syllables_per_word = sum(syllable_counts) / words_count
        avg_word_length = sum(len(word) for word in words) / words_count
    else:
        percentage_complex_words = 0.0
        syllables_per_word = 0.0
        avg_word_length = 0.0

    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    # Count pronouns on the original text: most of them are stop words and
    # would already be gone from `words`. The all-caps token "US" is skipped
    # on the assumption that it is the country abbreviation, not the pronoun.
    personal_pronouns = sum(1 for match in _PERSONAL_PRONOUNS_RE.findall(text) if match != 'US')

    return [
        positive_score, negative_score, polarity_score, subjectivity_score, avg_sentence_length,
        percentage_complex_words, fog_index, avg_words_per_sentence, complex_word_count,
        words_count, syllables_per_word, personal_pronouns, avg_word_length
    ]

# Analyze text from extracted files and store computed variables
output_data = []
for index, row in input_data.iterrows():
    url_id = row['URL_ID']
    # Only the read of the article file is guarded. A FileNotFoundError raised
    # while analysing (e.g. a missing word-list CSV) must not be reported as a
    # missing article file, so the analysis runs outside the try block.
    try:
        with open(f"{url_id}.txt", "r", encoding="utf-8") as file:
            text = file.read()
    except FileNotFoundError:
        print(f"File {url_id}.txt not found")
        continue

    computed_variables = analyze_text(text)
    output_data.append([url_id] + computed_variables)

# Create DataFrame for output data and save it to Excel.
# The column order must match the order of the values returned by analyze_text.
columns = [
    "URL_ID", "POSITIVE SCORE", "NEGATIVE SCORE", "POLARITY SCORE", "SUBJECTIVITY SCORE",
    "AVG SENTENCE LENGTH", "PERCENTAGE OF COMPLEX WORDS", "FOG INDEX", "AVG NUMBER OF WORDS PER SENTENCE",
    "COMPLEX WORD COUNT", "WORD COUNT", "SYLLABLE PER WORD", "PERSONAL PRONOUNS", "AVG WORD LENGTH"
]

output_df = pd.DataFrame(output_data, columns=columns)
output_df.to_excel('Output.xlsx', index=False)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


URL ID: test01 - Extraction successful
URL ID: test02 - Extraction successful
URL ID: test03 - Extraction successful
URL ID: test04 - Extraction successful
URL ID: test05 - Extraction successful
