In [7]:
pip install -U textblob

Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
     ---------------------------------------- 0.0/636.8 kB ? eta -:--:--
     ----- --------------------------------- 92.2/636.8 kB 2.6 MB/s eta 0:00:01
     -------------------------- ----------- 450.6/636.8 kB 5.6 MB/s eta 0:00:01
     -------------------------------------- 636.8/636.8 kB 6.7 MB/s eta 0:00:00
Installing collected packages: textblob
Successfully installed textblob-0.17.1
Note: you may need to restart the kernel to use updated packages.


In [8]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from textblob import TextBlob

# Load the Excel file
input_file = "./Input.xlsx"
output_folder = "./extracted_text"
os.makedirs(output_folder, exist_ok=True)

# Seconds to wait for a server before giving up, so that a single
# unresponsive host cannot hang the whole run.
request_timeout = 30

# Load the Excel data
df = pd.read_excel(input_file)

# Iterate through each row
for index, row in df.iterrows():
    url_id = row["URL_ID"]
    url = row["URL"]

    # Fetch the HTML content. Network failures and HTTP error statuses are
    # reported and the row is skipped (no text file is written for it), so an
    # error page is never saved as if it were the article.
    try:
        response = requests.get(url, timeout=request_timeout)
        response.raise_for_status()
    except requests.RequestException as error:
        print(f"Skipping {url_id}: could not fetch {url} ({error})")
        continue
    html_content = response.content

    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")

    # Extract article title; a page without a <title> element gets an empty
    # title instead of raising AttributeError on None.
    title_tag = soup.find("title")
    article_title = title_tag.get_text() if title_tag is not None else ""

    # Extract article text: the text of every <p> element, one per line
    article_text = "".join(
        paragraph.get_text() + "\n" for paragraph in soup.find_all("p")
    )

    # Save the extracted content to a text file
    output_filename = os.path.join(output_folder, f"{url_id}.txt")
    with open(output_filename, "w", encoding="utf-8") as file:
        file.write(f"Title: {article_title}\n\n")
        file.write(f"Text:\n{article_text}")

    print(f"Extracted and saved content for {url_id}")

print("Extraction and saving completed.")

Extracted and saved content for 123
Extracted and saved content for 321
Extracted and saved content for 2345
Extraction and saving completed.


In [None]:
import os
import re

import nltk
import pandas as pd
from nltk.corpus import cmudict
from nltk.tokenize import sent_tokenize, word_tokenize

# Fetch the tokenizer models and the CMU pronouncing dictionary used below.
# Current NLTK releases load the 'punkt_tab' resource for sent_tokenize /
# word_tokenize; 'punkt' is still requested for older installations.
for resource in ('punkt', 'punkt_tab', 'cmudict'):
    nltk.download(resource)

# Load the URLs from the input Excel file
input_data = pd.read_excel("./Input.xlsx")

def _read_word_list(path, encoding=None):
    """Return the entries of a one-word-per-line file as a set.

    Entries are stripped of surrounding whitespace and lowercased, because the
    analysis compares them against lowercased tokens; blank lines are dropped.
    ``encoding=None`` keeps Python's platform-default encoding.
    """
    with open(path, 'r', encoding=encoding) as file:
        stripped = (line.strip() for line in file.read().splitlines())
        return {entry.lower() for entry in stripped if entry}


# Load positive and negative words
# NOTE(review): these two files are read with the platform-default encoding,
# unlike the stop-word files below (UTF-8) — confirm the files' real encoding.
positive_words = set()
negative_words = set()

positive_words.update(_read_word_list('./MasterDictionary/positive-words.txt'))
negative_words.update(_read_word_list('./MasterDictionary/negative-words.txt'))

# Load custom stopwords file
custom_stopwords = set()
stopdir="./StopWords"
for filename in os.listdir(stopdir):
    stopword_path = os.path.join(stopdir, filename)
    # Skip sub-directories and other non-file entries, which open() cannot read.
    if not os.path.isfile(stopword_path):
        continue
    custom_stopwords.update(_read_word_list(stopword_path, encoding='UTF-8'))

# Function to count syllables
# CMU pronunciation table, loaded on first use and then reused. Building it
# with cmudict.dict() on every call (i.e. once per word) makes analysing even
# a short article extremely slow.
_CMU_PRONUNCIATIONS = None


def syllable_count(word, pronunciations=None):
    """Return the number of syllables in ``word``.

    Each pronunciation is a list of phonemes; a phoneme ending in a digit (the
    stress marker) is a vowel sound and counts as one syllable. When a word
    has several pronunciations the largest count is returned.

    Args:
        word: The word to look up; matching is case-insensitive.
        pronunciations: Optional mapping of lowercase word to a list of
            pronunciations. Defaults to the CMU pronouncing dictionary.

    Returns:
        The syllable count, or 0 when the word is not in the mapping.
    """
    global _CMU_PRONUNCIATIONS
    if pronunciations is None:
        if _CMU_PRONUNCIATIONS is None:
            _CMU_PRONUNCIATIONS = cmudict.dict()
        pronunciations = _CMU_PRONUNCIATIONS

    variants = pronunciations.get(word.lower())
    if not variants:
        return 0
    return max(
        sum(1 for phoneme in variant if phoneme[-1].isdigit())
        for variant in variants
    )

def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is 0."""
    return numerator / denominator if denominator else 0.0


# Function to perform text analysis
def analyze_text(text):
    """Compute sentiment and readability metrics for one piece of text.

    Args:
        text: The raw text to analyse.

    Returns:
        A dict with the keys 'POSITIVE SCORE', 'NEGATIVE SCORE',
        'POLARITY SCORE', 'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH',
        'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
        'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT',
        'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS' and
        'AVG WORD LENGTH'. Ratios are 0.0 when the text has no words or no
        sentences.
    """
    # Sentences and pronouns are measured on the raw text: sentence splitting
    # needs the terminal punctuation, and telling the pronoun "us" apart from
    # the country name "US" needs the original capitalisation.
    sentence_count = len(sent_tokenize(text))
    pronoun_hits = re.findall(
        r'\b(?:I|we|my|ours|us)\b', text, flags=re.IGNORECASE
    )
    personal_pronouns_count = sum(1 for hit in pronoun_hits if hit != 'US')

    # Clean the text by removing punctuations and converting to lowercase
    cleaned_text = re.sub(r'[?!,.]', '', text.lower())

    # Tokenize text
    tokens = word_tokenize(cleaned_text)

    # Remove custom stopwords, and tokens made only of punctuation (such as
    # ':' or quotes), which are not words.
    cleaned_tokens = [
        word for word in tokens
        if word not in custom_stopwords and any(ch.isalnum() for ch in word)
    ]

    word_count = len(cleaned_tokens)
    # Look each token up once and reuse the counts for both syllable metrics.
    syllables_per_token = [syllable_count(word) for word in cleaned_tokens]
    complex_word_count = sum(1 for count in syllables_per_token if count > 2)
    total_syllables = sum(syllables_per_token)
    total_characters = sum(len(word) for word in cleaned_tokens)
    avg_word_length = _safe_ratio(total_characters, word_count)

    positive_score = sum(1 for word in cleaned_tokens if word in positive_words)
    negative_score = sum(1 for word in cleaned_tokens if word in negative_words)
    # The small constant keeps the denominators non-zero.
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)

    avg_sentence_length = _safe_ratio(word_count, sentence_count)
    percentage_complex_words = _safe_ratio(complex_word_count, word_count)
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    avg_words_per_sentence = avg_sentence_length

    return {
        'POSITIVE SCORE': positive_score,
        'NEGATIVE SCORE': negative_score,
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': avg_words_per_sentence,
        'COMPLEX WORD COUNT': complex_word_count,
        'WORD COUNT': word_count,
        # Syllables (not characters) per word; characters per word is
        # reported separately as 'AVG WORD LENGTH'.
        'SYLLABLE PER WORD': _safe_ratio(total_syllables, word_count),
        'PERSONAL PRONOUNS': personal_pronouns_count,
        'AVG WORD LENGTH': avg_word_length
    }

# Iterate through the saved text files and perform analysis
analysis_results_list = []

for index, row in input_data.iterrows():
    url_id = row['URL_ID']
    text_path = f'./extracted_text/{url_id}.txt'

    # A URL whose text was never saved is reported and skipped rather than
    # aborting the analysis of every other article.
    try:
        with open(text_path, 'r', encoding='utf-8') as file:
            saved_content = file.read()
    except FileNotFoundError:
        print(f"No extracted text found for {url_id} ({text_path}); skipping")
        continue

    # The extraction step saves each file as "Title: <title>\n\nText:\n<body>".
    # Drop those two labels so they are not analysed as article words.
    if saved_content.startswith("Title: "):
        saved_content = saved_content[len("Title: "):]
    article_text = saved_content.replace("\n\nText:\n", "\n\n", 1)

    # Perform text analysis on the article text
    analysis_results = analyze_text(article_text)
    analysis_results['URL_ID'] = url_id

    analysis_results_list.append(analysis_results)

# Create a DataFrame to store the analysis results
output_data = pd.DataFrame(analysis_results_list)
if output_data.empty:
    # Without any result there is no 'URL_ID' column to merge on; provide an
    # empty one with the same dtype as the input column.
    output_data = pd.DataFrame(
        {'URL_ID': pd.Series(dtype=input_data['URL_ID'].dtype)}
    )

# Merge the output data with the original input data. A left merge keeps every
# input row; rows that could not be analysed get empty metric cells.
merged_data = pd.merge(input_data, output_data, on='URL_ID', how='left')

# Save the merged data to an Excel file
merged_data.to_excel('./Output Data Structure.xlsx', index=False)