In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

In [2]:
def extract_article_text(url):
    text = ""  # Initialize text
    try:
        page = requests.get(url)
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')

        # Handling for article text
        text_element = soup.find("div", {'class': 'td-post-content'})
    
        if text_element:
            text = text_element.get_text()
        else:
            print("Text not found on the page.")

    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {str(e)}")
    except Exception as e:
        print(f"Error: {str(e)}")
    return text

def save_to_text_file(url_id, text):
    file_name = f"{url_id}.txt"
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(f"Text:\n{text}")

def save_to_text_file(url_id, text):
    folder_name = "TextFiles"
    # Create the folder if it doesn't exist
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

    file_name = os.path.join(folder_name, f"{url_id}.txt")
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(f"Text:\n{text}")

    print(f"Text saved to: {file_name}")

In [3]:
# Read the Excel file
excel_file = "Input.xlsx"
df = pd.read_excel(excel_file)

# Iterate through each row in the DataFrame
for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']

    # Extract article text
    text = extract_article_text(url)

    # Save to text file
    if text:
        save_to_text_file(url_id, text)
        print(f"Article for {url_id} extracted and saved.")
    else:
        print(f"Failed to extract article for {url_id}.")

print("Extraction process completed.")

Text saved to: TextFiles\blackassign0001.txt
Article for blackassign0001 extracted and saved.
Text saved to: TextFiles\blackassign0002.txt
Article for blackassign0002 extracted and saved.
Text saved to: TextFiles\blackassign0003.txt
Article for blackassign0003 extracted and saved.
Text saved to: TextFiles\blackassign0004.txt
Article for blackassign0004 extracted and saved.
Text saved to: TextFiles\blackassign0005.txt
Article for blackassign0005 extracted and saved.
Text saved to: TextFiles\blackassign0006.txt
Article for blackassign0006 extracted and saved.
Text saved to: TextFiles\blackassign0007.txt
Article for blackassign0007 extracted and saved.
Text saved to: TextFiles\blackassign0008.txt
Article for blackassign0008 extracted and saved.
Text saved to: TextFiles\blackassign0009.txt
Article for blackassign0009 extracted and saved.
Text saved to: TextFiles\blackassign0010.txt
Article for blackassign0010 extracted and saved.
Text saved to: TextFiles\blackassign0011.txt
Article for bla

Text saved to: TextFiles\blackassign0086.txt
Article for blackassign0086 extracted and saved.
Text saved to: TextFiles\blackassign0087.txt
Article for blackassign0087 extracted and saved.
Text saved to: TextFiles\blackassign0088.txt
Article for blackassign0088 extracted and saved.
Text saved to: TextFiles\blackassign0089.txt
Article for blackassign0089 extracted and saved.
Text saved to: TextFiles\blackassign0090.txt
Article for blackassign0090 extracted and saved.
Text saved to: TextFiles\blackassign0091.txt
Article for blackassign0091 extracted and saved.
Text saved to: TextFiles\blackassign0092.txt
Article for blackassign0092 extracted and saved.
Text saved to: TextFiles\blackassign0093.txt
Article for blackassign0093 extracted and saved.
Text saved to: TextFiles\blackassign0094.txt
Article for blackassign0094 extracted and saved.
Text saved to: TextFiles\blackassign0095.txt
Article for blackassign0095 extracted and saved.
Text saved to: TextFiles\blackassign0096.txt
Article for bla

In [4]:
df = df[~df['URL_ID'].isin(['blackassign0036', 'blackassign0049'])]
df.reset_index(inplace=True)

# dropping blackassign0036 and blackassign0049 as the pages are not found or present.

In [5]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import re
import os

In [6]:
# Function to clean and tokenize text
def clean_and_tokenize(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text.lower())
    cleaned_words = [word for word in words if word.isalpha() and word not in stop_words]
    return cleaned_words

def load_words_from_file(file_path):
    with open(file_path, 'r') as file:
        words = file.read().splitlines()
    return set(words)

# Load positive and negative words from files
positive_words = load_words_from_file('positive-words.txt')
negative_words = load_words_from_file('negative-words.txt')

def calculate_sentiment_scores(text):
    words = clean_and_tokenize(text)
    
    positive_score = sum(1 for word in words if word in positive_words)
    negative_score = sum(1 for word in words if word in negative_words)
    
    polarity_score = (positive_score - negative_score) / max((positive_score + negative_score), 0.000001)
    subjectivity_score = (positive_score + negative_score) / max((len(words) + 0.000001), 1.0)
    
    return positive_score, negative_score, polarity_score, subjectivity_score
    
    
# Function to calculate readability metrics
def calculate_readability_metrics(text):
    sentences = sent_tokenize(text)
    words = clean_and_tokenize(text)
    
    avg_sentence_length = len(words) / len(sentences)
    percentage_complex_words = sum(1 for word in words if len(word) > 2) / len(words)
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    
    return avg_sentence_length, percentage_complex_words, fog_index

# Function to calculate average number of words per sentence
def average_words_per_sentence(text):
    sentences = sent_tokenize(text)
    cleaned_text = clean_and_tokenize(text) 
    return len(cleaned_text) / len(sentences)

# Function to calculate complex word count
def calculate_complex_word_count(text):
    words = clean_and_tokenize(text)
    return sum(1 for word in words if len(word) > 2)

# Function to calculate word count
def calculate_word_count(text):
    return len(clean_and_tokenize(text))

# Function to calculate syllable count per word
def calculate_syllable_per_word(text):
    words = clean_and_tokenize(text)
    syllable_count = sum(len(re.findall(r'[aeiou]+', word)) for word in words)
    return syllable_count / max(1, len(words))  # Avoid division by zero

# Function to count personal pronouns
def count_personal_pronouns(text):
    personal_pronouns = re.findall(r'\b(I|we|my|ours|us)\b', text, flags=re.IGNORECASE)
    return len(personal_pronouns)

# Function to calculate average word length
def calculate_average_word_length(text):
    words = clean_and_tokenize(text)
    total_chars = sum(len(word) for word in words)
    return total_chars / max(1, len(words))  # Avoid division by zero


In [7]:
directory_path = 'TextFiles/'
file_names = [f for f in os.listdir(directory_path) if os.path.isfile(os.path.join(directory_path, f))]

In [8]:
positive_score = []
negative_score = []
polarity_score = []
subjectivity_score = []
avg_sentence_length = []
percentage_complex_words = []
fog_index = []
avg_words_per_sentence = []
complex_word_count = []
word_count = []
syllable_per_word = []
personal_pronouns_count = []
avg_word_length = []


for file_name in file_names:
    file_path = os.path.join(directory_path, file_name)
    
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Example usage of analysis functions
    p_score, n_score, pol_score, subj_score = calculate_sentiment_scores(text)
    positive_score.append(p_score)
    negative_score.append(n_score)
    polarity_score.append(pol_score)
    subjectivity_score.append(subj_score)
        
    avg_sent_length, percent_comp_words, f_index = calculate_readability_metrics(text)
    avg_sentence_length.append(avg_sent_length)
    percentage_complex_words.append(percent_comp_words)
    fog_index.append(f_index)
    
    avg_words_per_sent = average_words_per_sentence(text)
    avg_words_per_sentence.append(avg_words_per_sent)
    
    c_word_count = calculate_complex_word_count(text)
    complex_word_count.append(c_word_count)
    
    w_count = calculate_word_count(text)
    word_count.append(w_count)
    
    syl_per_word = calculate_syllable_per_word(text)
    syllable_per_word.append(syl_per_word)
    
    per_pronouns_count = count_personal_pronouns(text)
    personal_pronouns_count.append(per_pronouns_count)
    
    avg_w_length = calculate_average_word_length(text)
    avg_word_length.append(avg_w_length)

In [9]:
output_data = pd.DataFrame()

In [10]:
output_data['URL_ID'] = df['URL_ID']
output_data['URL'] = df['URL']
output_data['POSITIVE_SCORE'] = positive_score
output_data['NEGATIVE_SCORE'] = negative_score
output_data['POLARITY_SCORE'] = polarity_score
output_data['SUBJECTIVITY_SCORE'] = subjectivity_score
output_data['AVG SENTENCE LENGTH'] = avg_sentence_length
output_data['PERCENTAGE OF COMPLEX WORDS'] = percentage_complex_words
output_data['FOG INDEX'] = fog_index
output_data['AVG NUMBER OF WORDS PER SENTENCE'] = avg_words_per_sentence
output_data['COMPLEX WORD COUNT'] = complex_word_count
output_data['WORD COUNT'] = word_count
output_data['SYLLABLE PER WORD'] = syllable_per_word
output_data['PERSONAL PRONOUNS'] = personal_pronouns_count
output_data['AVG WORD LENGTH'] = avg_word_length

In [11]:
output_data.head()

Unnamed: 0,URL_ID,URL,POSITIVE_SCORE,NEGATIVE_SCORE,POLARITY_SCORE,SUBJECTIVITY_SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,44,6,0.76,0.080386,7.974359,0.996785,3.588457,7.974359,620,622,2.110932,12,6.353698
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,64,31,0.347368,0.112028,10.6,0.995283,4.638113,10.6,844,848,2.479953,6,7.216981
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,40,24,0.25,0.098918,11.350877,0.996909,4.939114,11.350877,645,647,2.806801,13,8.055641
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,39,75,-0.315789,0.17378,12.615385,0.995427,5.444325,12.615385,653,656,2.603659,5,7.821646
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,23,8,0.483871,0.078086,10.179487,1.0,4.471795,10.179487,397,397,2.337531,6,7.128463


In [12]:
output_data.to_excel('Output_Data.xlsx')