In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

# Load the Excel file
df = pd.read_excel(r"C:\Users\tejas\Downloads\20211030 Test Assignment-20241016T161448Z-001\20211030 Test Assignment\Input.xlsx")

# Create a directory for saving extracted files
output_folder = (r"C:\Users\tejas\OneDrive\Desktop\extracted text")
os.makedirs(output_folder, exist_ok=True)  # Create folder if it doesn't exist

def extract_data(url):
    try:
        response = requests.get(url)

        if response.status_code == 200:

            soup = BeautifulSoup(response.content, 'html.parser')

            title = soup.find('h1', class_='entry-title')
            title_text = title.get_text(strip=True) if title else 'No title found'

            content = soup.find(class_='td-ss-main-content')
            extracted_text = ''
            if content:

                extracted_texts = set()

                for tag in content.find_all(['h1', 'h3', 'p', 'li']):
                    text = tag.get_text(strip=True)

                    if text and text not in extracted_texts:
                        extracted_text += text + '\n'
                        extracted_texts.add(text)
            return title_text, extracted_text
        else:
            return 'Failed to retrieve webpage', ''
    except Exception as e:
        return f'Error occurred: {e}', ''

def save_to_file(url_id, title, content):
    file_name = os.path.join(output_folder, f'{url_id}.txt')  # Use the output folder path
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(f"Title: {title}\n\n")
        file.write(content)

# Iterate over each row in the DataFrame
for index, row in df.iterrows():
    url = row['URL']
    url_id = row['URL_ID']

    title, content = extract_data(url)

    save_to_file(url_id, title, content)

    print(f"Data extracted and saved for URL_ID:" f"{url_id}.txt")
    print(" Code Executed")


Data extracted and saved for URL_ID:bctech2011.txt
 Code Executed
Data extracted and saved for URL_ID:bctech2012.txt
 Code Executed
Data extracted and saved for URL_ID:bctech2013.txt
 Code Executed
Data extracted and saved for URL_ID:bctech2014.txt
 Code Executed
Data extracted and saved for URL_ID:bctech2015.txt
 Code Executed
Data extracted and saved for URL_ID:bctech2016.txt
 Code Executed
Data extracted and saved for URL_ID:bctech2017.txt
 Code Executed
Data extracted and saved for URL_ID:bctech2018.txt
 Code Executed
Data extracted and saved for URL_ID:bctech2019.txt
 Code Executed
Data extracted and saved for URL_ID:bctech2020.txt
 Code Executed
Data extracted and saved for URL_ID:bctech2021.txt
 Code Executed
Data extracted and saved for URL_ID:bctech2022.txt
 Code Executed
Data extracted and saved for URL_ID:bctech2023.txt
 Code Executed
Data extracted and saved for URL_ID:bctech2024.txt
 Code Executed
Data extracted and saved for URL_ID:bctech2025.txt
 Code Executed
Data extra

In [5]:
!pip install nltk
import nltk
nltk.download('punkt')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tejas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tejas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
import os
import re
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation


class TextAnalyzer:
    def __init__(self, stopwords_folder, positive_file, negative_file):
        # Load word lists efficiently using set comprehension
        self.stop_words = {
            word.strip() 
            for file in os.listdir(stopwords_folder) 
            if file.endswith('.txt')
            for word in open(os.path.join(stopwords_folder, file), 'r', encoding='utf-8', errors='ignore')
            if word.strip()
        }
        
        # Load sentiment words
        self.positive_words = {word.strip() for word in open(positive_file, 'r', encoding='utf-8', errors='ignore') if word.strip()}
        self.negative_words = {word.strip() for word in open(negative_file, 'r', encoding='utf-8', errors='ignore') if word.strip()}
        
        self.pronouns_pattern = re.compile(r'\b(I|we|my|ours|us)\b', re.IGNORECASE)
        self.clean_pattern = re.compile(r'[^\w\s.]')
        self.digits_pattern = re.compile(r'\d+')

    def analyze_text(self, text):
        # Clean text and tokenize in one pass
        clean_text = self.digits_pattern.sub('', self.clean_pattern.sub(' ', text.lower()))
        sentences = sent_tokenize(clean_text)
        words = [word for word in word_tokenize(clean_text) 
                if word not in self.stop_words and word not in punctuation]
        
        word_count = len(words)
        sentence_count = len(sentences)
        
        if word_count == 0 or sentence_count == 0:
            return self._empty_results()
        
        # Calculate scores using generator expressions
        positive_score = sum(1 for word in words if word in self.positive_words)
        negative_score = sum(1 for word in words if word in self.negative_words)
        
        # Calculate complex words and syllables in single pass
        complex_count = 0
        total_syllables = 0
        total_chars = 0
        
        for word in words:
            syllables = self._count_syllables(word)
            total_syllables += syllables
            if syllables > 2:
                complex_count += 1
            total_chars += len(word)
        
      
        avg_sent_len = word_count / sentence_count
        percent_complex = (complex_count / word_count) * 100
        
        return {
            'POSITIVE SCORE': positive_score,
            'NEGATIVE SCORE': negative_score,
            'POLARITY SCORE': (positive_score - negative_score) / (positive_score + negative_score + 0.000001),
            'SUBJECTIVITY SCORE': (positive_score + negative_score) / (word_count + 0.000001),
            'AVG SENTENCE LENGTH': avg_sent_len,
            'PERCENTAGE OF COMPLEX WORDS': percent_complex,
            'FOG INDEX': 0.4 * (avg_sent_len + percent_complex),
            'AVG NUMBER OF WORDS PER SENTENCE': avg_sent_len,
            'COMPLEX WORD COUNT': complex_count,
            'WORD COUNT': word_count,
            'SYLLABLE PER WORD': total_syllables / word_count,
            'PERSONAL PRONOUNS': len(self.pronouns_pattern.findall(text)),
            'AVG WORD LENGTH': total_chars / word_count
        }

    def _count_syllables(self, word):
        if word.endswith(('es', 'ed')):
            word = word[:-2]
        count = len([i for i, char in enumerate(word) 
                    if char in 'aeiouy' and (i == 0 or word[i-1] not in 'aeiouy')])
        return max(1, count)

    def _empty_results(self):
        return {key: 0 for key in [
            'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE',
            'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
            'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT',
            'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH'
        ]}

def analyze_files(text_folder, stopwords_folder, positive_file, negative_file, input_excel):
    analyzer = TextAnalyzer(stopwords_folder, positive_file, negative_file)
    url_df = pd.read_excel(input_excel)
    
    results = []
    for file_name in os.listdir(text_folder):
        if file_name.endswith('.txt'):
            url_id = file_name[:-4]
            with open(os.path.join(text_folder, file_name), 'r', encoding='utf-8', errors='ignore') as f:
                analysis = analyzer.analyze_text(f.read())
                analysis['URL_ID'] = url_id
                results.append(analysis)
    
    df_results = pd.DataFrame(results)
    df_merged = pd.merge(df_results, url_df, on='URL_ID', how='left')
    
    columns = ['URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
               'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS',
               'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT',
               'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH']
    
    df_merged[columns].to_excel('Output Data Structure.xlsx', index=False)
    return df_merged

TEXT_FOLDER = (r"C:\Users\tejas\OneDrive\Desktop\extracted text")  
STOPWORDS_FOLDER = (r"C:\Users\tejas\Downloads\20211030 Test Assignment-20241016T161448Z-001\20211030 Test Assignment\StopWords")  
POSITIVE_WORDS_FILE = (r"C:\Users\tejas\Downloads\20211030 Test Assignment-20241016T161448Z-001\20211030 Test Assignment\MasterDictionary\positive-words.txt")  
NEGATIVE_WORDS_FILE = (r"C:\Users\tejas\Downloads\20211030 Test Assignment-20241016T161448Z-001\20211030 Test Assignment\MasterDictionary\negative-words.txt")  
INPUT_EXCEL = (r"C:\Users\tejas\Downloads\20211030 Test Assignment-20241016T161448Z-001\20211030 Test Assignment\Input.xlsx")  

results_df = analyze_files(
    TEXT_FOLDER,
    STOPWORDS_FOLDER,
    POSITIVE_WORDS_FILE,
    NEGATIVE_WORDS_FILE,
    INPUT_EXCEL
)

results_df.head()

Unnamed: 0,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH,URL_ID,URL
0,141,49,0.484211,0.088993,11.119792,51.241218,24.944404,11.119792,1094,2135,2.714286,2,7.8,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...
1,21,6,0.555556,0.068182,7.615385,44.191919,20.722922,7.615385,175,396,2.722222,1,7.939394,bctech2012,https://insights.blackcoffer.com/streamlined-i...
2,21,10,0.354839,0.066381,13.342857,37.687366,20.412089,13.342857,176,467,2.456103,1,7.505353,bctech2013,https://insights.blackcoffer.com/efficient-dat...
3,12,7,0.263158,0.049608,7.226415,43.86423,20.436258,7.226415,168,383,2.613577,1,7.634465,bctech2014,https://insights.blackcoffer.com/effective-man...
4,15,3,0.666667,0.044226,14.034483,43.488943,23.00937,14.034483,177,407,2.520885,1,7.400491,bctech2015,https://insights.blackcoffer.com/streamlined-t...
