In [1]:
import chardet

def detect_encoding(file_path):
    """Guess the character encoding of the file at ``file_path``.

    The file is read in full as raw bytes and passed to ``chardet.detect``.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The value stored under the ``'encoding'`` key of chardet's result.
    """
    with open(file_path, 'rb') as raw_file:
        raw_bytes = raw_file.read()
    detection = chardet.detect(raw_bytes)
    return detection['encoding']

# Locations of the two sentiment word lists to inspect.
positive_words_path = "C:\\Users\\harik\\staragile assessments\\positive-words.txt"
negative_words_path = "C:\\Users\\harik\\staragile assessments\\negative-words.txt"

# Run detection on both files (positive first, then negative) before reporting.
positive_encoding, negative_encoding = [
    detect_encoding(word_list_path)
    for word_list_path in (positive_words_path, negative_words_path)
]

# Report what was detected for each file.
print(f"Encoding for positive words file: {positive_encoding}")
print(f"Encoding for negative words file: {negative_encoding}")


Encoding for positive words file: ascii
Encoding for negative words file: ISO-8859-1


In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import re
import textstat
import chardet

# Ensure the NLTK data files used by word_tokenize, sent_tokenize and the
# stop-word list are present.
# Recent NLTK releases (3.9 and later) load the tokenizer tables from
# 'punkt_tab' rather than 'punkt'; without it the tokenizers raise LookupError.
# 'punkt' is still fetched so older NLTK installations keep working.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load stop words (a set, for fast membership tests in clean_text)
stop_words = set(stopwords.words('english'))

# Function to clean and tokenize text
def clean_text(text):
    """Normalise raw article text and return its content-word tokens.

    Steps, in order: lower-case the text; drop ``[...]`` spans, any word
    containing a digit, URLs, ``<...>`` tags and ASCII punctuation; collapse
    all whitespace to single spaces; tokenize with ``word_tokenize``; keep only
    purely alphabetic tokens that are not in ``stop_words``.

    Args:
        text: The raw text to clean.

    Returns:
        A list of lower-case word tokens.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), '', text)
    # Newlines are whitespace and must become a space, not vanish: deleting
    # them outright would fuse the last word of one line with the first word
    # of the next ("good\nnews" -> "goodnews"). The \s+ collapse handles them.
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = word_tokenize(text)
    return [word for word in tokens if word not in stop_words and word.isalpha()]

# Function to extract article text
def extract_text(url, timeout=30):
    """Download ``url`` and return its title and paragraph text as one string.

    Paragraphs are taken from the first ``article``, ``div`` or ``main``
    element with class ``main-content``; if none exists, every ``<p>`` in the
    page is used. The title is the text of the first ``<h1>``, or
    ``'No Title'`` when the page has none.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``. Without
            one, a server that stops responding would block the run forever.

    Returns:
        ``title + ' ' + paragraphs`` (paragraphs joined by single spaces), or
        ``None`` if the request failed. A timeout counts as a failure, and the
        error is printed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad responses
        soup = BeautifulSoup(response.content, 'html.parser')

        # Attempt to locate a dedicated main-content container
        main_content = None
        for tag in ['article', 'div', 'main']:
            main_content = soup.find(tag, {'class': 'main-content'})
            if main_content:
                break

        # Fall back to the whole document if no container was found
        container = main_content if main_content else soup
        paragraphs = container.find_all('p')

        # Look the heading up once and reuse it
        heading = soup.find('h1')
        title = heading.get_text() if heading else 'No Title'
        article_text = ' '.join([para.get_text() for para in paragraphs])
        return title + ' ' + article_text
    except requests.exceptions.RequestException as e:
        print(f"Error retrieving content from {url}: {e}")
        return None

# Function to detect encoding
def detect_encoding(file_path):
    """Return chardet's best guess at the encoding of ``file_path``.

    The same function is also defined in the notebook's first cell;
    redefining it here means this cell does not rely on that one having run.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The ``'encoding'`` entry of the dict produced by ``chardet.detect``
        for the file's complete byte content.
    """
    with open(file_path, 'rb') as binary_file:
        payload = binary_file.read()
    guess = chardet.detect(payload)
    return guess['encoding']

# Load the input Excel file (its URL_ID and URL columns are read by the loop below)
input_file_path = "C:\\Users\\harik\\staragile assessments\\input.xlsx"
input_df = pd.read_excel(input_file_path)

# Detect encoding of positive and negative words files
positive_words_path = "C:\\Users\\harik\\staragile assessments\\positive-words.txt"
negative_words_path = "C:\\Users\\harik\\staragile assessments\\negative-words.txt"

# detect_encoding may return None (chardet does so when it cannot identify an
# encoding); in that case fall back to the encodings these files were
# previously read with.
positive_encoding = detect_encoding(positive_words_path) or 'ascii'
negative_encoding = detect_encoding(negative_words_path) or 'ISO-8859-1'

# Load positive and negative words with the detected encoding. Previously the
# detected values were computed but ignored in favour of hard-coded literals.
# Each file is read as a single headerless column and lower-cased into a set.
positive_words = set(pd.read_csv(positive_words_path, header=None, encoding=positive_encoding)[0].str.lower())
negative_words = set(pd.read_csv(negative_words_path, header=None, encoding=negative_encoding)[0].str.lower())

# Initialize a list to store results (one row per successfully processed URL)
results = []

# Loop through each URL and perform extraction and analysis.
# A failure on one URL is reported and skipped so the remaining URLs still run.
for index, row in input_df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']

    try:
        # Extract the article text
        article_text = extract_text(url)
        if article_text is None:
            raise ValueError(f"Failed to retrieve content from {url}")

        # Save the article text to a file named after the URL_ID
        with open(f'{url_id}.txt', 'w', encoding='utf-8', errors='ignore') as file:
            file.write(article_text)

        # Clean and tokenize the text
        tokens = clean_text(article_text)
        word_count = len(tokens)
        sentence_count = len(sent_tokenize(article_text))

        # Sentiment analysis; the small constant keeps the denominators non-zero
        positive_score = sum(1 for word in tokens if word in positive_words)
        negative_score = sum(1 for word in tokens if word in negative_words)
        polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
        subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)

        # Syllables are counted once per token and reused by the metrics below
        syllable_counts = [textstat.syllable_count(word) for word in tokens]

        # Readability metrics. An article with no usable words or sentences
        # gets 0.0 for each ratio instead of raising ZeroDivisionError, which
        # previously made the row vanish from the output.
        avg_sentence_length = word_count / sentence_count if sentence_count else 0.0
        complex_words = [word for word, count in zip(tokens, syllable_counts) if count > 2]
        complex_word_count = len(complex_words)
        percentage_of_complex_words = complex_word_count / word_count if word_count else 0.0
        fog_index = 0.4 * (avg_sentence_length + percentage_of_complex_words)

        # Additional metrics
        syllable_count_per_word = sum(syllable_counts) / word_count if word_count else 0.0
        # The match is case-insensitive, so all-caps "US" would be counted as
        # the pronoun "us". It is excluded on the assumption that it almost
        # always denotes the country.
        pronoun_matches = re.findall(r'\b(I|we|my|ours|us)\b', article_text, re.I)
        personal_pronouns = sum(1 for match in pronoun_matches if match != 'US')
        avg_word_length = sum(len(word) for word in tokens) / word_count if word_count else 0.0

        # Store the results
        results.append([
            url_id, url, positive_score, negative_score, polarity_score, subjectivity_score,
            avg_sentence_length, percentage_of_complex_words, fog_index, complex_word_count,
            word_count, syllable_count_per_word, personal_pronouns, avg_word_length
        ])

    except Exception as e:
        print(f"Error processing URL_ID {url_id}: {e}")

# Destinations for the two saved copies of the results table.
output_excel_path = "C:\\Users\\harik\\staragile assessments\\Structure.xlsx"
output_csv_path = "C:\\Users\\harik\\staragile assessments\\Structure.csv"

# Column headers, listed in the same order as the values in each `results` row.
columns = [
    'URL_ID',
    'URL',
    'POSITIVE SCORE',
    'NEGATIVE SCORE',
    'POLARITY SCORE',
    'SUBJECTIVITY SCORE',
    'AVG SENTENCE LENGTH',
    'PERCENTAGE OF COMPLEX WORDS',
    'FOG INDEX',
    'COMPLEX WORD COUNT',
    'WORD COUNT',
    'SYLLABLE PER WORD',
    'PERSONAL PRONOUNS',
    'AVG WORD LENGTH',
]

# Assemble the collected rows into a table.
results_df = pd.DataFrame(data=results, columns=columns)

# Write the table out as Excel and as CSV, without the index column.
results_df.to_excel(output_excel_path, index=False)
results_df.to_csv(output_csv_path, index=False)

# Instructions on how to run the script, written to instructions.txt in the
# current working directory.
# Two corrections to the text: 'openpyxl' is listed because pandas needs it to
# read and write .xlsx files, and step 2 now refers to the configured input
# path, since the script reads 'input.xlsx' from 'input_file_path' rather than
# from its own directory.
instructions = """
1. Ensure you have the required Python libraries installed:
   - requests
   - beautifulsoup4
   - pandas
   - openpyxl
   - nltk
   - textstat
   - chardet

2. Ensure the 'input.xlsx' file is available at the path set in 'input_file_path'.

3. Ensure 'positive-words.txt' and 'negative-words.txt' files are available at the specified paths.

4. Run the script using the command:
   python script.py

5. The script will generate text files for each URL_ID and an output Excel and CSV file with the analysis results.
"""

with open('instructions.txt', 'w', encoding='utf-8', errors='ignore') as file:
    file.write(instructions)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\harik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\harik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
