In [31]:
import os
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer


In [32]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

# Read the input spreadsheet into a DataFrame. The extraction loop further
# down in this cell reads the 'URL' and 'URL_ID' columns from every row.
df = pd.read_excel("input.xlsx")

# Function to extract article text from URL


def extract_article_text(URL, timeout=30):
    """Fetch a web page and return its title and main article text.

    Args:
        URL: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            host, which would stall a loop over many URLs.

    Returns:
        A ``(title, article_text)`` tuple of stripped strings on success, or
        ``(None, None)`` when the download or the parsing fails. Failures are
        reported with ``print`` rather than raised, so callers iterating over
        many URLs can simply skip the bad ones.
    """
    # Browser-like User-Agent string (Edge on Windows 10) sent with the request.
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}
    try:
        # Send a GET request to fetch the webpage content
        response = requests.get(url=URL, headers=headers, timeout=timeout)
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        title_tag = soup.find('title')
        body_tag = soup.find('div', class_='td-post-content tagdiv-type')
        # soup.find() returns None when nothing matches; fail with a readable
        # message instead of an AttributeError on ``None.text``.
        if title_tag is None or body_tag is None:
            missing = "title" if title_tag is None else "article body"
            raise ValueError(f"{missing} element not found in page")

        return title_tag.text.strip(), body_tag.text.strip()
    except Exception as e:
        # Deliberately broad: extraction is best-effort, any failure is
        # reported and signalled to the caller as (None, None).
        print(f"Error extracting article from {URL}: {e}")
        return None, None


# Create the directory for the extracted text files. exist_ok=True makes this
# a no-op when the directory is already present, so no separate (and racy)
# existence check is needed.
os.makedirs("extracted_articles", exist_ok=True)

# Download every article listed in the spreadsheet and save it as <URL_ID>.txt
for index, row in df.iterrows():
    # Extract URL and URL_ID from the DataFrame
    URL = row['URL']
    URL_ID = row['URL_ID']

    # Returns (None, None) when the download or the parsing failed
    title, article_text = extract_article_text(URL)

    if title and article_text:
        # Save the title, a blank line, then the article body
        filename = f"extracted_articles/{URL_ID}.txt"
        with open(filename, "w", encoding="utf-8") as file:
            file.write(title + "\n\n")
            file.write(article_text)
        print(f"Article extracted and saved: {filename}")
    else:
        print(f"Article extraction failed for URL_ID: {URL_ID}")

print("Extraction completed.")

Article extracted and saved: extracted_articles/blackassign0001.txt
Article extracted and saved: extracted_articles/blackassign0002.txt
Article extracted and saved: extracted_articles/blackassign0003.txt
Article extracted and saved: extracted_articles/blackassign0004.txt
Article extracted and saved: extracted_articles/blackassign0005.txt
Article extracted and saved: extracted_articles/blackassign0006.txt
Article extracted and saved: extracted_articles/blackassign0007.txt
Article extracted and saved: extracted_articles/blackassign0008.txt
Article extracted and saved: extracted_articles/blackassign0009.txt
Article extracted and saved: extracted_articles/blackassign0010.txt
Article extracted and saved: extracted_articles/blackassign0011.txt
Article extracted and saved: extracted_articles/blackassign0012.txt
Article extracted and saved: extracted_articles/blackassign0013.txt
Error extracting article from {url}: {e}
Article extraction failed for URL_ID: blackassign0014
Article extracted and

In [18]:
# Load stop words: merge every file in the stop-word directory into one set.
stop_words_path = './StopWords/'
stop_words_files = os.listdir(stop_words_path)
stop_words = set()
for file in stop_words_files:
    file_path = os.path.join(stop_words_path, file)
    # os.listdir() also returns sub-directories (e.g. Jupyter's
    # .ipynb_checkpoints); open() would raise on those, so skip them.
    if not os.path.isfile(file_path):
        continue
    with open(file_path, 'r') as f:
        # remove_stop_words() looks tokens up by their lower-cased form, so an
        # entry stored with capital letters or surrounding whitespace could
        # never match. Normalize each entry and drop blank lines.
        stop_words.update(
            entry.strip().lower()
            for entry in f.read().splitlines()
            if entry.strip()
        )
        
# Function to remove stop words
def remove_stop_words(text, stop_words):
    """Return ``text`` with its stop-word tokens dropped.

    The text is split with ``word_tokenize``. A token is discarded when its
    lower-cased form is in ``stop_words``; the remaining tokens keep their
    original case and are joined with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [25]:
# Function to remove stop words
def remove_stop_words(text, stop_words):
    """Tokenize ``text`` and rebuild it without stop words.

    Tokens come from ``word_tokenize``; those whose lower-cased form appears
    in ``stop_words`` are filtered out and the rest are joined with spaces.
    """
    # NOTE(review): this cell re-defines remove_stop_words with the same
    # behavior as the definition in the earlier cell; whichever cell ran last
    # is the one in effect. Consider keeping a single definition.
    def _is_kept(token):
        return token.lower() not in stop_words

    return ' '.join(filter(_is_kept, word_tokenize(text)))

In [27]:
# Load positive and negative words
positive_words_path = './MasterDictionary/positive-words.txt'
negative_words_path = './MasterDictionary/negative-words.txt'


def _load_word_set(path):
    """Read a one-entry-per-line file into a set.

    Surrounding whitespace is stripped and blank lines are skipped: such
    entries could never equal a token produced by ``str.split()``, so keeping
    them verbatim only hides dictionary words that have trailing spaces.
    """
    with open(path, 'r') as f:
        return {line.strip() for line in f.read().splitlines() if line.strip()}


positive_words = _load_word_set(positive_words_path)
negative_words = _load_word_set(negative_words_path)
# Function to count positive and negative words
def count_sentiment_words(text, positive_words, negative_words):
    """Count the positive and negative words in ``text``.

    ``text`` is split on whitespace. A token counts as positive (negative)
    when the token itself or its lower-cased form is in ``positive_words``
    (``negative_words``), so capitalized occurrences such as a
    sentence-initial "Good" are not missed. Exact matches are still honored,
    so every token that matched before continues to match.

    Args:
        text: Text to scan.
        positive_words: Container of positive dictionary words.
        negative_words: Container of negative dictionary words.

    Returns:
        A ``(positive_count, negative_count)`` tuple of ints.
    """
    positive_count = 0
    negative_count = 0
    # Single pass over the tokens instead of splitting the text twice.
    for word in text.split():
        lowered = word.lower()
        if word in positive_words or lowered in positive_words:
            positive_count += 1
        if word in negative_words or lowered in negative_words:
            negative_count += 1
    return positive_count, negative_count


In [30]:

# Initialize sentiment analyzer
# NOTE(review): `sia` is not used in the cells visible here; kept in case a
# later cell relies on it.
sia = SentimentIntensityAnalyzer()

# Score every extracted article: strip stop words, then count dictionary hits.
output_data = {'file_name': [], 'POSITIVE SCORE': [], 'NEGATIVE SCORE': []}
articles_folder = './extracted_articles/'
# os.listdir() returns entries in arbitrary order; sorting keeps the rows of
# the output sheet reproducible between runs.
for file_name in sorted(os.listdir(articles_folder)):
    article_path = os.path.join(articles_folder, file_name)
    # Skip anything that is not a regular file (e.g. a .ipynb_checkpoints
    # directory), which open() could not read.
    if not os.path.isfile(article_path):
        continue
    with open(article_path, 'r', encoding='utf-8') as f:
        article_text = f.read()
    # The file is closed at this point; the processing below needs only the text.
    # Remove stop words
    clean_text = remove_stop_words(article_text, stop_words)
    # Count positive and negative words
    positive_count, negative_count = count_sentiment_words(clean_text, positive_words, negative_words)
    output_data['file_name'].append(file_name)
    output_data['POSITIVE SCORE'].append(positive_count)
    output_data['NEGATIVE SCORE'].append(negative_count)

# Write output to Excel
output_df = pd.DataFrame(output_data)
output_df.to_excel('output.xlsx', index=False)
