<a href="https://colab.research.google.com/github/SPARSHTHALYARI/WebSrcapper/blob/new%2Cupdated%2Cwith-Sentiment-Analyze/webScrapper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import nltk

# Download the "punkt" tokenizer model
nltk.download('punkt')

import nltk

# Download the "stopwords" resource
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob
import pandas as pd

# Define the URL of the webpage you want to scrape
url = 'https://insights.blackcoffer.com/rise-of-telemedicine-and-its-impact-on-livelihood-by-2040-3-2/'

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the webpage
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the main content of the article
    article_content = soup.find('div', class_='td-post-content')

    if article_content:
        # Extract the article title (usually within an <h1> or <h2> tag)
        try:
            article_title = article_content.find('h1').get_text()
        except AttributeError:
            article_title = "Untitled"  # Use a default title if <h1> is not found

        # Extract the article text from the main content
        article_text = ""
        for paragraph in article_content.find_all('p'):
            article_text += paragraph.get_text() + "\n"

        # Create a DataFrame for analysis results
        analysis_results = {
            "Article Title": [article_title],
            "Article Text": [article_text]
        }
        df = pd.DataFrame(analysis_results)

        # Tokenize the text into words
        words = nltk.word_tokenize(article_text)

        # Remove stop words (common words like "the," "and," "is," etc.)
        stop_words = set(stopwords.words('english'))
        filtered_words = [word for word in words if word.lower() not in stop_words]

        # Calculate word frequency
        word_freq = nltk.FreqDist(filtered_words)

        # Analyze sentiment using TextBlob
        text_blob = TextBlob(article_text)
        sentiment_score = text_blob.sentiment.polarity

        # Extract keywords (most common words)
        num_keywords = 10  # Specify the number of top keywords to extract
        keywords = [word for word, freq in word_freq.most_common(num_keywords)]

        # Add analysis results to the DataFrame
        df["Most Common Words"] = [keywords]
        df["Sentiment Score"] = [sentiment_score]

        # Save the DataFrame to an XLSX file
        df.to_excel('Output Data Structure.xlsx', index=False)

        print("Analysis results have been saved to 'analysis_results.xlsx'.")

    else:
        print("Failed to find the article content on the webpage.")

else:
    print("Failed to retrieve the webpage. Status code:", response.status_code)


Analysis results have been saved to 'analysis_results.xlsx'.


In [9]:
import openpyxl
import nltk
from textblob import TextBlob
import re

# Define a function to count syllables in a word
def count_syllables(word):
    vowels = "aeiouyAEIOUY"
    count = 0
    prev_char = None
    for char in word:
        if char in vowels and (prev_char is None or prev_char not in vowels):
            count += 1
        prev_char = char
    if word.endswith("e"):
        count -= 1
    if count == 0:
        count = 1  # At least one syllable for a word
    return count

# Load the Excel file
xlsx_file = 'Output Data Structure.xlsx'
workbook = openpyxl.load_workbook(xlsx_file)
sheet = workbook.active

# Initialize variables
positive_score = 0
negative_score = 0
total_polarity_score = 0
total_subjectivity_score = 0
total_sentence_length = 0
total_complex_words = 0
total_word_count = 0
total_syllables = 0
total_personal_pronouns = 0
total_word_length = 0

# Iterate through rows in the Excel sheet to calculate variables
for row in sheet.iter_rows(min_row=2, values_only=True):
    # Extract the text from the Excel sheet (assuming it's in the second column)
    text = row[1]

    # Check if the cell contains a non-empty string
    if isinstance(text, str) and text.strip():
        # Tokenize the text into sentences
        sentences = nltk.sent_tokenize(text)

        for sentence in sentences:
            # Calculate polarity and subjectivity scores for the sentence
            blob = TextBlob(sentence)
            polarity_score = blob.sentiment.polarity
            subjectivity_score = blob.sentiment.subjectivity

            # Calculate sentence length (number of words)
            words = nltk.word_tokenize(sentence)
            sentence_length = len(words)

            # Count complex words (words with more than 2 syllables)
            complex_words = [word for word in words if count_syllables(word) > 2]

            # Calculate syllables and word count for the sentence
            syllables = sum([count_syllables(word) for word in words])
            word_count = len(words)

            # Count personal pronouns (you can extend the list of pronouns as needed)
            personal_pronouns = re.findall(r'\b(I|me|my|mine|we|us|our|ours|you|your|yours|he|him|his|she|her|hers|it|its|they|them|their|theirs)\b', sentence, flags=re.IGNORECASE)

            # Calculate average word length for the sentence
            word_length = sum([len(word) for word in words]) / word_count if word_count > 0 else 0

            # Update total scores and counts
            positive_score += max(0, polarity_score)
            negative_score += max(0, -polarity_score)
            total_polarity_score += polarity_score
            total_subjectivity_score += subjectivity_score
            total_sentence_length += sentence_length
            total_complex_words += len(complex_words)
            total_word_count += word_count
            total_syllables += syllables
            total_personal_pronouns += len(personal_pronouns)
            total_word_length += word_length

# Calculate average values
num_sentences = sheet.max_row - 1  # Subtract header row
avg_sentence_length = total_sentence_length / num_sentences if num_sentences > 0 else 0
percentage_of_complex_words = (total_complex_words / total_word_count) * 100 if total_word_count > 0 else 0
fog_index = 0.4 * (avg_sentence_length + percentage_of_complex_words)
avg_number_of_words_per_sentence = total_word_count / num_sentences if num_sentences > 0 else 0
syllable_per_word = total_syllables / total_word_count if total_word_count > 0 else 0
personal_pronouns = total_personal_pronouns

# Print the calculated variables
print("POSITIVE SCORE:", positive_score)
print("NEGATIVE SCORE:", negative_score)
print("POLARITY SCORE:", total_polarity_score)
print("SUBJECTIVITY SCORE:", total_subjectivity_score)
print("AVG SENTENCE LENGTH:", avg_sentence_length)
print("PERCENTAGE OF COMPLEX WORDS:", percentage_of_complex_words)
print("FOG INDEX:", fog_index)
print("AVG NUMBER OF WORDS PER SENTENCE:", avg_number_of_words_per_sentence)
print("COMPLEX WORD COUNT:", total_complex_words)
print("WORD COUNT:", total_word_count)
print("SYLLABLE PER WORD:", syllable_per_word)
print("PERSONAL PRONOUNS:", personal_pronouns)
print("AVG WORD LENGTH:", total_word_length / total_word_count if total_word_count > 0 else 0)


POSITIVE SCORE: 10.167209595959596
NEGATIVE SCORE: 1.9416666666666667
POLARITY SCORE: 8.225542929292928
SUBJECTIVITY SCORE: 31.270112179487175
AVG SENTENCE LENGTH: 1837.0
PERCENTAGE OF COMPLEX WORDS: 20.30484485574306
FOG INDEX: 742.9219379422973
AVG NUMBER OF WORDS PER SENTENCE: 1837.0
COMPLEX WORD COUNT: 373
WORD COUNT: 1837
SYLLABLE PER WORD: 1.7729994556341862
PERSONAL PRONOUNS: 46
AVG WORD LENGTH: 0.22102858574349651


In [16]:
import openpyxl
import csv

# Function to remove problematic characters from a string
def remove_problematic_chars(text):
    # Define a set of characters that should be removed
    problematic_chars = set("+lìflµ5cck$}û9^2¶ØÑÒïO@­òB¹/Áyd0ÞM¶¢P~ÍD^DuD+cCjî]°Ò3ÀKuó°HRK0¿Y×j%T@I.\ðZ-xÿúÓ")
    return ''.join(char for char in text if char not in problematic_chars)

# Create a new Excel workbook
workbook = openpyxl.Workbook()
sheet = workbook.active

# Define column headers
headers = [
    "POSITIVE SCORE",
    "NEGATIVE SCORE",
    "POLARITY SCORE",
    "SUBJECTIVITY SCORE",
    "AVG SENTENCE LENGTH",
    "PERCENTAGE OF COMPLEX WORDS",
    "FOG INDEX",
    "AVG NUMBER OF WORDS PER SENTENCE",
    "COMPLEX WORD COUNT",
    "WORD COUNT",
    "SYLLABLE PER WORD",
    "PERSONAL PRONOUNS",
    "AVG WORD LENGTH",
]

# Write headers to the first row of the spreadsheet
for col_num, header in enumerate(headers, start=1):
    sheet.cell(row=1, column=col_num, value=header)

# Specify the full file path to your CSV data file
csv_file_path = '/content/Output Data Structure.xlsx'

# Read data from the CSV file with 'latin-1' encoding and 'ignore' error handling
with open(csv_file_path, 'r', encoding='latin-1', errors='ignore') as csv_file:
    # Preprocess each line to remove problematic characters
    cleaned_lines = [remove_problematic_chars(line) for line in csv_file]

    # Use the cleaned lines to create a CSV reader
    csv_reader = csv.reader(cleaned_lines)
    next(csv_reader)  # Skip the header row if it exists

    # Populate the spreadsheet with data from the CSV file
    for row_num, row_data in enumerate(csv_reader, start=2):
        for col_num, cell_value in enumerate(row_data, start=1):
            sheet.cell(row=row_num, column=col_num, value=cell_value)

# Specify the full file path for saving the Excel spreadsheet
xlsx_file_path = '/content/path'

# Save the spreadsheet to the specified file path
workbook.save(xlsx_file_path)

print(f"Excel spreadsheet '{xlsx_file_path}' has been created and populated with data from '{csv_file_path}'.")


Error: ignored