In [None]:
!pip install xlrd
!pip install requests
!pip install beautifulsoup4
!pip install textblob
!pip install nltk
!pip install textstat

import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from textblob import TextBlob
import nltk
import textstat
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# NLTK downloads: fetch each required resource package by name
for nltk_resource in ('punkt', 'stopwords', 'vader_lexicon'):
    nltk.download(nltk_resource)

def read_all_sheets_from_excel(excel_file):
    """
    Reads all sheets from an Excel file and returns a dictionary of dataframes.

    The workbook is opened through a context manager so the underlying file
    handle is closed as soon as every sheet has been parsed, instead of
    lingering until the ``ExcelFile`` object happens to be garbage-collected.

    Args:
        excel_file (str): Path to the Excel file.

    Returns:
        dict: A dictionary where the keys are the sheet names and the values are the corresponding dataframes.
    """
    with pd.ExcelFile(excel_file) as workbook:
        # Parse every sheet while the workbook is still open.
        return {sheet_name: workbook.parse(sheet_name) for sheet_name in workbook.sheet_names}

# Load every worksheet of the input workbook, keyed by sheet name
dataframes = read_all_sheets_from_excel('Input.xlsx')

# Report the sheet names that were found
sheet_name_list = ", ".join(dataframes)
print("Dataframes in this file:", sheet_name_list)

# Preview the first rows of each sheet
for sheet_frame in dataframes.values():
    print(sheet_frame.head())

# Function to extract and save article text
def extract_and_save_article(url, url_id, timeout=30):
    """
    Download a web page, extract its title and paragraph text, and save them to ``<url_id>.txt``.

    The title is taken from the first <h1> element and the body from all <p>
    elements joined with single spaces. This may need adjustment based on the
    actual structure of the target webpages. Failures are reported on stdout
    instead of being raised, so a batch run can continue with the next URL.

    Args:
        url (str): Address of the page to fetch.
        url_id: Identifier used as the base name of the output text file.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        None
    """
    try:
        # Without a timeout, requests may block forever on a stalled server.
        response = requests.get(url, timeout=timeout)
        # Treat HTTP errors (404, 500, ...) as failures rather than saving
        # the error page as if it were the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # A page without an <h1> still has usable body text, so fall back to
        # an empty title instead of failing on ``None.get_text()``.
        heading = soup.find('h1')
        article_title = heading.get_text() if heading is not None else ''
        article_text = ' '.join(p.get_text() for p in soup.find_all('p'))

        # Combine title and text
        full_text = article_title + '\n\n' + article_text

        # Save to a text file named after the URL_ID
        with open(f'{url_id}.txt', 'w', encoding='utf-8') as file:
            file.write(full_text)

        print(f'Article {url_id} saved successfully.')
    except Exception as e:
        # Deliberately broad: one bad URL must not abort the whole batch.
        print(f'Error extracting article {url_id}: {e}')

# Fetch and store the article for every row of the 'Sheet1' worksheet
url_table = dataframes['Sheet1']
for _, record in url_table.iterrows():
    extract_and_save_article(record['URL'], record['URL_ID'])

# Function to compute scores
def compute_scores(text):
    """
    Compute sentiment and readability metrics for a piece of text.

    Sentiment polarity and subjectivity come from TextBlob; the positive and
    negative scores are the 'pos' and 'neg' entries of NLTK VADER's
    ``polarity_scores``. Text that yields no words (e.g. an empty string)
    returns zero for every word-based metric instead of raising
    ``ZeroDivisionError``.

    Args:
        text (str): The text to analyse.

    Returns:
        tuple: Twelve values, in this order:
            positive_score, negative_score, polarity, subjectivity,
            avg_sentence_length (words per sentence),
            percentage_complex_words (share of words with >= 3 syllables, in percent),
            fog_index (0.4 * (avg_sentence_length + percentage_complex_words)),
            avg_words_per_sentence (same value as avg_sentence_length),
            word_count, syllable_per_word, personal_pronouns, avg_word_length.
    """
    # Use TextBlob for subjectivity and polarity
    blob = TextBlob(text)
    sentiment = blob.sentiment
    polarity = sentiment.polarity
    subjectivity = sentiment.subjectivity

    # Use NLTK VADER for positive and negative scores
    sid = SentimentIntensityAnalyzer()
    sentiment_scores = sid.polarity_scores(text)
    positive_score = sentiment_scores['pos']
    negative_score = sentiment_scores['neg']

    # NOTE: the ratios previously derived from pos/neg here were never
    # returned, and dividing by (pos + neg) crashed on neutral text where
    # both are 0; they have been removed.

    words = blob.words
    word_count = len(words)
    sentence_count = len(blob.sentences)

    # Nothing to measure: avoid dividing by zero words / zero sentences.
    if word_count == 0 or sentence_count == 0:
        return (positive_score, negative_score, polarity, subjectivity,
                0.0, 0.0, 0.0, 0.0, 0, 0.0, 0, 0.0)

    # Count syllables once per word and reuse for both syllable metrics.
    syllable_counts = [textstat.syllable_count(word) for word in words]

    avg_sentence_length = word_count / sentence_count
    complex_words = sum(1 for count in syllable_counts if count >= 3)
    percentage_complex_words = complex_words / word_count * 100
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    avg_words_per_sentence = avg_sentence_length  # This is the same as avg_sentence_length
    syllable_per_word = sum(syllable_counts) / word_count

    pronouns = {'i', 'we', 'you', 'he', 'she', 'they'}
    personal_pronouns = sum(1 for word in words if word.lower() in pronouns)
    avg_word_length = sum(len(word) for word in words) / word_count

    return (positive_score, negative_score, polarity, subjectivity,
            avg_sentence_length, percentage_complex_words, fog_index,
            avg_words_per_sentence, word_count, syllable_per_word,
            personal_pronouns, avg_word_length)

# Assuming article texts are saved in the current directory with their URL_ID as filenames.
# Sorted so the output rows come out in a reproducible order (os.listdir order is arbitrary).
article_files = sorted(f for f in os.listdir('.') if f.endswith('.txt'))

# Prepare the final dataset
data = []
for file in article_files:
    with open(file, 'r', encoding='utf-8') as f:
        text = f.read()

    # Score after the file is closed; the handle is not needed for the analysis.
    (pos_score, neg_score, polarity, subjectivity, avg_sentence_length,
     percentage_complex_words, fog_index, avg_words_per_sentence, word_count,
     syllable_per_word, personal_pronouns, avg_word_length) = compute_scores(text)

    data.append({
        # splitext strips only the trailing extension; str.replace would also
        # remove any '.txt' occurring earlier in the file name.
        'URL_ID': os.path.splitext(file)[0],
        'Positive Score': pos_score,
        'Negative Score': neg_score,
        'Polarity Score': polarity,
        'Subjectivity Score': subjectivity,
        'Average Sentence Length': avg_sentence_length,
        'Percentage of Complex Words': percentage_complex_words,
        'Fog Index': fog_index,
        'Word Count': word_count,
        'Syllable per Word': syllable_per_word,
        'Personal Pronouns': personal_pronouns,
        'Average Word Length': avg_word_length,
    })

# Convert to DataFrame
final_df = pd.DataFrame(data)

# Optionally, merge with the original DataFrame to include URLs
# original_df = pd.read_excel('Input.xlsx')
# final_df = pd.merge(original_df, final_df, on='URL_ID')

# Save the final dataset to a new Excel file
final_df.to_excel('final_dataset.xlsx', index=False)

print("Final dataset prepared and saved as 'final_dataset.xlsx'.")


Collecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 105.1/105.1 kB 2.9 MB/s eta 0:00:00
Collecting pyphen (from textstat)
  Downloading pyphen-0.14.0-py3-none-any.whl (2.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 11.6 MB/s eta 0:00:00
Installing collected packages: pyphen, textstat
Successfully installed pyphen-0.14.0 textstat-0.7.3


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


Dataframes in this file: Sheet1
            URL_ID                                                URL
0  blackassign0001  https://insights.blackcoffer.com/rising-it-cit...
1  blackassign0002  https://insights.blackcoffer.com/rising-it-cit...
2  blackassign0003  https://insights.blackcoffer.com/internet-dema...
3  blackassign0004  https://insights.blackcoffer.com/rise-of-cyber...
4  blackassign0005  https://insights.blackcoffer.com/ott-platform-...
Article blackassign0001 saved successfully.
Article blackassign0002 saved successfully.
Article blackassign0003 saved successfully.
Article blackassign0004 saved successfully.
Article blackassign0005 saved successfully.
Article blackassign0006 saved successfully.
Article blackassign0007 saved successfully.
Article blackassign0008 saved successfully.
Article blackassign0009 saved successfully.
Article blackassign0010 saved successfully.
Article blackassign0011 saved successfully.
Article blackassign0012 saved successfully.
Article blackassign0