In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the '/content/drive/MyDrive/...'
# input and output paths used later in this notebook can be opened.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup


# Download the NLTK 'punkt' tokenizer data and the 'stopwords' corpus; the
# tokenizers and stopwords.words('english') used below depend on them.
nltk.download('punkt')
nltk.download('stopwords')


def load_master_dictionary(positive_path='/content/drive/MyDrive/positive-words.txt',
                           negative_path='/content/drive/MyDrive/negative-words.txt'):
    """Load the positive and negative sentiment word lists.

    Each file is read as latin-1 text with one entry per line. Entries are
    stripped of surrounding whitespace and lower-cased, and blank lines are
    dropped, because callers look words up with ``word.lower()`` and an entry
    such as ``'Good '`` would otherwise never match.

    Args:
        positive_path: Path of the positive word list. Defaults to the
            original hard-coded Drive location.
        negative_path: Path of the negative word list. Defaults to the
            original hard-coded Drive location.

    Returns:
        A ``(positive_words, negative_words)`` tuple of sets of strings.

    Raises:
        OSError: If either file cannot be opened.
    """
    def _read_word_set(path):
        # Read one word list and normalise every non-blank line.
        with open(path, 'r', encoding='latin-1') as file:
            lines = file.read().splitlines()
        return {line.strip().lower() for line in lines if line.strip()}

    positive_words = _read_word_set(positive_path)
    negative_words = _read_word_set(negative_path)

    return positive_words, negative_words


def load_custom_stopwords(stopwords_dir='/content/drive/MyDrive/StopWords'):
    """Load and merge the custom stop-word lists into one set.

    Reads the seven ``StopWords_*.txt`` files from ``stopwords_dir`` (latin-1
    text, one entry per line). Each entry is normalised before it is added:
    anything after a ``|`` is discarded, surrounding whitespace is stripped,
    and the text is lower-cased. Blank lines are skipped. The lower-casing
    matters because ``clean_text`` tests ``word.lower()`` against this set,
    so an upper-case entry would never match.

    Args:
        stopwords_dir: Directory that holds the stop-word files. Defaults to
            the original hard-coded Drive location.

    Returns:
        A set of normalised stop-word strings.

    Raises:
        OSError: If any of the files cannot be opened.
    """
    file_names = (
        'StopWords_Auditor.txt',
        'StopWords_Currencies.txt',
        'StopWords_DatesandNumbers.txt',
        'StopWords_Generic.txt',
        'StopWords_GenericLong.txt',
        'StopWords_Geographic.txt',
        'StopWords_Names.txt',
    )

    stopwords_set = set()
    for file_name in file_names:
        with open(f'{stopwords_dir}/{file_name}', 'r', encoding='latin-1') as file:
            for line in file.read().splitlines():
                # NOTE(review): assumes a '|' separates the word from a
                # trailing annotation (e.g. "WORD | note"); lines without
                # '|' are unaffected. Confirm against the actual files.
                entry = line.split('|', 1)[0].strip().lower()
                if entry:
                    stopwords_set.add(entry)

    return stopwords_set

# Combined NLTK + custom stop-word set, built on first use by clean_text().
_ALL_STOPWORDS_CACHE = None


def clean_text(text, stop_words=None):
    """Tokenize ``text`` and drop punctuation tokens and stop words.

    A token is kept only when it is purely alphanumeric (``str.isalnum``) and
    its lower-cased form is not in the stop-word set.

    Args:
        text: Raw text to tokenize.
        stop_words: Optional collection of lower-case stop words to use.
            When omitted, the union of NLTK's English stop words and the
            custom lists from ``load_custom_stopwords()`` is used.

    Returns:
        A list of the surviving tokens, in their original order and case.
    """
    global _ALL_STOPWORDS_CACHE

    if stop_words is None:
        # Building the default set reads several files from disk, so do it
        # once and reuse it instead of repeating the work for every article.
        # The cache is not refreshed if the files change afterwards.
        if _ALL_STOPWORDS_CACHE is None:
            nltk_stopwords = set(stopwords.words('english'))
            _ALL_STOPWORDS_CACHE = nltk_stopwords.union(load_custom_stopwords())
        stop_words = _ALL_STOPWORDS_CACHE

    words = word_tokenize(text)

    return [word for word in words if word.isalnum() and word.lower() not in stop_words]

def sentiment_analysis(cleaned_words, positive_words, negative_words):
    """Score a token list against positive and negative word lists.

    Args:
        cleaned_words: Sequence of word tokens; each is lower-cased before
            lookup.
        positive_words: Collection of positive words.
        negative_words: Collection of negative words.

    Returns:
        A tuple ``(positive_score, negative_score, polarity_score,
        subjectivity_score)``. The two scores are match counts; polarity is
        ``(pos - neg) / (pos + neg)`` and subjectivity is
        ``(pos + neg) / len(cleaned_words)``, each with a small constant in
        the denominator to avoid division by zero.
    """
    positive_score = 0
    negative_score = 0
    for token in cleaned_words:
        lowered = token.lower()
        if lowered in positive_words:
            positive_score += 1
        if lowered in negative_words:
            negative_score += 1

    matched_total = positive_score + negative_score
    polarity_score = (positive_score - negative_score) / (matched_total + 0.000001)
    subjectivity_score = matched_total / (len(cleaned_words) + 0.000001)

    return positive_score, negative_score, polarity_score, subjectivity_score

def readability_analysis(text):
    """Compute sentence-length and complex-word readability metrics.

    Only tokens containing at least one letter or digit count as words.
    ``word_tokenize`` emits punctuation marks as separate tokens, and
    counting those would inflate the average sentence length and deflate the
    complex-word ratio.

    Args:
        text: Raw text to analyse.

    Returns:
        A tuple ``(avg_sentence_length, percentage_complex_words,
        fog_index)`` where:
        - ``avg_sentence_length`` is words per sentence (0 if no sentences),
        - ``percentage_complex_words`` is the fraction (0..1) of words with
          more than two syllables according to ``count_syllables``
          (0 if no words),
        - ``fog_index`` is ``0.4 * (avg_sentence_length +
          percentage_complex_words)``.
    """
    sentences = sent_tokenize(text)
    total_words = [
        token for token in word_tokenize(text)
        if any(char.isalnum() for char in token)
    ]

    avg_sentence_length = len(total_words) / len(sentences) if sentences else 0

    complex_words = [word for word in total_words if count_syllables(word) > 2]
    percentage_complex_words = len(complex_words) / len(total_words) if total_words else 0

    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    return avg_sentence_length, percentage_complex_words, fog_index

def count_syllables(word):
    """Estimate the syllable count of ``word`` by counting its vowels.

    Every occurrence of a, e, i, o or u (case-insensitive) counts as one
    syllable. One is subtracted when the word ends in "es" or "ed". The
    result is never less than 1.

    Args:
        word: The token to examine.

    Returns:
        The estimated number of syllables, as an int >= 1.
    """
    lowered = word.lower()
    vowel_total = len([letter for letter in lowered if letter in "aeiou"])

    # A trailing "es"/"ed" is treated as not adding a syllable.
    if lowered[-2:] in ('es', 'ed'):
        vowel_total = vowel_total - 1

    if vowel_total < 1:
        return 1
    return vowel_total

def word_analysis(text):
    """Compute sentence count, word count, pronoun count and word length.

    Only tokens containing at least one letter or digit count as words, so
    the punctuation tokens produced by ``word_tokenize`` do not distort the
    word count or the average word length.

    Personal pronouns (I, we, my, ours, us) are matched case-insensitively
    as whole words. The exact upper-case form ``US`` is excluded so the
    country abbreviation is not counted as the pronoun "us".

    Args:
        text: Raw text to analyse.

    Returns:
        A tuple ``(sentence_count, word_count, personal_pronouns,
        avg_word_length)``. ``avg_word_length`` is characters per word, or 0
        when there are no words.
    """
    sentences = sent_tokenize(text)
    words = [
        token for token in word_tokenize(text)
        if any(char.isalnum() for char in token)
    ]

    word_count = len(words)
    personal_pronouns = sum(
        1
        for match in re.finditer(r'\b(I|we|my|ours|us)\b', text, re.I)
        if match.group() != 'US'
    )

    total_characters = sum(len(word) for word in words)
    avg_word_length = total_characters / word_count if word_count > 0 else 0

    return len(sentences), word_count, personal_pronouns, avg_word_length

def extract_article_text(url, timeout=30):
    """Download a page and return its title and article body as text.

    The title is the text of the first ``<h1>`` element and the body is the
    text of the first ``<article>`` element; either is an empty string when
    the element is missing.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive server would block the whole run.

    Returns:
        ``title + "\\n\\n" + article_body``, or ``""`` when the request
        fails (connection error, timeout or HTTP error status). The failure
        is reported with ``print``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve {url}: {e}")
        return ""

    soup = BeautifulSoup(response.content, 'html.parser')

    # Look each element up once and reuse the result.
    heading = soup.find('h1')
    title = heading.get_text(strip=True) if heading is not None else ""

    article = soup.find('article')
    article_body = article.get_text(separator=' ', strip=True) if article is not None else ""

    full_text = title + "\n\n" + article_body

    return full_text

# Driver: read the list of URLs, analyse each article, and write one row of
# metrics per URL to an Excel file.
input_df = pd.read_excel('/content/drive/MyDrive/Input.xlsx')

positive_words, negative_words = load_master_dictionary()

output_columns = ['URL_ID', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH',
                  'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT',
                  'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH', 'URL']

# Collect plain dicts and build the DataFrame once at the end. Concatenating
# onto an initially empty DataFrame inside the loop copies all previous rows
# on every iteration and triggers a pandas warning.
output_rows = []

for _, row in input_df.iterrows():
    url_id = row['URL_ID']
    article_url = row['URL']

    # Returns "" when the download fails; the metrics below then come out
    # as zeros for that URL.
    article_text = extract_article_text(article_url)

    cleaned_words = clean_text(article_text)

    positive_score, negative_score, polarity_score, subjectivity_score = sentiment_analysis(cleaned_words, positive_words, negative_words)

    avg_sentence_length, percentage_complex_words, fog_index = readability_analysis(article_text)

    num_sentences, word_count, personal_pronouns, avg_word_length = word_analysis(article_text)

    complex_word_count = sum(1 for word in cleaned_words if count_syllables(word) > 2)

    syllables_per_word = sum(count_syllables(word) for word in cleaned_words) / len(cleaned_words) if len(cleaned_words) > 0 else 0

    output_rows.append({
        'URL_ID': url_id,
        'POSITIVE SCORE': positive_score,
        'NEGATIVE SCORE': negative_score,
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
        'FOG INDEX': fog_index,
        # Same value as 'AVG SENTENCE LENGTH'; both columns are filled from it.
        'AVG NUMBER OF WORDS PER SENTENCE': avg_sentence_length,
        'COMPLEX WORD COUNT': complex_word_count,
        'WORD COUNT': word_count,
        'SYLLABLE PER WORD': syllables_per_word,
        'PERSONAL PRONOUNS': personal_pronouns,
        'AVG WORD LENGTH': avg_word_length,
        'URL': article_url,
    })

output_df = pd.DataFrame(output_rows, columns=output_columns)

output_df.to_excel('/content/drive/MyDrive/Output_Structure.xlsx', index=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
  output_df = pd.concat([output_df, new_row], ignore_index=True)


Failed to retrieve https://insights.blackcoffer.com/monday-com-to-kpi-dashboard-to-manage-view-and-generate-insights-from-the-crm-data/: 404 Client Error: Not Found for url: https://insights.blackcoffer.com/monday-com-to-kpi-dashboard-to-manage-view-and-generate-insights-from-the-crm-data/
