In [1]:
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import re

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rohan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rohan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the master-dictionary word lists (one entry per line) and keep a set of
# each ready, so the scoring step can do fast membership checks.

with open('positive-words.txt', 'r') as file:
    positive_text = file.read()

with open('negative-words.txt', 'r') as file:
    negative_text = file.read()

positive_words = positive_text.splitlines()
negative_words = negative_text.splitlines()

positive_words_set, negative_words_set = set(positive_words), set(negative_words)

In [3]:
def scrape(url, timeout=30):
    """Download an article page and pull out its title and body text.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a single stalled connection would block the whole
        run indefinitely.

    Returns
    -------
    tuple of (str, str)
        ``(title, article_text)``. An element is ``'Nill'`` when the page
        loaded but the matching HTML element was not found. When the request
        or the parsing fails, the error is printed and the placeholders
        ``('No Title Found', 'No Article Text Found')`` are returned instead
        of raising.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions
        soup = BeautifulSoup(response.content, 'html.parser')

        title_element = soup.find('h1', class_='entry-title')
        article_element = soup.find('div', class_='td-post-content')

        title = title_element.text if title_element else 'Nill'
        article_text = article_element.text if article_element else 'Nill'

        return title, article_text
    except requests.exceptions.RequestException as e:
        # Covers connection errors, HTTP error statuses and timeouts.
        print(f"Request error for URL {url}: {e}")
        return 'No Title Found', 'No Article Text Found'
    except Exception as e:
        print(f"An error occurred while scraping URL {url}: {e}")
        return 'No Title Found', 'No Article Text Found'
    

In [4]:
def remove_punctuation(text):
    """Return ``text`` with every character that is neither a word character
    (letters, digits, underscore) nor whitespace deleted.

    Characters are removed, not replaced by a space, so hyphenated or
    apostrophised words are fused together ("twenty-first" -> "twentyfirst").
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', text)

In [5]:
def clean_it(article_text):
    """Strip punctuation from the text and drop English stop words.

    Returns a pair: the punctuation-free text, and the list of its tokens
    whose lower-cased form is not an NLTK English stop word (the tokens
    themselves keep their original casing).
    """
    text_without_punct = remove_punctuation(article_text)
    english_stops = set(stopwords.words('english'))

    kept_tokens = []
    for token in word_tokenize(text_without_punct):
        if token.lower() in english_stops:
            continue
        kept_tokens.append(token)

    return text_without_punct, kept_tokens
    

# EXAMPLE TESTING 

In [6]:
# Smoke test: scrape one article, then clean only its first 100 characters.
url = "https://insights.blackcoffer.com/rising-it-cities-and-its-impact-on-the-economy-environment-infrastructure-and-city-life-by-the-year-2040-2/"
title, article_text = scrape(url)

# Show the raw title and a short preview of the body.
print(title)
print(article_text[:100])

# Show the same preview after punctuation and stop-word removal.
clean_sentence, clean_words_list = clean_it(article_text[:100])
print(clean_sentence)
print(clean_words_list)

Rising IT cities and its impact on the economy, environment, infrastructure, and city life by the year 2040.

We have seen a huge development and dependence of people on technology in recent years. We have als

We have seen a huge development and dependence of people on technology in recent years We have als
['seen', 'huge', 'development', 'dependence', 'people', 'technology', 'recent', 'years', 'als']


In [7]:
def scores(clean_words_list, positive_words_set, negative_words_set):
    """Compute the sentiment scores for a list of cleaned tokens.

    Each token is lower-cased and looked up in both word sets.

    Returns a 4-tuple:
        positive_score     - number of tokens found in ``positive_words_set``
        negative_score     - number of tokens found in ``negative_words_set``
        polarity_score     - (positive - negative) / (positive + negative + 0.000001)
        subjectivity_score - (positive + negative) / (token count + 0.000001)

    The small constant keeps both ratios defined when a denominator is zero.
    """
    positive_score = 0
    negative_score = 0
    for token in clean_words_list:
        lowered = token.lower()
        if lowered in positive_words_set:
            positive_score += 1
        if lowered in negative_words_set:
            negative_score += 1

    matched = positive_score + negative_score
    polarity_score = (positive_score - negative_score) / (matched + 0.000001)
    subjectivity_score = matched / (len(clean_words_list) + 0.000001)
    return positive_score, negative_score, polarity_score, subjectivity_score

# EXAMPLE TESTING

In [8]:
# Smoke test: scrape one article, clean the full text, and score it.
url = "https://insights.blackcoffer.com/rising-it-cities-and-its-impact-on-the-economy-environment-infrastructure-and-city-life-by-the-year-2040-2/"
title, article_text = scrape(url)

# Raw title and a short preview of the body.
print(title)
print(article_text[:100])

# Cleaned text and the tokens that survive stop-word removal.
clean_sentence, clean_words_list = clean_it(article_text)
print(clean_sentence)
print(clean_words_list)

# Positive, negative, polarity and subjectivity scores.
p, n, po, su = scores(clean_words_list, positive_words_set, negative_words_set)
print(f"Scores are{p,n,po,su}")

Rising IT cities and its impact on the economy, environment, infrastructure, and city life by the year 2040.

We have seen a huge development and dependence of people on technology in recent years. We have als

We have seen a huge development and dependence of people on technology in recent years We have also seen the development of AI and ChatGPT in recent years So it is a normal thing that we will become fully dependent on technology by 2040 Information technology will be a major power for all the developing nations As a member of a developing nation India is rapidly growing its IT base It has also grown some IT cities which will be the major control centres for Information technology by 2040
Rising IT cities

Noida Noida in Uttar Pradesh near New Delhi is an emerging IT sector now Many large companies like Google Microsoft IBM Infosys and others have set up their companies here Noida has a market base of billions of dollars and is doing a great job of boosting the national economy T

In [9]:
# Computed on the raw article text: no stop words or punctuation are removed
# before counting.
def average_sentence_length(article_text):
    """Return the average number of tokens per sentence in ``article_text``.

    Sentences come from ``sent_tokenize`` and tokens from ``word_tokenize``,
    both applied to the unmodified text.

    Returns 0.0 when the tokenizer finds no sentences (for example, an empty
    article body) instead of raising ``ZeroDivisionError``.
    """
    sentences = sent_tokenize(article_text)
    if not sentences:
        # Nothing to average over; avoid dividing by zero.
        return 0.0
    words = word_tokenize(article_text)
    return len(words) / len(sentences)

In [10]:
def average_word_length(clean_words_list):
    """Return the mean number of characters per word.

    Parameters
    ----------
    clean_words_list : list of str
        Words to average over.

    Returns
    -------
    float
        Total character count divided by the number of words, or 0.0 for an
        empty list (instead of raising ``ZeroDivisionError``).
    """
    if not clean_words_list:
        return 0.0
    # Use the builtin sum() rather than shadowing it with a local accumulator.
    total_characters = sum(len(word) for word in clean_words_list)
    return total_characters / len(clean_words_list)

In [11]:
def count_personal_pronouns(text):
    """Count first-person pronouns (I, me, my, mine, we, us, our, ours).

    Matching is case-insensitive and restricted to whole words. The exact
    all-caps token ``US`` is skipped: lower-casing the text first, as was done
    previously, made the country abbreviation indistinguishable from the
    pronoun "us" and inflated the count.

    Parameters
    ----------
    text : str
        Raw article text (original casing is needed to spot ``US``).

    Returns
    -------
    int
        Number of pronoun occurrences.
    """
    pronouns = r'\b(i|me|my|mine|we|us|our|ours)\b'
    matches = re.findall(pronouns, text, flags=re.IGNORECASE)
    # Trade-off: an all-caps heading such as "CONTACT US" is also skipped.
    return sum(1 for match in matches if match != 'US')


# EXAMPLE TESTING

In [12]:
# Smoke test: run every metric defined so far on a single article.
url = "https://insights.blackcoffer.com/rise-of-cyber-crime-and-its-effects/"
title, article_text = scrape(url)

# Raw title and a short preview of the body.
print(title)
print(article_text[:100])

# Cleaned text (first 10 characters only) and the surviving tokens.
clean_sentence, clean_words_list = clean_it(article_text)
print(clean_sentence[:10])
print(clean_words_list)

# Sentiment scores.
p, n, po, su = scores(clean_words_list, positive_words_set, negative_words_set)
print(f"Scores are:{p,n,po,su}")

# Average sentence length, measured on the raw text.
ans1 = average_sentence_length(article_text)
print(ans1)

# Average word length, measured on the cleaned tokens.
ans2 = average_word_length(clean_words_list)
print(ans2)

# Personal pronoun count, measured on the raw text.
ans3 = count_personal_pronouns(article_text)
print(ans3)


Rise of Cyber Crime and its Effects

Cybercrime is the most discussed problem in the twenty-first century. The usage of cell phones and 

Cybercrim
['Cybercrime', 'discussed', 'problem', 'twentyfirst', 'century', 'usage', 'cell', 'phones', 'internet', 'increasing', 'dramatically', 'world', 'generating', 'questions', 'consumers', 'security', 'privacy', 'users', 'must', 'understand', 'cybercrime', 'security', 'Cybercrime', 'defined', 'organised', 'criminal', 'conduct', 'carried', 'attackers', 'online', 'Cybercrime', 'comes', 'numerous', 'forms', 'fraud', 'computer', 'viruses', 'cyberstalking', 'others', 'Due', 'businesses', 'government', 'organisations', 'spending', 'maintaining', 'employing', 'professionals', 'cybercrime', '7', 'Cyber', 'security', 'keywords', '1', 'Artificial', 'Intelligence', 'AI', 'around', 'quite', 'one', 'memorable', 'advancements', 'Go', 'game', 'AlphaGo', 'computer', 'Go', 'program', 'developed', 'Google', 'DeepMind', 'world', 'champion', 'Lee', 'Sedol', 'AlphaGo

In [13]:
def count_syllables(word):
    """Estimate the number of syllables in ``word``.

    Heuristic: every run of consecutive vowels (a, e, i, o, u) in the
    lower-cased word counts as one syllable, one is subtracted when the word
    ends in 'e', and the result is never less than 1.
    """
    lowered = word.lower()
    vowel_letters = "aeiou"

    # A vowel starts a new group when the character before it is not a vowel;
    # the leading space stands in for "nothing before the first letter".
    vowel_groups = sum(
        1
        for before, letter in zip(" " + lowered, lowered)
        if letter in vowel_letters and before not in vowel_letters
    )

    # Discount a trailing 'e'.
    if lowered.endswith('e'):
        vowel_groups -= 1

    return max(1, vowel_groups)

def find_complex_words_from_list(words):
    """Return, in their original order, the words that ``count_syllables``
    estimates to have more than two syllables."""
    complex_words = []
    for candidate in words:
        if count_syllables(candidate) > 2:
            complex_words.append(candidate)
    return complex_words

# TEST FOR COMPLEX WORD FUNCTION

In [14]:
# Small hand-made sample mixing short and long words.
words_list = [
    "I", "love", "programming", "It's", "both", "challenging", "and",
    "rewarding", "Let's", "write", "some", "extraordinary", "code",
]

# Keep only the words estimated to have more than two syllables.
complex_words = find_complex_words_from_list(words_list)
print("Complex Words:", complex_words)

Complex Words: ['programming', 'challenging', 'rewarding', 'extraordinary']


# TIME TO FILL THE OUTPUT

In [15]:
import pandas as pd

# Read the existing Excel file; it is also the file the results are written to.
file_path = 'Output Data Structure.xlsx'
df = pd.read_excel(file_path)

# Compute every metric for each URL listed in the 'URL' column.
for index, row in df.iterrows():
    url = row['URL']
    # NOTE(review): when scrape() fails it returns placeholder strings, and
    # those placeholders are what gets scored below - confirm that is intended.
    title, article_text = scrape(url)
    clean_sentence, clean_words_list = clean_it(article_text)

    pos_score, neg_score, polarity, subjectivity = scores(
        clean_words_list, positive_words_set, negative_words_set
    )
    df.at[index, 'POSITIVE SCORE'] = pos_score
    df.at[index, 'NEGATIVE SCORE'] = neg_score
    df.at[index, 'POLARITY SCORE'] = polarity
    df.at[index, 'SUBJECTIVITY SCORE'] = subjectivity

    # Blank article text has no sentences; record 0 instead of dividing by zero.
    ans1 = average_sentence_length(article_text) if article_text.strip() else 0.0
    df.at[index, 'AVG SENTENCE LENGTH'] = ans1
    df.at[index, 'AVG NUMBER OF WORDS PER SENTENCE'] = ans1

    number_of_cmpwords = find_complex_words_from_list(clean_words_list)
    df.at[index, 'COMPLEX WORD COUNT'] = len(number_of_cmpwords)

    # Word count after cleaning; also the denominator for the ratios below.
    ans4 = len(clean_words_list)

    # Stored as a fraction of the cleaned words (not multiplied by 100).
    perc_cmpwords = len(number_of_cmpwords) / ans4 if ans4 else 0.0
    df.at[index, 'PERCENTAGE OF COMPLEX WORDS'] = perc_cmpwords

    fog_index = 0.4 * (ans1 + perc_cmpwords)
    df.at[index, 'FOG INDEX'] = fog_index

    ans2 = average_word_length(clean_words_list) if ans4 else 0.0
    df.at[index, 'AVG WORD LENGTH'] = ans2

    ans3 = count_personal_pronouns(article_text)
    df.at[index, 'PERSONAL PRONOUNS'] = ans3

    df.at[index, 'WORD COUNT'] = ans4

# Write the workbook once, after all rows are filled, rather than rewriting
# the whole file on every iteration.
df.to_excel(file_path, index=False)

print("Done Filling ")


Request error for URL https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/: 404 Client Error: Not Found for url: https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/
Request error for URL https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/: 404 Client Error: Not Found for url: https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/
Done Filling 


In [17]:
# Reload the workbook from disk and print it to check that the computed
# columns were saved.
output = pd.read_excel(file_path)
print(output)

             URL_ID                                                URL  \
0   blackassign0001  https://insights.blackcoffer.com/rising-it-cit...   
1   blackassign0002  https://insights.blackcoffer.com/rising-it-cit...   
2   blackassign0003  https://insights.blackcoffer.com/internet-dema...   
3   blackassign0004  https://insights.blackcoffer.com/rise-of-cyber...   
4   blackassign0005  https://insights.blackcoffer.com/ott-platform-...   
..              ...                                                ...   
95  blackassign0096  https://insights.blackcoffer.com/what-is-the-r...   
96  blackassign0097  https://insights.blackcoffer.com/impact-of-cov...   
97  blackassign0098  https://insights.blackcoffer.com/contribution-...   
98  blackassign0099  https://insights.blackcoffer.com/how-covid-19-...   
99  blackassign0100  https://insights.blackcoffer.com/how-will-covi...   

    POSITIVE SCORE  NEGATIVE SCORE  POLARITY SCORE  SUBJECTIVITY SCORE  \
0               44               6   