In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# Fetch the article page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops an HTTP error page (404, 500, ...)
# from being silently parsed as if it were the article.
url = "https://insights.blackcoffer.com/rising-it-cities-and-its-impact-on-the-economy-environment-infrastructure-and-city-life-by-the-year-2040-2/"
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

In [3]:
# Look the headline element up first so that a page-layout change fails with a
# clear message instead of "'NoneType' object has no attribute 'text'".
title_tag = soup.find('h1', class_='entry-title')
if title_tag is None:
    raise ValueError("No <h1 class='entry-title'> element found on the page")
title = title_tag.text

In [4]:
# Look the article body container up first so that a page-layout change fails with
# a clear message instead of "'NoneType' object has no attribute 'text'".
content_div = soup.find('div', class_='td-post-content')
if content_div is None:
    raise ValueError("No <div class='td-post-content'> element found on the page")
article_text = content_div.text

In [5]:
# Sanity check: show the headline and the first 1500 characters of the body.
print(f"Title: {title}")
print(f"Article Text: {article_text[:1500]}")

Title: Rising IT cities and its impact on the economy, environment, infrastructure, and city life by the year 2040.
Article Text: 
We have seen a huge development and dependence of people on technology in recent years. We have also seen the development of AI and ChatGPT in recent years. So it is a normal thing that we will become fully dependent on technology by 2040. Information technology will be a major power for all the developing nations. As a member of a developing nation, India is rapidly growing its IT base. It has also grown some IT cities which will be the major control centres for Information technology by 2040.
Rising IT cities

Noida:- Noida in Uttar Pradesh near New Delhi is an emerging IT sector now. Many large companies like Google, Microsoft, IBM, Infosys and others have set up their companies here. Noida has a market base of billions of dollars and is doing a great job of boosting the national economy. The establishment of so many software companies has made Noida an 

In [6]:
# Load the positive and negative sentiment lexicons from the working directory,
# splitting each file into a list with one entry per line.
with open('positive-words.txt', 'r') as positive_file:
    positive_words = positive_file.read().splitlines()

with open('negative-words.txt', 'r') as negative_file:
    negative_words = negative_file.read().splitlines()

In [7]:
positive_words[:10]

['a+',
 'abound',
 'abounds',
 'abundance',
 'abundant',
 'accessable',
 'accessible',
 'acclaim',
 'acclaimed',
 'acclamation']

In [8]:
negative_words[:10]

['2-faced',
 '2-faces',
 'abnormal',
 'abolish',
 'abominable',
 'abominably',
 'abominate',
 'abomination',
 'abort',
 'aborted']

In [9]:
from nltk.tokenize import word_tokenize
import nltk

# Ensure you have downloaded the necessary NLTK data
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rohan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Positive and negative scores without discarding stop words

In [10]:
words_before_sw = word_tokenize(article_text)

In [11]:
words_before_sw[:20]

['We',
 'have',
 'seen',
 'a',
 'huge',
 'development',
 'and',
 'dependence',
 'of',
 'people',
 'on',
 'technology',
 'in',
 'recent',
 'years',
 '.',
 'We',
 'have',
 'also',
 'seen']

In [12]:
# Sets give constant-time membership tests against the lexicons.
positive_words_set = set(positive_words)
negative_words_set = set(negative_words)

# Each lowercase token that appears in a lexicon contributes 1 to that score
# (summing booleans counts the True values).
positive_score = sum(token.lower() in positive_words_set for token in words_before_sw)
negative_score = sum(token.lower() in negative_words_set for token in words_before_sw)

print(f"Positive Score: {positive_score}")
print(f"Negative Score: {negative_score}")

Positive Score: 44
Negative Score: 6


# REMOVE STOP WORDS AND COMPUTE THE SAME SCORES AGAIN

In [13]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rohan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
from nltk.corpus import stopwords

In [15]:
# English stop words, held in a set for fast lookups.
stop_words = set(stopwords.words('english'))
words = word_tokenize(article_text)
# Keep every token whose lowercase form is not a stop word; punctuation tokens
# are not stop words, so they remain in the list.
words_after_sw = [
    token
    for token in words
    if token.lower() not in stop_words
]

In [16]:
## Note: removing stop words does not discard punctuation tokens (see the '.' entries in the output).
words_after_sw[:20]

['seen',
 'huge',
 'development',
 'dependence',
 'people',
 'technology',
 'recent',
 'years',
 '.',
 'also',
 'seen',
 'development',
 'AI',
 'ChatGPT',
 'recent',
 'years',
 '.',
 'normal',
 'thing',
 'become']

In [17]:
# Same lexicon counts as before, this time over the stop-word-filtered tokens
# (summing booleans counts the matches).
positive_score_aftersw = sum(token.lower() in positive_words_set for token in words_after_sw)
negative_score_aftersw = sum(token.lower() in negative_words_set for token in words_after_sw)

print(f"Positive Score: {positive_score_aftersw}")
print(f"Negative Score: {negative_score_aftersw}")

Positive Score: 44
Negative Score: 6


# POLARITY SCORE and SUBJECTIVITY SCORE

In [19]:
# Both scores add a tiny constant to the denominator so that an empty match set
# or an empty token list cannot cause a division by zero.
sentiment_diff = positive_score_aftersw - negative_score_aftersw
sentiment_total = positive_score_aftersw + negative_score_aftersw

# Polarity: (positive - negative) / (positive + negative).
polarity_score = sentiment_diff / (sentiment_total + 0.000001)

# Subjectivity: share of the filtered tokens that matched either lexicon.
subjectivity_score = sentiment_total / (len(words_after_sw) + 0.000001)

print(f"Polarity Score: {polarity_score}")
print(f"Subjective Score: {subjectivity_score}")

Polarity Score: 0.7599999848000003
Subjective Score: 0.06305170231645435


# NUMBER OF SENTENCES 

In [23]:
from nltk.tokenize import word_tokenize, sent_tokenize
# Split the raw article text into sentences. The text comes straight from the
# HTML container, so a heading without terminal punctuation can end up glued to
# the sentence that follows it (e.g. the 'Rising IT cities\n\nNoida:- ...' entry
# in the output below).
sentences = sent_tokenize(article_text)
# Preview the first ten sentences.
sentences[:10]

['\nWe have seen a huge development and dependence of people on technology in recent years.',
 'We have also seen the development of AI and ChatGPT in recent years.',
 'So it is a normal thing that we will become fully dependent on technology by 2040.',
 'Information technology will be a major power for all the developing nations.',
 'As a member of a developing nation, India is rapidly growing its IT base.',
 'It has also grown some IT cities which will be the major control centres for Information technology by 2040.',
 'Rising IT cities\n\nNoida:- Noida in Uttar Pradesh near New Delhi is an emerging IT sector now.',
 'Many large companies like Google, Microsoft, IBM, Infosys and others have set up their companies here.',
 'Noida has a market base of billions of dollars and is doing a great job of boosting the national economy.',
 'The establishment of so many software companies has made Noida an information technology hub.']

In [24]:
len(sentences)

78

# AVERAGE NUMBER OF WORDS PER SENTENCE

In [25]:
# Ratio of stop-word-filtered tokens to sentences.
filtered_token_count = len(words_after_sw)
sentence_count = len(sentences)
average_num_of_wrds_persent = filtered_token_count / sentence_count
print(average_num_of_wrds_persent)


10.166666666666666


# AVERAGE WORD LENGTH

In [26]:
def sum_of_char_in_each_word(words_after_sw):
    """Return the total number of characters across all tokens.

    Args:
        words_after_sw: Iterable of tokens; only ``len`` is applied to each item.

    Returns:
        int: Sum of ``len(word)`` over the tokens, or 0 for an empty iterable.
    """
    # Use the built-in sum() instead of a local accumulator named `sum`, which
    # shadowed the built-in inside this function.
    return sum(len(word) for word in words_after_sw)

In [28]:
## Example: run the tokenize-and-filter steps on a single test sentence.
ex_sentence = "We have seen how things work in NLP and now it's time to work on projects !!"
ex_words = word_tokenize(ex_sentence)
# Drop tokens whose lowercase form is a stop word.
ex_words_after_sw = list(filter(lambda token: token.lower() not in stop_words, ex_words))

In [29]:
ex_words_after_sw

['seen', 'things', 'work', 'NLP', "'s", 'time', 'work', 'projects', '!', '!']

In [30]:
print(f"Average word length:{sum_of_char_in_each_word(ex_words_after_sw)/len(ex_words_after_sw)}")

Average word length:3.7


# remove punctuation

In [33]:
import re

def remove_punctuation(sentence):
    """Return ``sentence`` with punctuation characters deleted.

    Every character that is neither a regex word character (letters, digits,
    underscore) nor whitespace is removed; whitespace is left untouched.
    """
    return re.sub(r'[^\w\s]', '', sentence)

In [35]:
clean_sent=remove_punctuation(ex_sentence)
print(clean_sent)

We have seen how things work in NLP and now its time to work on projects 


# Pass ex_sentence through remove_punctuation, then recompute the average word length

In [36]:
# Tokenize the punctuation-free sentence and drop stop words as before.
clean_words = word_tokenize(clean_sent)
clean_words_after_sw = list(filter(lambda token: token.lower() not in stop_words, clean_words))
clean_words_after_sw

['seen', 'things', 'work', 'NLP', 'time', 'work', 'projects']

In [37]:
print(f"Average word length:{sum_of_char_in_each_word(clean_words_after_sw)/len(clean_words_after_sw)}")

Average word length:4.714285714285714
