# **Name : Soham Santosh Jadhav**
# **TY B.Tech**
# **CGPA : 9.79**
# **College: MIT-WPU**
# **email : sai.sohamiit@gmail.com**

Step 1: Importing the required libraries

In [219]:
import pandas as pd
import nltk
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import re
import os

Step 2: Load the Excel file

In [220]:
file_path = 'Input.xlsx'
sheet_name = 'Sheet1'

Step 3: Read the Excel file

In [221]:
df = pd.read_excel(file_path, sheet_name=sheet_name)

In [222]:
df.head()

Unnamed: 0,URL_ID,URL
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...


In [223]:
urls = df.iloc[:, 1].tolist()

Step 4: Define a function for fetching article

In [224]:
def fetch_article(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml')

        title = soup.find('title').get_text() if soup.find('title') else 'No title found'
        article_body = soup.find('article')

        if not article_body:
            for class_name in ['content', 'main-content', 'post-content', 'entry-content']:
                article_body = soup.find(class_=class_name)
                if article_body:
                    break

        # Extracting text
        if article_body:
            paragraphs = article_body.find_all('p')
            article_text = ' '.join([p.get_text() for p in paragraphs])
        else:
            article_text = 'No article content found'

        return {'title': title, 'text': article_text}

    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return {'title': 'Error', 'text': 'Error'}

Step 5: Fetching articles

In [225]:
articles = [fetch_article(url) for url in urls]


Error fetching https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/: 404 Client Error: Not Found for url: https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/
Error fetching https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/: 404 Client Error: Not Found for url: https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/


Step 6: Display articles

In [226]:
for i, article in enumerate(articles[:3]):
    print(f"Article {i+1} Title: {article['title']}")
    print(f"Article {i+1} Text: {article['text'][:500]}")

Article 1 Title: Rising IT cities and its impact on the economy, environment, infrastructure, and city life by the year 2040. - Blackcoffer Insights
Article 1 Text: We have seen a huge development and dependence of people on technology in recent years. We have also seen the development of AI and ChatGPT in recent years. So it is a normal thing that we will become fully dependent on technology by 2040. Information technology will be a major power for all the developing nations. As a member of a developing nation, India is rapidly growing its IT base. It has also grown some IT cities which will be the major control centres for Information technology by 2040. 
Article 2 Title: Rising IT Cities and Their Impact on the Economy, Environment, Infrastructure, and City Life in Future - Blackcoffer Insights
Article 2 Text: Throughout history, from the industrial revolution in the 18th century through the development of the Internet, technology has been the primary driver of societal change. It h

In [227]:
nltk.download('punkt')
nltk.download('stopwords')

stopwords_path = r'/content/drive/MyDrive/Test Assignemnet/Blackcoffer/StopWords'
master_dict_path = r'/content/drive/MyDrive/Test Assignemnet/Blackcoffer/MasterDictionary'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Step 7: Set stopwords along with positive and negative words

In [228]:
stop_words = set()
for filename in os.listdir(stopwords_path):
    with open(os.path.join(stopwords_path, filename), 'r', encoding='ISO-8859-1') as file:
        stop_words.update(file.read().lower().split())

In [229]:
positive_words = set()
negative_words = set()
with open(os.path.join(master_dict_path, 'positive-words.txt'), 'r', encoding='ISO-8859-1') as file:
    positive_words.update(file.read().lower().split())
with open(os.path.join(master_dict_path, 'negative-words.txt'), 'r', encoding='ISO-8859-1') as file:
    negative_words.update(file.read().lower().split())

Step 8: Removing stopwords from positive and negative words

In [230]:
positive_words = positive_words - stop_words
negative_words = negative_words - stop_words

Step 9: Define functions for analysis

In [231]:
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    words = word_tokenize(text.lower())
    words = [word for word in words if word not in stop_words]
    return words

In [232]:
def sentiment_analysis(words):
    positive_score = sum(1 for word in words if word in positive_words)
    negative_score = sum(1 for word in words if word in negative_words)
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (len(words) + 0.000001)
    return positive_score, negative_score, polarity_score, subjectivity_score

In [233]:
def analysis_of_readability(text):
    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    num_sentences = len(sentences)
    num_words = len(words)
    complex_words = [word for word in words if len([char for char in word if char in 'aeiou']) > 2]
    avg_sentence_length = num_words / num_sentences if num_sentences > 0 else 0
    percentage_complex_words = len(complex_words) / num_words if num_words > 0 else 0
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    return avg_sentence_length, percentage_complex_words, fog_index

In [234]:
def count_complex_words(words):
    return len([word for word in words if len([char for char in word if char in 'aeiou']) > 2])

In [235]:
def syllable_count(word):
    word = word.lower()
    syllables = len(re.findall(r'[aeiouy]+', word))
    if word.endswith('es') or word.endswith('ed'):
        syllables = max(1, syllables - 1)
    return syllables

In [236]:
def text_metrics(text):
    words = clean_text(text)
    num_words = len(words)
    syllable_counts = [syllable_count(word) for word in words]
    avg_word_length = sum(len(word) for word in words) / num_words if num_words > 0 else 0
    personal_pronouns = len(re.findall(r'\b(I|we|my|ours|us)\b', text, re.I))
    return num_words, sum(syllable_counts), avg_word_length, personal_pronouns

In [237]:
data_output = []

In [238]:
for i, article in enumerate(articles):
    text = article['text']
    words = clean_text(text)
    positive_score, negative_score, polarity_score, subjectivity_score = sentiment_analysis(words)
    avg_sentence_length, percentage_complex_words, fog_index = analysis_of_readability(text)
    num_words, total_syllables, avg_word_length, personal_pronouns = text_metrics(text)

    data_output.append({
      'URL_ID': df.iloc[i, 0],
      'URL': urls[i],
      'TITLE': article['title'],
      'POSITIVE SCORE': positive_score,
      'NEGATIVE SCORE': negative_score,
      'POLARITY SCORE': polarity_score,
      'SUBJECTIVITY SCORE': subjectivity_score,
      'AVERAGE SENTENCE LENGTH': avg_sentence_length,
      'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
      'FOG INDEX': fog_index,
      'NUMBER OF WORDS': num_words,
      'TOTAL SYLLABLES': total_syllables,
      'PERSONAL PRONOUNS': personal_pronouns,
      'AVERAGE WORD LENGTH': avg_word_length
    })

Step 10: Print the values of evaluation metrics

In [239]:
print(f"Positive Score: {positive_score}")
print(f"Negative Score: {negative_score}")
print(f"Polarity Score: {polarity_score}")
print(f"Subjectivity Score: {subjectivity_score}")
print(f"Average Sentence Length: {avg_sentence_length}")
print(f"Percentage of Complex Words: {percentage_complex_words}")
print(f"Fog Index: {fog_index}")
print(f"Number of Words: {num_words}")
print(f"Total Syllables: {total_syllables}")
print(f"Personal Pronouns: {personal_pronouns}")
print(f"Average Word Length: {avg_word_length}")



Positive Score: 28
Negative Score: 54
Polarity Score: -0.3170731668649614
Subjectivity Score: 0.19617224833451616
Average Sentence Length: 34.333333333333336
Percentage of Complex Words: 0.2436893203883495
Fog Index: 13.830809061488674
Number of Words: 418
Total Syllables: 972
Personal Pronouns: 3
Average Word Length: 7.366028708133971


Step 11: Generate the output in Excel sheet

In [240]:
# Prepare the output DataFrame
df_output = pd.DataFrame(data_output)

In [241]:
# Save the output to an Excel file
output_filepath = '/content/drive/MyDrive/Test Assignemnet/Blackcoffer/Output_Analysis_Blackcoffer.xlsx'
df_output.to_excel(output_filepath, index=False)

print(f"Successfully saved to {output_filepath}")

Successfully saved to /content/drive/MyDrive/Test Assignemnet/Blackcoffer/Output_Analysis_Blackcoffer.xlsx
