# Blackcoffer - NLP and Data Extraction

In [1]:
!pip install XlsxWriter
!pip install textstat
!pip install beautifulsoup4 requests openpyxl
!pip install beautifulsoup4 requests openpyxl nltk
!pip install pandas
import requests
from bs4 import BeautifulSoup
import xlsxwriter
import textstat
import pandas as pd



In [29]:
import os
import requests
import nltk
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from textstat import syllable_count
from collections import Counter
import openpyxl
import re

In [3]:
# Download NLTK resources
# - 'punkt' / 'punkt_tab': tokenizer data used by word_tokenize and sent_tokenize.
#   Recent NLTK releases look up 'punkt_tab' instead of 'punkt'; requesting both keeps
#   the notebook runnable on either side of that change (nltk.download reports an
#   unknown resource and returns False instead of raising).
# - 'stopwords', 'vader_lexicon': data for the nltk.corpus.stopwords and
#   SentimentIntensityAnalyzer imports.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'vader_lexicon'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\INDIA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\INDIA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\INDIA\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

### Function to extract article text from a given URL

In [37]:

def extract_article_text(url, timeout=30):
    """Download a web page and return its title and paragraph text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook indefinitely.

    Returns:
        A ``(title, article_text)`` tuple, where ``title`` is the stripped text of
        the page's ``<title>`` element and ``article_text`` is the stripped text of
        every ``<p>`` element joined with single spaces. On any failure (network
        error, HTTP error status, missing ``<title>``) the error is printed and
        ``(None, None)`` is returned.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx responses as failures so that an error page is not
        # analysed as if it were the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extracting title and article text
        title = soup.title.text.strip()
        article_text = ' '.join([p.text.strip() for p in soup.find_all('p')])

        return title, article_text
    except Exception as e:
        # Best-effort by design: report the problem and let the caller skip this URL.
        print(f"Error extracting data from {url}: {e}")
        return None, None

### Function to read stop words from multiple files in a folder

In [38]:
def read_stop_words(folder_path):
    """Collect stop words from every regular file directly inside ``folder_path``.

    Each line of each file contributes one entry. Entries are stripped and
    lower-cased because the analysis compares them against lower-cased tokens;
    entries kept in their original case would never match. Blank lines are ignored.

    Args:
        folder_path: Directory containing the stop-word list files (read as latin-1).

    Returns:
        A set of lower-case stop words.
    """
    stop_words = set()
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        # Sub-directories (for example a Jupyter ".ipynb_checkpoints" folder)
        # cannot be opened as text files, so skip anything that is not a file.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='latin-1') as file:
            for line in file:
                # NOTE(review): assumes a line may carry a trailing "| annotation"
                # after the word - confirm against the actual list files. A full
                # line containing "|" could never equal an alphabetic token, so
                # keeping only the text before it cannot lose a usable entry.
                word = line.split('|', 1)[0].strip().lower()
                if word:
                    stop_words.add(word)
    return stop_words

### Function to read positive and negative words from separate files

In [39]:
def read_master_dictionary(positive_file, negative_file, stop_words):
    """Load the positive and negative word lists, leaving out stop words.

    Every line of each file is one entry. Entries are stripped and lower-cased
    before the stop-word check, so the trailing newline or a difference in case
    cannot let a stop word slip through. Blank lines are ignored.

    Args:
        positive_file: Path of the positive word list (read as latin-1).
        negative_file: Path of the negative word list (read as latin-1).
        stop_words: Iterable of stop words; compared case-insensitively.

    Returns:
        A ``(positive_words, negative_words)`` tuple of lower-case word sets.
    """
    # Normalise once so the filter works whatever case the caller's set uses.
    stop_lower = {stop_word.lower() for stop_word in stop_words}

    def _load_words(path):
        # Read one dictionary file into a set of cleaned, non-stop-word entries.
        loaded = set()
        with open(path, 'r', encoding='latin-1') as file:
            for line in file:
                word = line.strip().lower()
                if word and word not in stop_lower:
                    loaded.add(word)
        return loaded

    return _load_words(positive_file), _load_words(negative_file)

### Function to perform textual analysis and compute variables

In [40]:
def perform_textual_analysis(article_text, stop_words, positive_words, negative_words):
    """Compute sentiment and readability metrics for one article.

    Args:
        article_text: Full text of the article.
        stop_words: Collection of lower-case words to drop before counting.
        positive_words: Collection of lower-case positive sentiment words.
        negative_words: Collection of lower-case negative sentiment words.

    Returns:
        A 13-tuple, in this order:
        positive_score, negative_score, polarity_score, subjectivity_score,
        avg_sentence_length, percentage_complex_words, fog_index,
        avg_words_per_sentence, complex_word_count, word_count,
        syllables_per_word, personal_pronouns_count, avg_word_length.
        Both sentiment scores are non-negative occurrence counts.
    """
    # Tokenize words and sentences
    tokens = word_tokenize(article_text)
    sentences = sent_tokenize(article_text)
    sentence_count = len(sentences)

    # Keep alphabetic tokens only (drops punctuation and numbers), lower-cased,
    # and without stop words.
    words = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words]
    word_count = len(words)

    # Count every occurrence of a sentiment word. Building a dict keyed by word
    # would collapse repeated words into a single entry and under-count.
    positive_score = sum(1 for word in words if word in positive_words)
    # Kept as a positive magnitude: the polarity and subjectivity formulas below
    # add the two scores, which only makes sense when both are non-negative.
    negative_score = sum(1 for word in words if word in negative_words)

    # The small constant keeps both divisions defined when a count is zero.
    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)

    # Average sentence length uses raw whitespace-separated tokens per sentence.
    avg_sentence_length = (
        sum(len(sent.split()) for sent in sentences) / sentence_count if sentence_count > 0 else 0
    )

    # Syllables are computed once per word and reused for both metrics below.
    syllables = [syllable_count(word) for word in words]

    # A word counts as complex when it has more than two syllables.
    complex_word_count = sum(1 for count in syllables if count > 2)
    percentage_complex_words = (complex_word_count / word_count) * 100 if word_count > 0 else 0

    # Calculate Fog Index
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    # Average number of cleaned words per sentence
    avg_words_per_sentence = word_count / sentence_count if sentence_count > 0 else 0

    # Calculate Syllable Count Per Word
    syllables_per_word = sum(syllables) / word_count if word_count > 0 else 0

    # Personal pronouns are matched case-insensitively on the original text, but
    # the all-caps token "US" is treated as the country abbreviation, not a pronoun.
    pronoun_matches = re.findall(r'\b(?:I|we|my|ours|us)\b', article_text, flags=re.IGNORECASE)
    personal_pronouns_count = sum(1 for match in pronoun_matches if match != 'US')

    # Calculate Average Word Length
    avg_word_length = sum(len(word) for word in words) / word_count if word_count > 0 else 0

    return positive_score, negative_score, polarity_score, subjectivity_score, avg_sentence_length, \
           percentage_complex_words, fog_index, avg_words_per_sentence, complex_word_count, word_count, \
           syllables_per_word, personal_pronouns_count, avg_word_length

In [41]:
# Read stop words from the StopWords folder.
# The folder name is a relative path, so it is resolved against the current
# working directory of the kernel.
stop_words_folder = "StopWords"
stop_words = read_stop_words(stop_words_folder)

In [42]:
# Read positive and negative words from the MasterDictionary.
# Both paths are relative to the current working directory; the stop words loaded
# earlier are passed in so the reader can take them into account.
positive_file = "MasterDictionary/positive-words.txt"
negative_file = "MasterDictionary/negative-words.txt"
positive_words, negative_words = read_master_dictionary(positive_file, negative_file, stop_words)

### Read the input Excel file

In [43]:
# Open the workbook that lists the articles to process and use its active sheet.
# The path is relative, so it is resolved against the current working directory.
input_file = "Input.xlsx"
input_workbook = openpyxl.load_workbook(input_file)
input_sheet = input_workbook.active

In [44]:
# Create an in-memory output workbook and write to its active sheet.
# Nothing reaches disk until the workbook is saved.
output_workbook = openpyxl.Workbook()
output_sheet = output_workbook.active

In [45]:
# Header row of the output sheet, one column title per line.
# The order matches the order of the values appended for each article.
output_sheet.append([
    "URL_ID",
    "Title",
    "Positive Score",
    "Negative Score",
    "Polarity Score",
    "Subjectivity Score",
    "Avg Sentence Length",
    "Percentage of Complex Words",
    "Fog Index",
    "Avg Words per Sentence",
    "Complex Word Count",
    "Word Count",
    "Syllables Per Word",
    "Personal Pronouns Count",
    "Avg Word Length",
])

### Iterate through rows in the Input Excel file

In [46]:
# Process every data row of the input sheet (row 1 holds the headers).
for row in input_sheet.iter_rows(min_row=2, values_only=True):
    # Only the first two cells are used. Padding and slicing keep the unpacking
    # valid when the sheet has extra columns or a row has fewer than two cells.
    url_id, url = (tuple(row) + (None, None))[:2]
    if not url:
        # Row without a URL (for example a blank row at the end of the sheet):
        # there is nothing to download, so skip it without attempting a request.
        continue
    print(f"Processing {url_id} - {url}")

    # Extract article text; both values are None when the download or parsing failed.
    title, article_text = extract_article_text(url)

    # Perform textual analysis
    if title and article_text:
        # The metrics come back in the same order as the header columns that
        # follow "Title", so they can be appended to the row as-is.
        metrics = perform_textual_analysis(article_text, stop_words, positive_words, negative_words)

        # Write results to the output sheet
        output_sheet.append([url_id, title, *metrics])
        print(f"Data extracted and analyzed for {url_id}")
    else:
        print(f"Failed to extract data for {url_id}")

Processing blackassign0001 - https://insights.blackcoffer.com/rising-it-cities-and-its-impact-on-the-economy-environment-infrastructure-and-city-life-by-the-year-2040-2/
Data extracted and analyzed for blackassign0001
Processing blackassign0002 - https://insights.blackcoffer.com/rising-it-cities-and-their-impact-on-the-economy-environment-infrastructure-and-city-life-in-future/
Data extracted and analyzed for blackassign0002
Processing blackassign0003 - https://insights.blackcoffer.com/internet-demands-evolution-communication-impact-and-2035s-alternative-pathways/
Data extracted and analyzed for blackassign0003
Processing blackassign0004 - https://insights.blackcoffer.com/rise-of-cybercrime-and-its-effect-in-upcoming-future/
Data extracted and analyzed for blackassign0004
Processing blackassign0005 - https://insights.blackcoffer.com/ott-platform-and-its-impact-on-the-entertainment-industry-in-future/
Data extracted and analyzed for blackassign0005
Processing blackassign0006 - https://i

Data extracted and analyzed for blackassign0048
Processing blackassign0049 - https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/
Data extracted and analyzed for blackassign0049
Processing blackassign0050 - https://insights.blackcoffer.com/environmental-impact-of-the-covid-19-pandemic-lesson-for-the-future/
Data extracted and analyzed for blackassign0050
Processing blackassign0051 - https://insights.blackcoffer.com/how-data-analytics-and-ai-are-used-to-halt-the-covid-19-pandemic/
Data extracted and analyzed for blackassign0051
Processing blackassign0052 - https://insights.blackcoffer.com/difference-between-artificial-intelligence-machine-learning-statistics-and-data-mining/
Data extracted and analyzed for blackassign0052
Processing blackassign0053 - https://insights.blackcoffer.com/how-python-became-the-first-choice-for-data-science/
Data extracted and analyzed for blackassign0053
Processing blackassign0054 - https://insights.blackcoffer.com/how-google-fit-mea

Data extracted and analyzed for blackassign0098
Processing blackassign0099 - https://insights.blackcoffer.com/how-covid-19-is-impacting-payment-preferences/
Data extracted and analyzed for blackassign0099
Processing blackassign0100 - https://insights.blackcoffer.com/how-will-covid-19-affect-the-world-of-work-2/
Data extracted and analyzed for blackassign0100


### Save the output workbook

In [47]:
# Write the collected rows to disk; the relative file name is resolved against
# the current working directory.
output_workbook.save("Output Data Structure.xlsx")

In [48]:
# Close the input workbook now that every row has been read.
# NOTE(review): the workbook was loaded with default options - confirm whether
# close() has any effect in that mode.
input_workbook.close()