<a href="https://colab.research.google.com/github/Prabhusabharish/Data-Extraction-and-NLP/blob/main/Data_Extraction_and_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive; the input workbook and the word
# dictionaries read later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install pandas openpyxl beautifulsoup4 requests nltk



In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

# Resources needed by the tokenizers and the stop-word list.
# 'punkt_tab' is the tokenizer data loaded by word_tokenize/sent_tokenize on
# NLTK 3.9+; 'punkt' is still fetched so older NLTK releases keep working.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Data Extraction

In [4]:
def extract_text_from_url(url, timeout=30):
    """Download a web page and return its title and paragraph text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(title, article_text)`` tuple. ``title`` is the text of the
        ``<title>`` element collapsed onto a single line; ``article_text`` is
        the text of every ``<p>`` element joined with single spaces.
        ``(None, None)`` is returned, after printing the error, when the page
        cannot be fetched, answers with an HTTP error status, or has no
        ``<title>`` element.
    """
    try:
        # A timeout keeps one unresponsive host from stalling the whole run.
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of scraping the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract title
        title_tag = soup.find('title')
        if title_tag is None:
            raise ValueError("page has no <title> element")
        # Collapse internal whitespace/newlines so the title is always one line.
        title = ' '.join(title_tag.get_text().split())

        # Extract article text
        paragraphs = soup.find_all('p')
        article_text = ' '.join(para.get_text() for para in paragraphs)

        return title, article_text
    except Exception as e:
        # Best-effort scraping: report the failure and let the caller skip the URL.
        print(f"Error extracting {url}: {e}")
        return None, None

Load data

In [5]:
# Load the input workbook from the mounted Drive folder; later cells read its
# URL_ID and URL columns.
df = pd.read_excel('/content/drive/MyDrive/Projects/Blackcoffer/Input.xlsx')
df.head(2)  # preview the first two rows


Unnamed: 0,URL_ID,URL
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...


Extract

In [6]:
# Fetch every URL listed in the workbook and store each successfully extracted
# article as "<URL_ID>.txt" in the working directory: title on the first line,
# body text after it. Rows whose title or text came back empty are skipped.
for url_id, url in zip(df['URL_ID'], df['URL']):
    title, article_text = extract_text_from_url(url)

    if not (title and article_text):
        continue

    with open(f"{url_id}.txt", 'w', encoding='utf-8') as file:
        file.write('\n'.join((title, article_text)))

Data Analysis: Positive and Negative Scores

In [7]:
# Reading the positive words file
# Positive lexicon: every whitespace-separated token of the dictionary file.
positive_words = set()
with open('/content/drive/MyDrive/Projects/Blackcoffer/MasterDictionary/positive-words.txt', 'r', encoding='latin-1') as lexicon:
    for line in lexicon:
        positive_words.update(line.split())

# Negative lexicon, built the same way.
negative_words = set()
with open('/content/drive/MyDrive/Projects/Blackcoffer/MasterDictionary/negative-words.txt', 'r', encoding='latin-1') as lexicon:
    for line in lexicon:
        negative_words.update(line.split())


Compute Sentiment Scores

In [8]:
def compute_sentiment_scores(text):
    """Count lexicon hits in ``text``.

    The text is lower-cased and tokenised with ``word_tokenize``; each token
    found in ``positive_words`` adds one to the positive score and each token
    found in ``negative_words`` adds one to the negative score.

    Returns:
        A ``(positive_score, negative_score)`` tuple of integers.
    """
    positive_score = 0
    negative_score = 0
    for token in word_tokenize(text.lower()):
        # Two independent checks: a token present in both sets counts twice.
        if token in positive_words:
            positive_score += 1
        if token in negative_words:
            negative_score += 1
    return positive_score, negative_score

Polarity and Subjectivity Scores

In [9]:
def compute_polarity_score(positive_score, negative_score):
    """Return (positive - negative) / (positive + negative + 0.000001).

    For non-negative scores the result lies strictly between -1 and 1.
    """
    difference = positive_score - negative_score
    total = positive_score + negative_score
    # The tiny constant keeps the division defined when both scores are zero.
    return difference / (total + 0.000001)

def compute_subjectivity_score(positive_score, negative_score, word_count):
    """Return (positive + negative) / (word_count + 0.000001)."""
    sentiment_hits = positive_score + negative_score
    # The tiny constant keeps the division defined for a zero word count.
    return sentiment_hits / (word_count + 0.000001)

Average Sentence Length and Complex Words

In [10]:
def compute_avg_sentence_length(text):
    """Return the mean number of tokens per sentence in ``text``.

    Sentences come from ``sent_tokenize`` and tokens from ``word_tokenize``.

    Returns:
        ``token_count / sentence_count`` as a float, or ``0.0`` when the text
        is blank or contains no sentences (instead of raising
        ``ZeroDivisionError``).
    """
    # Blank input has no sentences; answer directly rather than divide by zero.
    if not text.strip():
        return 0.0
    sentences = sent_tokenize(text)
    if not sentences:
        return 0.0
    total_words = len(word_tokenize(text))
    return total_words / len(sentences)

def count_complex_words(text):
    """Count the tokens of ``text`` that look like complex words.

    A token counts as complex when it contains more than two vowel letters
    (a, e, i, o, u, in either case). Vowel letters stand in for syllables, and
    the Gunning fog index treats words of three or more syllables as complex.

    Returns:
        The number of complex tokens as an integer.
    """
    vowels = set('aeiou')
    return sum(
        1
        for word in word_tokenize(text)
        # Lower-case first so capitalised vowels ("Organisation") are counted too.
        if sum(1 for ch in word.lower() if ch in vowels) > 2
    )


Fog Index

In [11]:
def compute_fog_index(avg_sentence_length, percentage_complex_words):
    """Return 0.4 * (average sentence length + percentage of complex words)."""
    combined = avg_sentence_length + percentage_complex_words
    return 0.4 * combined

Word Count and Syllables Per Word

In [12]:
def compute_word_count(text):
    """Return the number of tokens ``word_tokenize`` produces for ``text``.

    NOTE(review): every token returned by the tokenizer is counted; if it
    emits punctuation as separate tokens they are included here. Confirm that
    is the intended definition of "word".
    """
    return len(word_tokenize(text))

def compute_syllables_per_word(text):
    """Return the average number of vowel letters per token in ``text``.

    Vowel letters (a, e, i, o, u, in either case) are used as a stand-in for
    syllables.

    Returns:
        ``vowel_count / token_count`` as a float, or ``0.0`` when the text is
        blank or yields no tokens (instead of raising ``ZeroDivisionError``).
    """
    # Blank input has no tokens; answer directly rather than divide by zero.
    if not text.strip():
        return 0.0
    words = word_tokenize(text)
    if not words:
        return 0.0
    vowels = set('aeiou')
    # Lower-case each token so capitalised vowels are counted as well.
    syllable_count = sum(1 for word in words for ch in word.lower() if ch in vowels)
    return syllable_count / len(words)

Personal Pronouns

In [13]:
def count_personal_pronouns(text):
    """Count occurrences of the pronouns I, we, my, ours and us in ``text``.

    Matching is case-insensitive, except that the all-caps token ``US`` is
    skipped so the country abbreviation is not mistaken for the pronoun "us".

    Returns:
        The number of matching tokens as an integer.
    """
    pronouns = {"i", "we", "my", "ours", "us"}
    # Tokenise the original-case text: lower-casing first would erase the
    # difference between "US" and "us".
    return sum(
        1
        for word in word_tokenize(text)
        if word != "US" and word.lower() in pronouns
    )

Average Word Length

In [14]:
def compute_avg_word_length(text):
    """Return the mean number of characters per token in ``text``.

    Returns:
        ``total_characters / token_count`` as a float, or ``0.0`` when the
        text is blank or yields no tokens (instead of raising
        ``ZeroDivisionError``).
    """
    # Blank input has no tokens; answer directly rather than divide by zero.
    if not text.strip():
        return 0.0
    words = word_tokenize(text)
    if not words:
        return 0.0
    total_characters = sum(len(word) for word in words)
    return total_characters / len(words)

Processing All Articles

In [15]:
# Score every article previously saved as "<URL_ID>.txt" and collect one
# output row per article. Articles that cannot be read or scored are reported
# and left out of the table.
output_data = []

for index, row in df.iterrows():
    url_id = row['URL_ID']

    try:
        with open(f"{url_id}.txt", 'r', encoding='utf-8') as file:
            text = file.read()

        # The first line of the file is the title; the rest is the body.
        title, article_text = text.split('\n', 1)

        # Tokenise once for the word count and reuse it below, instead of
        # re-tokenising the same article for every metric that needs it.
        word_count = compute_word_count(article_text)

        positive_score, negative_score = compute_sentiment_scores(article_text)
        polarity_score = compute_polarity_score(positive_score, negative_score)
        subjectivity_score = compute_subjectivity_score(positive_score, negative_score, word_count)
        avg_sentence_length = compute_avg_sentence_length(article_text)
        complex_word_count = count_complex_words(article_text)
        # Guard the ratio so an article with no tokens yields 0 rather than an error.
        percentage_complex_words = (complex_word_count / word_count) * 100 if word_count else 0.0
        fog_index = compute_fog_index(avg_sentence_length, percentage_complex_words)
        syllables_per_word = compute_syllables_per_word(article_text)
        personal_pronouns = count_personal_pronouns(article_text)
        avg_word_length = compute_avg_word_length(article_text)

        # NOTE(review): 'AVG NUMBER OF WORDS PER SENTENCE' is filled with the
        # same value as 'AVG SENTENCE LENGTH' - confirm this is intended.
        output_data.append([
            url_id, row['URL'],
            positive_score, negative_score, polarity_score, subjectivity_score,
            avg_sentence_length, percentage_complex_words, fog_index,
            avg_sentence_length, complex_word_count, word_count,
            syllables_per_word, personal_pronouns, avg_word_length
        ])
    except Exception as e:
        # Best effort: report the failing article and continue with the next one.
        print(f"Error processing {url_id}: {e}")

# Column order must match the order of the values appended above.
output_df = pd.DataFrame(output_data, columns=[
    'URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
    'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS',
    'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT',
    'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH'
])

Save Output

In [16]:
# Write the metrics table to Output.csv in the working directory, omitting the
# DataFrame index column.
output_df.to_csv('Output.csv', index=False)