<a href="https://colab.research.google.com/github/Rishavsagar/Data-Extraction-and-NLP/blob/main/Data_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Read the spreadsheet that lists the pages to scrape (URL_ID and URL columns are used below)
input_data = pd.read_excel(io="Input.xlsx")

# Function to extract title and article text
def extract_text(url, timeout=30):
    """Fetch a web page and return its headline and paragraph text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so that one
            unresponsive host cannot stall the whole extraction loop.

    Returns:
        A ``(title, article_text)`` tuple. ``title`` is the text of the first
        ``<h1>`` element and ``article_text`` is the text of every ``<p>``
        element joined with single spaces. ``(None, None)`` is returned, after
        printing the error, when the request fails, the server answers with an
        HTTP error status, or the page has no ``<h1>``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures rather than parsing an error page as an article.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        heading = soup.find("h1")
        if heading is None:
            # Report the real cause instead of an opaque AttributeError on None.
            raise ValueError("no <h1> element found")
        title = heading.get_text(strip=True)

        # NOTE(review): every <p> on the page is collected, which presumably also
        # picks up navigation/footer boilerplate — consider narrowing to the
        # article container once the page structure is confirmed.
        article_text = " ".join(p.get_text(strip=True) for p in soup.find_all("p"))
        return title, article_text
    except Exception as e:
        # Best-effort scraping: log the failure and let the caller skip this URL.
        print(f"Error extracting {url}: {e}")
        return None, None

# Scrape every URL listed in the input sheet, producing one record per row
extracted_data = []
for url_id, url in zip(input_data["URL_ID"], input_data["URL"]):
    title, text = extract_text(url)
    # Title and Text are both None when extraction failed for this URL
    record = {"URL_ID": url_id, "Title": title, "Text": text}
    extracted_data.append(record)

# Collect the records into a DataFrame and preview the first rows
extracted_df = pd.DataFrame(extracted_data)
print(extracted_df.head())


            URL_ID                                              Title  \
0  Netclan20241017  AI and ML-Based YouTube Analytics and Content ...   
1  Netclan20241018  Enhancing Front-End Features and Functionality...   
2  Netclan20241019  ROAS Dashboard for Campaign-Wise Google Ads Bu...   
3  Netclan20241020  Efficient Processing and Analysis of Financial...   
4  Netclan20241021      Development of EA Robot for Automated Trading   

                                                Text  
0  Transforming Real Estate Investments with an A...  
1  Transforming Real Estate Investments with an A...  
2  Transforming Real Estate Investments with an A...  
3  Transforming Real Estate Investments with an A...  
4  Transforming Real Estate Investments with an A...  


In [None]:
# Write each extracted article to "<URL_ID>.txt": title on the first line, body after it
for record in extracted_df.itertuples(index=False):
    if not record.Text:
        # Nothing was extracted for this URL (failure or empty page), so no file is written
        continue
    with open(f"{record.URL_ID}.txt", "w", encoding="utf-8") as file:
        file.write(f"{record.Title}\n{record.Text}")


In [None]:
# Install required libraries
!pip install textstat nltk

# Download NLTK resources
import nltk
nltk.download("punkt")
nltk.download('punkt_tab')
nltk.download("stopwords")


Collecting textstat
  Downloading textstat-0.7.5-py3-none-any.whl.metadata (15 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Collecting cmudict (from textstat)
  Downloading cmudict-1.0.32-py3-none-any.whl.metadata (3.6 kB)
Downloading textstat-0.7.5-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.3/105.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cmudict-1.0.32-py3-none-any.whl (939 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m939.4/939.4 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.17.2-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m82.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, cmudict, textstat
Successfully installed cmudict-1.0.32 pyphen-0.17.2 textstat-0.7.5


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Create the MasterDictionary folder
import os
import urllib.request

os.makedirs("MasterDictionary", exist_ok=True)

# Download the positive and negative word lists.
# Done in Python rather than via `!wget` so the cell does not depend on a shell
# tool being installed and so a failed download stops the notebook: urlopen
# raises on HTTP errors, and each file is written only after its full body has
# been received (no empty or truncated lexicon is left behind).
LEXICON_BASE_URL = "https://raw.githubusercontent.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107/master/data/opinion-lexicon-English"

for lexicon_name in ("positive-words.txt", "negative-words.txt"):
    with urllib.request.urlopen(f"{LEXICON_BASE_URL}/{lexicon_name}", timeout=30) as response:
        payload = response.read()
    with open(f"MasterDictionary/{lexicon_name}", "wb") as lexicon_file:
        lexicon_file.write(payload)


--2025-04-10 12:28:57--  https://raw.githubusercontent.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107/master/data/opinion-lexicon-English/positive-words.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 20630 (20K) [text/plain]
Saving to: ‘MasterDictionary/positive-words.txt’


2025-04-10 12:28:57 (13.0 MB/s) - ‘MasterDictionary/positive-words.txt’ saved [20630/20630]

--2025-04-10 12:28:58--  https://raw.githubusercontent.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107/master/data/opinion-lexicon-English/negative-words.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443.

In [None]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import textstat
import re

# Load positive and negative word lists
def load_words(filepath):
    """Read a word-list file into a set of lowercase entries.

    Blank lines and lines that begin with ``;`` are skipped; every other line
    is stripped of surrounding whitespace and lowercased.

    Args:
        filepath: Path to an ISO-8859-1 encoded file with one entry per line.

    Returns:
        A set containing the distinct entries found in the file.
    """
    lexicon = set()
    with open(filepath, "r", encoding="ISO-8859-1") as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            # The comment check is made on the raw line, so only lines whose
            # very first character is ";" count as comments.
            if not entry or raw_line.startswith(";"):
                continue
            lexicon.add(entry.lower())
    return lexicon

# Load both sentiment lexicons (positive first, negative second)
positive_words, negative_words = map(
    load_words,
    ("MasterDictionary/positive-words.txt", "MasterDictionary/negative-words.txt"),
)

def analyze_text(text):
    """Compute sentiment and readability metrics for one article.

    Args:
        text: Full article text as a single string.

    Returns:
        A list of 13 values, in this order: positive score, negative score,
        polarity score, subjectivity score, average sentence length,
        percentage of complex words, fog index, average number of words per
        sentence (the same value as average sentence length), complex word
        count, word count, syllables per word, personal pronoun count and
        average word length. Averages and percentages are 0 when the text
        has no sentences or no cleaned words.
    """
    # Tokenize
    tokens = word_tokenize(text.lower())
    sentences = sent_tokenize(text)

    # Keep only alphanumeric tokens so that punctuation tokens do not inflate
    # the word count or the words-per-sentence average.
    word_tokens = [token for token in tokens if token.isalnum()]

    # Stopword removal
    stop_words = set(nltk.corpus.stopwords.words("english"))
    cleaned_words = [word for word in word_tokens if word not in stop_words]
    cleaned_count = len(cleaned_words)

    # Sentiment: lexicon hits among the cleaned words. The small constant keeps
    # both divisions defined when there are no hits or no cleaned words.
    positive_score = sum(1 for word in cleaned_words if word in positive_words)
    negative_score = sum(1 for word in cleaned_words if word in negative_words)
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (cleaned_count + 0.000001)

    # Syllables are counted once per word and reused by every syllable-based metric.
    syllable_counts = [textstat.syllable_count(word) for word in cleaned_words]
    # A word is treated as complex when it has more than two syllables.
    complex_word_count = sum(1 for count in syllable_counts if count > 2)

    # Readability
    avg_sentence_length = len(word_tokens) / len(sentences) if sentences else 0
    percentage_complex_words = complex_word_count / cleaned_count * 100 if cleaned_words else 0
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    # Other
    syllables_per_word = sum(syllable_counts) / cleaned_count if cleaned_words else 0
    # The match is case-insensitive, but the all-caps "US" is skipped so that the
    # country abbreviation is not counted as the pronoun "us".
    pronoun_matches = re.findall(r"\b(I|we|my|ours|us)\b", text, re.I)
    personal_pronouns = sum(1 for match in pronoun_matches if match != "US")
    avg_word_length = sum(len(word) for word in cleaned_words) / cleaned_count if cleaned_words else 0

    return [
        positive_score,
        negative_score,
        polarity_score,
        subjectivity_score,
        avg_sentence_length,
        percentage_complex_words,
        fog_index,
        avg_sentence_length,
        complex_word_count,
        len(word_tokens),
        syllables_per_word,
        personal_pronouns,
        avg_word_length
    ]


In [None]:
import pandas as pd
import os

# Read the list of URLs to report on
df_input = pd.read_excel("Input.xlsx")

# Column layout of the output workbook: two identifier columns, then the metric columns
output_columns = [
    'URL_ID',
    'URL',
    'POSITIVE SCORE',
    'NEGATIVE SCORE',
    'POLARITY SCORE',
    'SUBJECTIVITY SCORE',
    'AVG SENTENCE LENGTH',
    'PERCENTAGE OF COMPLEX WORDS',
    'FOG INDEX',
    'AVG NUMBER OF WORDS PER SENTENCE',
    'COMPLEX WORD COUNT',
    'WORD COUNT',
    'SYLLABLE PER WORD',
    'PERSONAL PRONOUNS',
    'AVG WORD LENGTH',
]

output_rows = []

for record in df_input.itertuples(index=False):
    url_id, url = record.URL_ID, record.URL
    try:
        # The article text is expected in "<URL_ID>.txt" in the working directory
        with open(f"{url_id}.txt", "r", encoding="utf-8") as f:
            scores = analyze_text(f.read())
            output_rows.append([url_id, url, *scores])
    except FileNotFoundError:
        # No saved text for this URL: report it and fill every metric column with 0
        print(f"File not found for URL_ID: {url_id}")
        output_rows.append([url_id, url, *([0] * (len(output_columns) - 2))])

df_output = pd.DataFrame(output_rows, columns=output_columns)

# Save as Excel
df_output.to_excel("Output Data Structure.xlsx", index=False)


In [None]:
# Offer the generated workbook as a browser download (requires the Colab runtime)
from google.colab import files

files.download(filename="Output Data Structure.xlsx")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>