<a href="https://colab.research.google.com/github/NithyaPKiran/Blackcoffer-data-scientist-assignments/blob/main/Assignment_1_Extraction_of_textual_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# Install and check syllapy
!pip install syllapy
!pip show syllapy

import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob
import syllapy

Name: syllapy
Version: 0.7.2
Summary: Calculate syllable counts for English words.
Home-page: https://github.com/mholtzscher/syllapy
Author: Michael Holtzscher
Author-email: michael.holtzscher@gmail.com
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: 
Required-by: 


In [15]:
# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# Function to extract article text from a URL
def extract_article_text(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')

            # Assuming article is contained in <article> tag, adjust based on specific HTML structure
            article = soup.find('article')
            if article:
                # Extract title
                article_title = article.find('h1').get_text().strip()

                # Extract text
                paragraphs = article.find_all('p')
                article_text = ' '.join([p.get_text().strip() for p in paragraphs])

                return article_title, article_text
            else:
                print(f"No article content found for {url}")
                return None, None
        else:
            print(f"Failed to retrieve {url}. Status code: {response.status_code}")
            return None, None
    except Exception as e:
        print(f"Error accessing {url}: {e}")
        return None, None

In [17]:
# Function to calculate positive score
def positive_score(text):
    words = word_tokenize(text)
    return sum(1 for word in words if word.lower() in positive_words)

# Function to calculate negative score
def negative_score(text):
    words = word_tokenize(text)
    return sum(1 for word in words if word.lower() in negative_words)

# Function to calculate average sentence length
def avg_sentence_length(text):
    sentences = sent_tokenize(text)
    return sum(len(word_tokenize(sentence)) for sentence in sentences) / len(sentences)

# Function to calculate percentage of complex words
def percentage_complex_words(text):
    words = word_tokenize(text)
    english_stopwords = set(stopwords.words("english"))  # Load stopwords correctly

    # Filter out complex words
    complex_words = [word for word in words if word.lower() not in english_stopwords and len(word) > 6]

    # Calculate percentage
    if len(words) > 0:
        return (len(complex_words) / len(words)) * 100
    else:
        return 0.0  # Return 0 if no words are present in the text

# Function to calculate FOG index
def fog_index(avg_sentence_length, percentage_complex_words):
    return 0.4 * (avg_sentence_length + percentage_complex_words)

# Function to calculate average number of words per sentence
def avg_words_per_sentence(text):
    sentences = sent_tokenize(text)
    words_per_sentence = [len(word_tokenize(sentence)) for sentence in sentences]
    return sum(words_per_sentence) / len(sentences)

# Function to count personal pronouns
def personal_pronouns(text):
    pronouns = ['i', 'me', 'my', 'mine', 'myself', 'we', 'us', 'our', 'ours', 'ourselves']
    words = word_tokenize(text.lower())
    return sum(1 for word in words if word in pronouns)

# Function to calculate average word length
def avg_word_length(text):
    words = word_tokenize(text)
    return sum(len(word) for word in words) / len(words)

# Function to compute syllables per word
def syllables_per_word(text):
    words = word_tokenize(text)
    syllable_count = sum(syllapy.count(word) for word in words)
    return syllable_count / len(words) if words else 0

# Function to perform sentiment analysis (using TextBlob)
def sentiment_analysis(text):
    blob = TextBlob(text)
    polarity_score = blob.sentiment.polarity
    subjectivity_score = blob.sentiment.subjectivity
    return polarity_score, subjectivity_score

In [18]:
# Load Input.xlsx
input_df = pd.read_excel('Input.xlsx')

In [19]:
# Create a directory to store extracted texts
output_dir = 'extracted_texts'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

In [20]:
# Initialize an empty list to store analysis results
results = []

In [21]:
# Iterate through input_df and extract article text
for index, row in input_df.iterrows():
    url = row['URL']
    url_id = row['URL_ID']

    article_title, article_text = extract_article_text(url)

    if article_text:
        # Save article text to a text file
        file_path = os.path.join(output_dir, f"{url_id}.txt")
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(f"{article_title}\n\n{article_text}")

        print(f"Extracted and saved: {file_path}")

        # Perform text analysis
        avg_sentence_len = avg_sentence_length(article_text)
        percent_complex_words = percentage_complex_words(article_text)
        fog_index_value = fog_index(avg_sentence_len, percent_complex_words)
        avg_words_per_sentence_value = avg_words_per_sentence(article_text)
        complex_word_count = len([word for word in word_tokenize(article_text) if len(word) > 6])
        word_count = len(word_tokenize(article_text))
        syllables_per_word_value = syllables_per_word(article_text)
        personal_pronouns_count = personal_pronouns(article_text)
        avg_word_length_value = avg_word_length(article_text)
        polarity_score, subjectivity_score = sentiment_analysis(article_text)

        # Print or save these variables as required
        print(f"URL_ID: {url_id}")
        print(f"AVG SENTENCE LENGTH: {avg_sentence_len}")
        print(f"PERCENTAGE OF COMPLEX WORDS: {percent_complex_words}")
        print(f"FOG INDEX: {fog_index_value}")
        print(f"AVG NUMBER OF WORDS PER SENTENCE: {avg_words_per_sentence_value}")
        print(f"COMPLEX WORD COUNT: {complex_word_count}")
        print(f"WORD COUNT: {word_count}")
        print(f"SYLLABLE PER WORD: {syllables_per_word_value}")
        print(f"PERSONAL PRONOUNS: {personal_pronouns_count}")
        print(f"AVG WORD LENGTH: {avg_word_length_value}")
        print(f"POLARITY SCORE: {polarity_score}")
        print(f"SUBJECTIVITY SCORE: {subjectivity_score}")
        print("-----------------------")

        # Append results to the list
        results.append([
            url_id, avg_sentence_len, percent_complex_words, fog_index_value, avg_words_per_sentence_value,
            complex_word_count, word_count, syllables_per_word_value, personal_pronouns_count, avg_word_length_value,
            polarity_score, subjectivity_score
        ])

Extracted and saved: extracted_texts/blackassign0001.txt
URL_ID: blackassign0001
AVG SENTENCE LENGTH: 15.541666666666666
PERCENTAGE OF COMPLEX WORDS: 16.621983914209114
FOG INDEX: 12.865460232350314
AVG NUMBER OF WORDS PER SENTENCE: 15.541666666666666
COMPLEX WORD COUNT: 62
WORD COUNT: 373
SYLLABLE PER WORD: 1.4101876675603218
PERSONAL PRONOUNS: 4
AVG WORD LENGTH: 4.193029490616622
POLARITY SCORE: 0.2435176892073444
SUBJECTIVITY SCORE: 0.583195253022839
-----------------------
Extracted and saved: extracted_texts/blackassign0002.txt
URL_ID: blackassign0002
AVG SENTENCE LENGTH: 20.90909090909091
PERCENTAGE OF COMPLEX WORDS: 27.888198757763977
FOG INDEX: 19.518915866741956
AVG NUMBER OF WORDS PER SENTENCE: 20.90909090909091
COMPLEX WORD COUNT: 457
WORD COUNT: 1610
SYLLABLE PER WORD: 1.6273291925465838
PERSONAL PRONOUNS: 10
AVG WORD LENGTH: 4.8826086956521735
POLARITY SCORE: 0.12183670356489147
SUBJECTIVITY SCORE: 0.43121502658415417
-----------------------
Extracted and saved: extracted_

In [22]:
# Create a DataFrame to store results and save to an Excel file
columns = [
    'URL_ID', 'AVG_SENTENCE_LENGTH', 'PERCENTAGE_OF_COMPLEX_WORDS', 'FOG_INDEX', 'AVG_NUMBER_OF_WORDS_PER_SENTENCE',
    'COMPLEX_WORD_COUNT', 'WORD_COUNT', 'SYLLABLE_PER_WORD', 'PERSONAL_PRONOUNS', 'AVG_WORD_LENGTH',
    'POLARITY_SCORE', 'SUBJECTIVITY_SCORE'
]

In [23]:
results_df = pd.DataFrame(results, columns=columns)
results_df.to_excel('Assignment 1 - Output_Data_Structure.xlsx', index=False)
print("Results saved to Assignment 1 - Output_Data_Structure.xlsx")


Results saved to Assignment 1 - Output_Data_Structure.xlsx
