<a href="https://colab.research.google.com/github/NithyaPKiran/Blackcoffer-data-scientist-assignments/blob/main/Text_extraction_and_analysis_Blackcoffer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [44]:
# Installing and checking syllapy
!pip install syllapy
!pip show syllapy
import syllapy

Name: syllapy
Version: 0.7.2
Summary: Calculate syllable counts for English words.
Home-page: https://github.com/mholtzscher/syllapy
Author: Michael Holtzscher
Author-email: michael.holtzscher@gmail.com
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: 
Required-by: 


In [45]:
#importing the libraries
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob

In [46]:
# Download the NLTK resources this notebook relies on into the local NLTK data directory.
# NOTE(review): 'punkt' presumably backs word_tokenize/sent_tokenize — confirm for the installed NLTK version.
# NOTE(review): nltk.corpus.stopwords is imported above, but the visible cells only use the custom stopword files.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [47]:
# Load a word list (stopwords or sentiment lexicon) from a text file
def load_words(file_path):
    """Read a one-entry-per-line word list and return it as a set.

    Entries are stripped of surrounding whitespace and lowercased, because
    every lookup in this notebook compares against ``word.lower()``; an
    uppercase or padded entry would otherwise never match. Blank lines are
    dropped. The file is decoded as UTF-8, falling back to Latin-1 when that
    raises ``UnicodeDecodeError``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        set[str]: The normalised, non-empty entries.

    Raises:
        OSError: If the file cannot be opened.
    """
    def _read_lines(encoding):
        # Single place for the open/read logic so both encodings behave identically
        with open(file_path, 'r', encoding=encoding) as file:
            return file.read().splitlines()

    try:
        lines = _read_lines('utf-8')
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a character, so this retry cannot fail on decoding
        lines = _read_lines('latin-1')

    return {entry.strip().lower() for entry in lines if entry.strip()}

In [48]:
# Function to extract article text from a URL
def extract_article_text(url, timeout=30):
    """Download a page and return the title and paragraph text of its <article>.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` so that an unresponsive
            server cannot block the whole run indefinitely.

    Returns:
        tuple: ``(title, text)`` when the page was fetched with status 200 and
        contains an ``<article>`` element; ``text`` is the stripped text of all
        ``<p>`` elements inside it joined by single spaces. The title is taken
        from the ``<h1>`` inside the article, else from the first ``<h1>`` of
        the page, else it is an empty string. ``(None, None)`` is returned
        (after printing a message) on a non-200 status, a missing ``<article>``
        or any exception.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to retrieve {url}. Status code: {response.status_code}")
            return None, None

        soup = BeautifulSoup(response.content, 'html.parser')

        # Assuming article is contained in <article> tag, adjust based on specific HTML structure
        article = soup.find('article')
        if article is None:
            print(f"No article content found for {url}")
            return None, None

        # A missing <h1> inside <article> must not discard the article text:
        # fall back to the page-level heading, then to an empty title.
        heading = article.find('h1')
        if heading is None:
            heading = soup.find('h1')
        article_title = heading.get_text().strip() if heading is not None else ''

        # Extract text
        paragraphs = article.find_all('p')
        article_text = ' '.join(p.get_text().strip() for p in paragraphs)

        return article_title, article_text
    except Exception as e:
        # Deliberate best-effort: one bad URL is reported and skipped, not fatal
        print(f"Error accessing {url}: {e}")
        return None, None

In [49]:
# Function to clean text using stopwords
def clean_text(text, stopwords_set):
    """Return text reduced to its alphabetic, non-stopword tokens.

    The text is split with ``word_tokenize``. A token is kept when its
    lowercase form is not in ``stopwords_set`` and ``str.isalpha()`` is true
    for it. Kept tokens retain their original casing and are joined with
    single spaces.

    Args:
        text: Raw text to clean.
        stopwords_set: Collection of lowercase words to drop.

    Returns:
        str: The kept tokens joined by single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in stopwords_set:
            continue
        if not token.isalpha():
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [50]:
# Function to calculate positive score
def positive_score(text, positive_words):
    """Count the tokens of text whose lowercase form is in positive_words.

    Args:
        text: Text to score; it is split with ``word_tokenize``.
        positive_words: Collection of lowercase positive words.

    Returns:
        int: Number of matching tokens (repeated words count each time).
    """
    hits = 0
    for token in word_tokenize(text):
        if token.lower() in positive_words:
            hits += 1
    return hits

In [51]:
# Function to calculate negative score
def negative_score(text, negative_words):
    """Count the tokens of text whose lowercase form is in negative_words.

    Args:
        text: Text to score; it is split with ``word_tokenize``.
        negative_words: Collection of lowercase negative words.

    Returns:
        int: Number of matching tokens (repeated words count each time).
    """
    matches = [token for token in word_tokenize(text) if token.lower() in negative_words]
    return len(matches)

In [52]:
# Function to calculate polarity score
def polarity_score(positive_score, negative_score):
    """Return (positive - negative) / (positive + negative + 0.000001).

    Args:
        positive_score: Count of positive words.
        negative_score: Count of negative words.

    Returns:
        float: The polarity ratio; 0.0 when both counts are 0.
    """
    # The tiny constant keeps the denominator non-zero when both counts are 0
    denominator = positive_score + negative_score + 0.000001
    difference = positive_score - negative_score
    return difference / denominator

In [53]:
# Function to calculate subjectivity score
def subjectivity_score(positive_score, negative_score, total_words):
    """Return (positive + negative) / (total_words + 0.000001).

    Args:
        positive_score: Count of positive words.
        negative_score: Count of negative words.
        total_words: Number of words the counts were taken from.

    Returns:
        float: Share of sentiment-bearing words; 0.0 when all inputs are 0.
    """
    sentiment_hits = positive_score + negative_score
    # The tiny constant keeps the denominator non-zero for an empty text
    return sentiment_hits / (total_words + 0.000001)

In [54]:
# Function to calculate average sentence length
def avg_sentence_length(text):
    """Return the mean number of tokens per sentence in text.

    Sentences come from ``sent_tokenize`` and tokens from ``word_tokenize``,
    so every token the tokenizer emits is counted.

    Args:
        text: Text to measure.

    Returns:
        float: Total token count divided by the number of sentences, or 0.0
        when the text contains no sentences (e.g. empty or whitespace-only
        input), instead of raising ZeroDivisionError.
    """
    sentences = sent_tokenize(text)
    if not sentences:
        return 0.0
    total_tokens = sum(len(word_tokenize(sentence)) for sentence in sentences)
    return total_tokens / len(sentences)

In [55]:
# Function to calculate percentage of complex words
def percentage_complex_words(text, stopwords_set):
    """Return the percentage of tokens in text that count as complex.

    A token counts as complex when it is longer than 6 characters and its
    lowercase form is not in ``stopwords_set``. The percentage is taken over
    all tokens returned by ``word_tokenize``.

    Args:
        text: Text to measure.
        stopwords_set: Collection of lowercase words that never count as complex.

    Returns:
        float: Value between 0 and 100; 0.0 when the text has no tokens.
    """
    tokens = word_tokenize(text)
    if len(tokens) == 0:
        return 0.0

    complex_total = 0
    for token in tokens:
        if len(token) > 6 and token.lower() not in stopwords_set:
            complex_total += 1
    return (complex_total / len(tokens)) * 100

In [57]:
# Function to calculate FOG index
def fog_index(avg_sentence_length, percentage_complex_words):
    """Return 0.4 * (avg_sentence_length + percentage_complex_words).

    Args:
        avg_sentence_length: Average sentence length of the text.
        percentage_complex_words: Percentage of complex words in the text.

    Returns:
        float: The weighted sum of the two inputs.
    """
    combined = avg_sentence_length + percentage_complex_words
    return 0.4 * combined

In [58]:
# Function to calculate average number of words per sentence
def avg_words_per_sentence(text):
    """Return the mean number of tokens per sentence in text.

    Sentences come from ``sent_tokenize`` and tokens from ``word_tokenize``.
    This is the same calculation as ``avg_sentence_length``.

    Args:
        text: Text to measure.

    Returns:
        float: Total token count divided by the number of sentences, or 0.0
        when the text contains no sentences, instead of raising
        ZeroDivisionError.
    """
    sentences = sent_tokenize(text)
    if not sentences:
        return 0.0
    words_per_sentence = [len(word_tokenize(sentence)) for sentence in sentences]
    return sum(words_per_sentence) / len(sentences)

In [59]:
# Function to count personal pronouns
def personal_pronouns(text):
    """Count first-person pronouns in text.

    Tokens are matched case-insensitively against a fixed set of first-person
    singular and plural pronouns. The all-caps token ``US`` is skipped so the
    country abbreviation is not counted as the pronoun "us"; for that reason
    the text is tokenized in its original casing rather than lowercased first.

    Args:
        text: Text to scan; it is split with ``word_tokenize``.

    Returns:
        int: Number of pronoun tokens found.
    """
    pronouns = {'i', 'me', 'my', 'mine', 'myself', 'we', 'us', 'our', 'ours', 'ourselves'}
    count = 0
    for word in word_tokenize(text):
        if word == 'US':
            # Country abbreviation, not the pronoun
            continue
        if word.lower() in pronouns:
            count += 1
    return count

In [60]:
# Function to calculate average word length
def avg_word_length(text):
    """Return the mean character length of the tokens in text.

    Args:
        text: Text to measure; it is split with ``word_tokenize``.

    Returns:
        float: Total characters across tokens divided by the token count, or
        0.0 when the text has no tokens (e.g. cleaning removed every word),
        instead of raising ZeroDivisionError.
    """
    words = word_tokenize(text)
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)

In [61]:
# Function to compute syllables per word
def syllables_per_word(text):
    """Return the mean syllable count per token of text.

    Tokens come from ``word_tokenize``; each token's syllables are counted
    with ``syllapy.count``.

    Args:
        text: Text to measure.

    Returns:
        Total syllables divided by the number of tokens, or 0 when the text
        has no tokens.
    """
    tokens = word_tokenize(text)
    if not tokens:
        return 0

    total_syllables = 0
    for token in tokens:
        total_syllables += syllapy.count(token)
    return total_syllables / len(tokens)

In [62]:
# Function to perform sentiment analysis (using TextBlob)
def sentiment_analysis(text):
    """Return the polarity and subjectivity that TextBlob reports for text.

    Args:
        text: Text to analyse.

    Returns:
        tuple: ``(polarity, subjectivity)`` read from ``TextBlob(text).sentiment``.
    """
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity, sentiment.subjectivity

In [63]:
# Function to load stopwords from multiple files into a set
def load_stopwords(stopword_files):
    """Merge the word lists of several files into one set.

    Each path is read with ``load_words`` in the order given.

    Args:
        stopword_files: Iterable of file paths.

    Returns:
        set: Union of the words from every file; empty when no paths are given.
    """
    per_file_words = (load_words(path) for path in stopword_files)
    return set().union(*per_file_words)

In [64]:
# Paths to stopword files
# These are relative paths, so the files must be present in the notebook's working directory.
# Every file listed here is read and merged into a single set by load_stopwords().
stopword_files = [
    'StopWords_Auditor.txt',
    'StopWords_Currencies.txt',
    'StopWords_DatesandNumbers.txt',
    'StopWords_Generic.txt',
    'StopWords_GenericLong.txt',
    'StopWords_Geographic.txt',
    'StopWords_Names.txt'
]

In [65]:
# Load stopwords into a set
# The merged set is later passed to clean_text() and percentage_complex_words().
stopwords_set = load_stopwords(stopword_files)

In [66]:
# Paths to the positive and negative word-list files
# Relative paths: both files are expected in the notebook's working directory.
positive_words_file = 'positive-words.txt'
negative_words_file = 'negative-words.txt'


In [67]:
# Load positive and negative words into sets (load_words returns a set, not a list)
# They are used for membership tests by positive_score() and negative_score().
positive_words = load_words(positive_words_file)
negative_words = load_words(negative_words_file)

In [68]:
# Load Input.xlsx (expected in the working directory) and preview its first rows
# The extraction loop reads the 'URL_ID' and 'URL' columns of this frame.
input_df = pd.read_excel('Input.xlsx')
input_df.head()

Unnamed: 0,URL_ID,URL
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...


In [69]:
# Summarise the input frame: row count, column names, non-null counts and dtypes
input_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   URL_ID  100 non-null    object
 1   URL     100 non-null    object
dtypes: object(2)
memory usage: 1.7+ KB


In [70]:
# Create a directory to store extracted texts
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
output_dir = 'extracted_texts'
os.makedirs(output_dir, exist_ok=True)

In [71]:
# Initialize an empty list to store analysis results
# The extraction loop appends one row (a list of metric values) per successfully processed article.
results = []

In [73]:
# Iterate through input_df, extract each article, compute its metrics and export the results.
# `results` is reset here so that re-running this cell does not append duplicate rows.
results = []

for index, row in input_df.iterrows():
    url = row['URL']
    url_id = row['URL_ID']

    article_title, article_text = extract_article_text(url)

    # Nothing to analyse when extraction failed or the article had no paragraph text;
    # such URLs are left out of the output table.
    if not article_text:
        continue

    # Save article text to a text file
    file_path = os.path.join(output_dir, f"{url_id}.txt")
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(f"{article_title}\n\n{article_text}")

    print(f"Extracted and saved: {file_path}")

    # Perform text cleaning; tokenize the cleaned text once and reuse the tokens below
    cleaned_text = clean_text(article_text, stopwords_set)
    cleaned_tokens = word_tokenize(cleaned_text)
    word_count = len(cleaned_tokens)

    # Custom sentiment analysis (computed on the cleaned text)
    pos_score = positive_score(cleaned_text, positive_words)
    neg_score = negative_score(cleaned_text, negative_words)
    pol_score = polarity_score(pos_score, neg_score)
    subj_score = subjectivity_score(pos_score, neg_score, word_count)

    # TextBlob sentiment analysis (computed on the raw article text)
    polarity_textblob, subjectivity_textblob = sentiment_analysis(article_text)

    # Calculate other text analysis metrics.
    # Sentence-level metrics and pronoun counts use the raw text; word-level metrics use the cleaned text.
    avg_sent_len = avg_sentence_length(article_text)
    percent_complex = percentage_complex_words(cleaned_text, stopwords_set)
    fog_idx = fog_index(avg_sent_len, percent_complex)
    avg_words_per_sent = avg_words_per_sentence(article_text)
    complex_word_count = sum(1 for word in cleaned_tokens if len(word) > 6)
    syllables_per_word_count = syllables_per_word(cleaned_text)
    personal_pronouns_count = personal_pronouns(article_text)
    avg_word_len = avg_word_length(cleaned_text)

    # Print or save these variables as required
    print(f"URL_ID: {url_id}")
    print(f"URL: {url}")
    print(f"AVG SENTENCE LENGTH: {avg_sent_len}")
    print(f"PERCENTAGE OF COMPLEX WORDS: {percent_complex}")
    print(f"FOG INDEX: {fog_idx}")
    print(f"AVG NUMBER OF WORDS PER SENTENCE: {avg_words_per_sent}")
    print(f"COMPLEX WORD COUNT: {complex_word_count}")
    print(f"WORD COUNT: {word_count}")
    print(f"SYLLABLE PER WORD: {syllables_per_word_count}")
    print(f"PERSONAL PRONOUNS: {personal_pronouns_count}")
    print(f"AVG WORD LENGTH: {avg_word_len}")
    print(f"POLARITY SCORE (Custom): {pol_score}")
    print(f"SUBJECTIVITY SCORE (Custom): {subj_score}")
    print(f"POLARITY SCORE (TextBlob): {polarity_textblob}")
    print(f"SUBJECTIVITY SCORE (TextBlob): {subjectivity_textblob}")
    print("-----------------------")

    # Append results to the list (order must match `columns` below)
    results.append([
        url_id, url, avg_sent_len, percent_complex, fog_idx, avg_words_per_sent,
        complex_word_count, word_count, syllables_per_word_count, personal_pronouns_count, avg_word_len,
        pol_score, subj_score, polarity_textblob, subjectivity_textblob
    ])

# Create a DataFrame to store results and save to an Excel file
columns = [
    'URL_ID', 'URL', 'AVG_SENTENCE_LENGTH', 'PERCENTAGE_OF_COMPLEX_WORDS', 'FOG_INDEX', 'AVG_NUMBER_OF_WORDS_PER_SENTENCE',
    'COMPLEX_WORD_COUNT', 'WORD_COUNT', 'SYLLABLE_PER_WORD', 'PERSONAL_PRONOUNS', 'AVG_WORD_LENGTH',
    'POLARITY_SCORE_CUSTOM', 'SUBJECTIVITY_SCORE_CUSTOM', 'POLARITY_SCORE_TEXTBLOB', 'SUBJECTIVITY_SCORE_TEXTBLOB'
]

results_df = pd.DataFrame(results, columns=columns)
results_df.to_excel('Output_Data_Structure.xlsx', index=False)
print("Output_Data_Structure.xlsx")


Extracted and saved: extracted_texts/blackassign0001.txt
URL_ID: blackassign0001
URL: https://insights.blackcoffer.com/rising-it-cities-and-its-impact-on-the-economy-environment-infrastructure-and-city-life-by-the-year-2040-2/
AVG SENTENCE LENGTH: 15.541666666666666
PERCENTAGE OF COMPLEX WORDS: 40.52287581699346
FOG INDEX: 22.425816993464053
AVG NUMBER OF WORDS PER SENTENCE: 15.541666666666666
COMPLEX WORD COUNT: 62
WORD COUNT: 153
SYLLABLE PER WORD: 2.156862745098039
PERSONAL PRONOUNS: 4
AVG WORD LENGTH: 6.522875816993464
POLARITY SCORE (Custom): 0.7142856122449125
SUBJECTIVITY SCORE (Custom): 0.04575163368789782
POLARITY SCORE (TextBlob): 0.2435176892073444
SUBJECTIVITY SCORE (TextBlob): 0.583195253022839
-----------------------
Extracted and saved: extracted_texts/blackassign0002.txt
URL_ID: blackassign0002
URL: https://insights.blackcoffer.com/rising-it-cities-and-their-impact-on-the-economy-environment-infrastructure-and-city-life-in-future/
AVG SENTENCE LENGTH: 20.90909090909091


In [76]:
# Rebuild the results table from `results` and write it to Output_Data_Structure.xlsx again.
# This is the same file name the extraction-loop cell exports to, so that file is overwritten.
results_df = pd.DataFrame(results, columns=columns)
results_df.to_excel('Output_Data_Structure.xlsx', index=False)
print("Data successfully updated in Output_Data_Structure.xlsx")

Data successfully updated in Output_Data_Structure.xlsx
