<a href="https://colab.research.google.com/github/Sonia17101994/Text-Analysis-using-NLP/blob/main/Blackcoffer_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Blackcoffer Text Analysis**

#Objective:
Extract textual data from articles at the given URL and perform text analysis to compute the specified variables.


# Data Analysis



In [1]:
# Importing BeautifulSoup from the bs4 package for parsing HTML
from bs4 import BeautifulSoup
# Importing the requests library for making HTTP requests to retrieve web pages
import requests

# Install the NLTK library
!pip install nltk
# Import the NLTK library to use its functions and datasets
import nltk
# Download the Punkt tokenizer models
nltk.download('punkt')

import re
# Download the CMU Pronouncing Dictionary if you haven't already
from nltk.corpus import cmudict

# Import the function for tokenizing text into sentences from the NLTK library
from nltk.tokenize import sent_tokenize
# Import the function for tokenizing text into words from the NLTK library
from nltk.tokenize import word_tokenize

# Import the pandas library for data manipulation
import pandas as pd
import numpy as np


#Initialize an empty list to store stop words
stop_word = []
# List of stop word files
li = ['StopWords_Auditor.txt', 'StopWords_Currencies.txt', 'StopWords_DatesandNumbers.txt', 'StopWords_Generic.txt', 'StopWords_GenericLong.txt', 'StopWords_Geographic.txt', 'StopWords_Names.txt']
# Read each stop word file
for i in li:
  # Open the file in read mode
  my_file = open(i, "r", encoding="latin-1")
  # Read the file content
  data = my_file.read()
  # Split the content into a list of words
  data_into_list = data.split("\n")
  # Close the file
  my_file.close()

  # Add each stop word to the stop_word list
  for j in data_into_list:
    stop_word.append(j.lower())


# Creating dictionary of Positive and Negative words
MasterDictionary = {}
# List of positive and negative word files
li = ['positive-words.txt', 'negative-words.txt']

# Read each positive and negative word file
for i in li:
  # Open the file in read mode
  my_file = open(i, "r", encoding="latin-1")
  data = my_file.read()
  # Read the file content and split it into a list of words

  data_into_list = data.split("\n")
  # Add the list of words to the MasterDictionary with the file name
  MasterDictionary[i.strip('.txt')] = data_into_list
  my_file.close()


# Creating dictionary of Pronouncing
# Download the NLTK Punkt tokenizer
!python -m nltk.downloader punkt
# Download the CMU Pronouncing Dictionary if you haven't already
nltk.download('cmudict')
# Load the CMU Pronouncing Dictionary
pronouncing_dict = cmudict.dict()

# Creating Dataframe to store variables
Result = pd.DataFrame({'URL_ID':[],
          'URL':[],
          'POSITIVE SCORE':[],
          'NEGATIVE SCORE':[],
          'POLARITY SCORE':[],
          'SUBJECTIVITY SCORE':[],
          'AVG SENTENCE LENGTH':[],
          'PERCENTAGE OF COMPLEX WORDS':[],
          'FOG INDEX':[],
          'AVG NUMBER OF WORDS PER SENTENCE':[],
          'COMPLEX WORD COUNT':[],
          'WORD COUNT':[],
          'SYLLABLE PER COUNT':[],
          'PERSONAL PRONOUNS':[],
          'AVG WORD LENGTH':[]})



def analysis(link):
  # URL of the article to be scraped
  url = link
  # Sending a GET request to the URL to fetch the web page content
  try:
    page = requests.get(url)
    page.raise_for_status()
    # Parse the web page content with BeautifulSoup
    soup = BeautifulSoup(page.content, 'html.parser')
    # Print the parsed HTML in a readable format

    # Get the article title
    if soup.find('h1' ,class_="entry-title"):
      title = soup.find('h1' ,class_="entry-title").text
    elif soup.find('h1' ,class_="tdb-title-text"):
      title = soup.find('h1' ,class_="tdb-title-text").text
    else:
      title = None


    # Get the article content
    if soup.find('div', class_ ="td-post-content tagdiv-type"):
      context = soup.find('div', class_ ="td-post-content tagdiv-type").text
    elif soup.find('div', class_ ="tdb-block-inner td-fix-index"):
      context = soup.find('div', class_ ="tdb-block-inner td-fix-index").text
    else:
      context = None
    # Combine the title and content of the article
    article = title + context


    ## Data Analysis ##

    document = sent_tokenize(article)

    # Sentimental Analysis:
    # Cleaning by using Stop Words Lists to remove the stop words present in the article.
    # Process each sentence in the list
    sentence = []
    for i in range(0, len(document)):
      # Tokenize the sentence into words
      words  = nltk.word_tokenize(document[i])
      # Remove stop words from the list of words
      words = [word for word in words if word.lower() not in set(stop_word) and word.isalnum()]
      # Join the words back into a sentence
      sentence.append(' '.join(words))


    # Tokenize the cleaned sentence into words using NLTK
    cleaned_words = nltk.word_tokenize(' '.join(sentence))
    # Convert the list of cleaned words into a set to remove duplicate words
    cleaned_words = set(cleaned_words)

    # 1-Positive and Negative score
    # Initialize positive and negative scores
    positive_score= 0
    negative_score = 0
    # Iterate over each word in the set of cleaned words
    for word in cleaned_words:
      # Check if the word is in the positive word list
      if word.lower() in MasterDictionary.get('positive-words'):
        # Increment the positive score by 1
        positive_score= positive_score + 1

      # Check if the word is in the negative word list
      elif word.lower() in MasterDictionary.get('negative-words'):
        # Increment the negative score by 1
        negative_score = negative_score + 1

    # 2-Polarity Score
    Polarity_Score = (positive_score - negative_score) / ((positive_score + negative_score) +  0.000001)


    # 3-Subjectivity score
    Subjectivity_Score = (positive_score + negative_score) / ((len(cleaned_words)) + 0.000001)

    # Analysis of Readability:
    # 1-Average_Sentence_Length
    Average_Sentence_Length = round(len(cleaned_words) / len(sentence))

    def syllable_count(word):
        """Count the number of syllables in a word using the CMU Pronouncing Dictionary."""
        if word.lower() in pronouncing_dict:
            return max([len(list(y for y in x if y[-1].isdigit())) for x in pronouncing_dict[word.lower()]])
        else:
            # If word not found in dictionary, return 1 (assumes single syllable)
            return 1

    def is_complex_word(word):
        """Check if a word contains more than two syllables."""
        return syllable_count(word) > 2

    # Filter the words to get complex words
    complex_words = set([word for word in cleaned_words if is_complex_word(word)])


    # 2-Percentage_of_Complex_words
    Percentage_of_Complex_words = len(complex_words) / len(cleaned_words)

    # 3-Fog Index
    Fog_index = 0.4 * (Average_Sentence_Length + Percentage_of_Complex_words)

    # Average Number of Words Per Sentence
    Average_Number_of_Words_Per_Sentence = round(len(cleaned_words) / len(sentence))

    # Complex_Word_Count
    Complex_Word_Count = len(complex_words)

    # Word_Count
    Word_Count = len(cleaned_words)

    # Syllable count per word
    def count_syllables(w):
        # Handling exceptions for words ending with 'es' or 'ed'
        exceptions = ["es", "ed"]
        if word.endswith(tuple(exceptions)):
            return 0

        # Counting syllables by counting vowels
        vowels = 'aeiouAEIOU'
        syllable_count = 0
        prev_char_was_vowel = False

        for char in w:
            if char in vowels:
                # If current character is a vowel and previous wasn't, count a syllable
                if not prev_char_was_vowel:
                    syllable_count += 1
                prev_char_was_vowel = True
            else:
                prev_char_was_vowel = False

        # Handling single 'e' at the end (silent)
        if word.endswith('e') and syllable_count > 1:
            syllable_count -= 1

        # Ensure minimum syllable count is 1
        return max(syllable_count, 1)

    Total_syllable = 0
    for i in cleaned_words:
      syllable = count_syllables(i)
      Total_syllable += syllable


    # Personal_pronouns
    personal_pronouns = 0
    for i in document:
      # Define the personal pronouns to search for
      pronouns = ["I", "we", "my", "ours", "us"]

      # Compile regex pattern to match the pronouns with word boundaries
      pattern = re.compile(r'\b(?:' + '|'.join(pronouns) + r')\b', re.IGNORECASE)

      # Find all matches of the pronouns in the text
      matches = re.findall(pattern, i)

      # Count the total occurrences of personal pronouns
      personal_pronouns += len(matches)

    # Average_Word_Length
    total_number_of_characters= [len(i) for i in cleaned_words]

    Average_Word_Length = sum(total_number_of_characters) / len(cleaned_words)

    # Storing the all final values into dataframe
    Result.loc[len(Result.index)] = [id, link, positive_score, negative_score, Polarity_Score, Subjectivity_Score, Average_Sentence_Length, round(Percentage_of_Complex_words*100.0), Fog_index, Average_Number_of_Words_Per_Sentence, Complex_Word_Count, Word_Count, Total_syllable, personal_pronouns, Average_Word_Length]

  except requests.exceptions.HTTPError as err:
    # Handle HTTP errors by storing NaN values and printing the error
    Result.loc[len(Result.index)] = np.nan
    Result.iloc[len(Result)-1, 0] = id
    Result.iloc[len(Result)-1, 1] = link
    print(f"HTTP error occurred: {err}")
  except requests.exceptions.RequestException as err:
    Result.loc[len(Result.index)] = np.nan
    Result.iloc[len(Result)-1, 0] = id
    Result.iloc[len(Result)-1, 1] = link

# Read input data from Excel file
Input = pd.read_excel('Input.xlsx')
for i in range(0, len(Input)):
  id = Input['URL_ID'][i]
  analysis(Input['URL'][i])

# Download the output file with results
from google.colab import files
Result.to_excel('Output.xlsx')
files.download('Output.xlsx')






[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


HTTP error occurred: 404 Client Error: Not Found for url: https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/
HTTP error occurred: 404 Client Error: Not Found for url: https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [2]:
#Print the result
Result

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER COUNT,PERSONAL PRONOUNS,AVG WORD LENGTH
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,15.0,6.0,0.428571,0.090129,3.0,34.0,1.335622,3.0,79.0,233.0,547.0,12.0,7.017167
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,39.0,24.0,0.238095,0.138462,6.0,45.0,2.579341,6.0,204.0,455.0,1193.0,6.0,7.668132
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,27.0,16.0,0.255814,0.129129,6.0,52.0,2.607808,6.0,173.0,333.0,0.0,13.0,8.267267
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,28.0,65.0,-0.397849,0.227941,8.0,46.0,3.384314,8.0,188.0,408.0,773.0,5.0,8.245098
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,18.0,8.0,0.384615,0.107438,6.0,44.0,2.576860,6.0,107.0,242.0,619.0,6.0,7.623967
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,blackassign0096,https://insights.blackcoffer.com/what-is-the-r...,18.0,37.0,-0.345455,0.157143,7.0,44.0,2.974857,7.0,153.0,350.0,890.0,4.0,7.505714
96,blackassign0097,https://insights.blackcoffer.com/impact-of-cov...,14.0,27.0,-0.317073,0.170124,6.0,32.0,2.529461,6.0,78.0,241.0,547.0,7.0,6.817427
97,blackassign0098,https://insights.blackcoffer.com/contribution-...,5.0,3.0,0.250000,0.055172,12.0,40.0,4.960000,12.0,58.0,145.0,0.0,0.0,7.420690
98,blackassign0099,https://insights.blackcoffer.com/how-covid-19-...,0.0,0.0,0.000000,0.000000,3.0,67.0,1.466667,3.0,2.0,3.0,9.0,0.0,9.000000
