<a href="https://colab.research.google.com/github/Savisolanki/Text-Analysis-Using-NLP/blob/main/Text_Analysis_BlackCoffer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import cmudict

# Fetch the NLTK resources this script relies on: the sentence/word
# tokenizer models, the VADER sentiment lexicon and the CMU pronouncing
# dictionary.
for package in ('punkt', 'vader_lexicon', 'cmudict'):
    nltk.download(package)


# Spreadsheet listing the articles to analyse; the processing loop reads its
# URL_ID and URL columns.
input_path = "/content/Input.xlsx"
input_df = pd.read_excel(input_path)


def get_article_text(url, timeout=30):
    """Download a web page and return the text of all its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on an
            unresponsive host and stall the whole run.

    Returns:
        The text of every ``<p>`` tag in document order, each followed by a
        newline. An empty string when the page contains no paragraphs.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.

    Note:
        The HTTP status code is not checked, so the body of an error page is
        parsed like any other page.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Join once instead of growing a string with repeated "+=".
    return "".join(paragraph.get_text() + "\n" for paragraph in soup.find_all('p'))

# Shared analyzer instance; created on first use so that the analyzer is not
# constructed again for every article.
_SENTIMENT_ANALYZER = None


def _get_sentiment_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it if needed."""
    global _SENTIMENT_ANALYZER
    if _SENTIMENT_ANALYZER is None:
        _SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()
    return _SENTIMENT_ANALYZER


def analyze_sentiment(text):
    """Score *text* with NLTK's VADER sentiment analyzer.

    Args:
        text: The text to score.

    Returns:
        A 4-tuple ``(pos, neg, compound, neu)`` taken from the corresponding
        keys of ``polarity_scores``. Note the order: the neutral score comes
        last.
    """
    scores = _get_sentiment_analyzer().polarity_scores(text)
    return scores['pos'], scores['neg'], scores['compound'], scores['neu']

def calculate_complexity(text):
    """Compute the average sentence length and the share of complex words.

    A token is treated as "complex" when it is longer than six characters.
    Tokens come from ``word_tokenize``, so punctuation tokens are included in
    the word count.

    Args:
        text: The text to analyse.

    Returns:
        A tuple ``(avg_sentence_length, percentage_complex_words)`` where the
        first item is tokens per sentence and the second is a percentage in
        the range 0-100. ``(0.0, 0.0)`` is returned for empty or
        whitespace-only text instead of raising ``ZeroDivisionError``.
    """
    # Nothing to tokenize (e.g. a page without any <p> tags).
    if not text or not text.strip():
        return 0.0, 0.0

    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    # Defensive: avoid dividing by zero if tokenization yields nothing.
    if not sentences or not words:
        return 0.0, 0.0

    num_words = len(words)
    avg_sentence_length = num_words / len(sentences)
    complex_word_total = sum(1 for word in words if len(word) > 6)
    percentage_complex_words = (complex_word_total / num_words) * 100
    return avg_sentence_length, percentage_complex_words

def calculate_fog_index(avg_sentence_length, percentage_complex_words):
    """Return 0.4 times the sum of the two readability inputs.

    Args:
        avg_sentence_length: Average number of words per sentence.
        percentage_complex_words: Percentage of words considered complex.

    Returns:
        The resulting fog index value.
    """
    combined = avg_sentence_length + percentage_complex_words
    return 0.4 * combined

def calculate_avg_number_of_words_per_sentence(text):
    """Return the average number of tokens per sentence in *text*.

    Sentences come from ``sent_tokenize`` and tokens from ``word_tokenize``,
    so punctuation tokens are included in the count.

    Args:
        text: The text to analyse.

    Returns:
        Token count divided by sentence count, or ``0.0`` for empty or
        whitespace-only text instead of raising ``ZeroDivisionError``.
    """
    # Nothing to tokenize (e.g. a page without any <p> tags).
    if not text or not text.strip():
        return 0.0

    sentences = sent_tokenize(text)
    # Defensive: avoid dividing by zero if no sentence is detected.
    if not sentences:
        return 0.0

    return len(word_tokenize(text)) / len(sentences)

def calculate_complex_word_count(text):
    """Count the tokens of *text* that are longer than six characters.

    Args:
        text: The text to analyse; it is split with ``word_tokenize``.

    Returns:
        The number of tokens whose length exceeds six.
    """
    return sum(1 for token in word_tokenize(text) if len(token) > 6)

# CMU pronouncing dictionary, loaded on first use. Building it is costly, so
# it is kept for subsequent calls rather than rebuilt for every article.
_CMU_PRONUNCIATIONS = None


def _get_cmu_pronunciations():
    """Return the cached CMU pronouncing dictionary, loading it if needed."""
    global _CMU_PRONUNCIATIONS
    if _CMU_PRONUNCIATIONS is None:
        _CMU_PRONUNCIATIONS = cmudict.dict()
    return _CMU_PRONUNCIATIONS


def calculate_syllables_per_word(text):
    """Return the average syllable count of the dictionary words in *text*.

    Only tokens found (lower-cased) in the CMU pronouncing dictionary are
    counted. A phoneme ending in a digit is counted as one syllable, and when
    a word has several pronunciations the one with the most syllables is
    used.

    Args:
        text: The text to analyse.

    Returns:
        Mean syllables per recognised word, or ``0.0`` when the text is empty
        or contains no word known to the dictionary (instead of raising
        ``ZeroDivisionError``).
    """
    # Skip the dictionary load entirely when there is nothing to analyse.
    if not text or not text.strip():
        return 0.0

    pronunciations = _get_cmu_pronunciations()
    syllable_counts = []
    for word in word_tokenize(text):
        variants = pronunciations.get(word.lower())
        if not variants:
            continue
        syllable_counts.append(
            max(sum(1 for ph in phones if ph[-1].isdigit()) for phones in variants)
        )

    # E.g. text made only of punctuation or out-of-dictionary tokens.
    if not syllable_counts:
        return 0.0
    return sum(syllable_counts) / len(syllable_counts)

def calculate_personal_pronouns(text):
    """Count the personal-pronoun tokens in *text*.

    The text is lower-cased before tokenizing, so matching is
    case-insensitive (for example the token "US" is counted as "us").

    Args:
        text: The text to analyse.

    Returns:
        The number of tokens that appear in the pronoun collection below.
    """
    personal_pronouns = {
        'i', 'me', 'my', 'mine', 'myself',
        'we', 'us', 'our', 'ours', 'ourselves',
        'you', 'your', 'yours', 'yourself', 'yourselves',
        'he', 'him', 'his', 'himself',
        'she', 'her', 'hers', 'herself',
        'it', 'its', 'itself',
        'they', 'them', 'their', 'theirs', 'themselves',
    }
    lowered_tokens = word_tokenize(text.lower())
    return len([token for token in lowered_tokens if token in personal_pronouns])

def calculate_avg_word_length(text):
    """Return the mean character length of the tokens in *text*.

    Tokens come from ``word_tokenize``, so punctuation tokens are included.

    Args:
        text: The text to analyse.

    Returns:
        Total token characters divided by token count, or ``0.0`` for empty
        or whitespace-only text instead of raising ``ZeroDivisionError``.
    """
    # Nothing to tokenize (e.g. a page without any <p> tags).
    if not text or not text.strip():
        return 0.0

    words = word_tokenize(text)
    # Defensive: avoid dividing by zero if tokenization yields nothing.
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)

# Process every row of the input sheet: download the article, compute each
# metric and collect one result row per URL.
results = []
for index, row in input_df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    article_text = get_article_text(url)
    # NOTE(review): analyze_sentiment returns (pos, neg, compound, neu), so
    # polarity_score holds VADER's compound score and subjectivity_score holds
    # the neutral score - confirm this labelling is intended.
    positive_score, negative_score, polarity_score, subjectivity_score = analyze_sentiment(article_text)
    avg_sentence_length, percentage_complex_words = calculate_complexity(article_text)
    fog_index = calculate_fog_index(avg_sentence_length, percentage_complex_words)
    # Computed the same way as avg_sentence_length (tokens / sentences), so
    # the two values are identical.
    avg_words_per_sentence = calculate_avg_number_of_words_per_sentence(article_text)
    complex_word_count = calculate_complex_word_count(article_text)
    # Token count from word_tokenize; punctuation tokens are included.
    word_count = len(word_tokenize(article_text))
    syllable_per_word = calculate_syllables_per_word(article_text)
    personal_pronouns = calculate_personal_pronouns(article_text)
    avg_word_length = calculate_avg_word_length(article_text)
    # The order of values here must match the `columns` list used to build
    # the output DataFrame.
    results.append([url_id, url, positive_score, negative_score, polarity_score, subjectivity_score,
                    avg_sentence_length, percentage_complex_words, fog_index, avg_words_per_sentence,
                    complex_word_count, word_count, syllable_per_word, personal_pronouns, avg_word_length])


# Column headers for the output table, in the same order as the values
# appended to `results`.
columns = ['URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE',
           'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
           'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT',
           'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH']
output_df = pd.DataFrame(results, columns=columns)


# Write the results to a CSV file in the working directory, without the
# DataFrame index column.
output_csv_path = "output.csv"
output_df.to_csv(output_csv_path, index=False)


# Preview the first rows (displayed as the notebook cell's output).
output_df.head()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,0.126,0.005,0.9957,0.869,20.896552,24.257426,18.061591,20.896552,147,606,1.809717,27,4.731023
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,0.144,0.035,0.9995,0.821,22.47561,29.408573,20.753673,22.47561,542,1843,1.898448,38,4.972328
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,0.116,0.046,0.998,0.838,23.590164,37.387074,24.390895,23.590164,538,1439,2.097234,35,5.502432
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,0.103,0.128,-0.9906,0.769,25.625,37.003484,25.051394,25.625,531,1435,1.98532,27,5.358188
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,0.103,0.006,0.9969,0.892,22.906977,32.791878,22.279542,22.906977,323,985,1.874396,28,5.193909
