In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords, cmudict, opinion_lexicon
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
import string
from urllib.request import urlopen
from bs4 import BeautifulSoup
import textblob
from textblob import TextBlob

In [None]:
# Load the input table; a later cell reads its 'URL' column.
INPUT_CSV_PATH = "text_analysis.csv"
df = pd.read_csv(INPUT_CSV_PATH)

In [None]:
# Preview only the first rows rather than rendering the whole table.
df.head()

In [None]:
# Download the NLTK resources used by this notebook.
# 'punkt_tab' is requested in addition to 'punkt': NLTK 3.9+ loads the
# word/sentence tokenizer models from 'punkt_tab', while older releases still
# read the legacy 'punkt' package.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'cmudict', 'opinion_lexicon'):
    nltk.download(nltk_resource)

In [None]:
# Shared NLTK helpers: a Porter stemmer and the English stopword set
stemmer = PorterStemmer()
stop_words = {stopword for stopword in stopwords.words('english')}

In [None]:
# Opinion-lexicon vocabularies, held as sets for membership tests
positive_words, negative_words = (
    set(load_lexicon())
    for load_lexicon in (opinion_lexicon.positive, opinion_lexicon.negative)
)

In [None]:
# Function to process text
def process_text(text):
    """Tokenize, clean and stem ``text``.

    The text is word-tokenized; tokens made up solely of punctuation
    characters and tokens whose lowercase form is an English stopword are
    dropped; the remaining tokens are Porter-stemmed.

    Args:
        text: Raw input string.

    Returns:
        The surviving stemmed tokens joined by single spaces.
    """
    punctuation_chars = set(string.punctuation)

    stemmed_tokens = []
    for token in word_tokenize(text):
        # Check character by character: ``token in string.punctuation`` is a
        # substring test, so multi-character tokens such as '...', '--',
        # '``' or "''" were never filtered out.
        if all(char in punctuation_chars for char in token):
            continue
        if token.lower() in stop_words:
            continue
        stemmed_tokens.append(stemmer.stem(token))

    return ' '.join(stemmed_tokens)

In [None]:
# Function to calculate sentiment scores
def calculate_sentiment_scores(processed_text):
    """Count opinion-lexicon hits in a whitespace-separated string.

    Args:
        processed_text: Text whose whitespace-separated tokens are looked up
            verbatim (case-sensitively) in the lexicon sets.

    Returns:
        ``(positive_score, negative_score)``: the number of tokens found in
        ``positive_words`` and in ``negative_words`` respectively. Repeated
        tokens are counted each time they occur.
    """
    tokens = processed_text.split()
    positive_score = sum(1 for token in tokens if token in positive_words)
    negative_score = sum(1 for token in tokens if token in negative_words)
    return positive_score, negative_score

In [None]:
# Function to calculate polarity and subjectivity scores
def calculate_polarity_subjectivity(processed_text):
    """Return TextBlob's sentiment figures for ``processed_text``.

    Args:
        processed_text: String handed to ``TextBlob``.

    Returns:
        ``(polarity, subjectivity)`` as read from the blob's ``sentiment``.
    """
    sentiment = TextBlob(processed_text).sentiment
    return sentiment.polarity, sentiment.subjectivity

In [None]:
# Helper: rough syllable estimate used to decide whether a word is "complex"
def _estimate_syllables(word):
    """Estimate the syllables in ``word`` by counting runs of consecutive vowels.

    'y' is treated as a vowel. Tokens without any vowel yield 0.
    """
    syllables = 0
    previous_was_vowel = False
    for char in word.lower():
        is_vowel = char in 'aeiouy'
        if is_vowel and not previous_was_vowel:
            syllables += 1
        previous_was_vowel = is_vowel
    return syllables


# Function to calculate readability scores
def calculate_readability_scores(processed_text):
    """Compute sentence-length and complex-word readability figures.

    Args:
        processed_text: Text to analyse; it is sentence- and word-tokenized
            with NLTK.

    Returns:
        ``(average_sentence_length, percentage_complex_words, fog_index,
        complex_words_count)`` where

        * ``average_sentence_length`` is words per sentence,
        * ``complex_words_count`` is the number of words estimated to have
          three or more syllables,
        * ``percentage_complex_words`` is ``complex_words_count / words``
          (a 0-1 fraction),
        * ``fog_index`` is ``0.4 * (average_sentence_length +
          percentage_complex_words)``.

        All four values are zero when the text contains no words.
    """
    sentences = sent_tokenize(processed_text)
    words = word_tokenize(processed_text)

    # Empty / whitespace-only text: report zeros instead of dividing by zero.
    if not words or not sentences:
        return 0.0, 0.0, 0.0, 0

    average_sentence_length = len(words) / len(sentences)

    # A "complex" word has three or more syllables. The previous version
    # counted sentiment-lexicon hits here, which is unrelated to word complexity.
    complex_words_count = sum(1 for word in words if _estimate_syllables(word) >= 3)
    percentage_complex_words = complex_words_count / len(words)

    # NOTE(review): the fraction (not a 0-100 percentage) is fed into the
    # formula, as in the original code — confirm this is the intended scale.
    fog_index = 0.4 * (average_sentence_length + percentage_complex_words)

    return average_sentence_length, percentage_complex_words, fog_index, complex_words_count

In [None]:
# Lazily-built CMUdict lookup (word -> list of pronunciations), shared across calls
_CMU_PRONUNCIATIONS = None


def _syllables_in_word(word):
    """Return the syllable count of ``word``.

    Uses the first CMUdict pronunciation when the word is listed; otherwise
    falls back to counting runs of consecutive vowels ('y' included).
    """
    global _CMU_PRONUNCIATIONS
    if _CMU_PRONUNCIATIONS is None:
        # Build the lookup once; constructing it per call is expensive.
        _CMU_PRONUNCIATIONS = cmudict.dict()

    pronunciations = _CMU_PRONUNCIATIONS.get(word.lower())
    if pronunciations:
        # Vowel phonemes end in a stress digit (e.g. 'AH0'): one per syllable.
        return sum(1 for phoneme in pronunciations[0] if phoneme[-1].isdigit())

    syllables = 0
    previous_was_vowel = False
    for char in word.lower():
        is_vowel = char in 'aeiouy'
        if is_vowel and not previous_was_vowel:
            syllables += 1
        previous_was_vowel = is_vowel
    return syllables


# Function to calculate additional attributes
def calculate_additional_attributes(processed_text):
    """Compute word-level statistics for ``processed_text``.

    Args:
        processed_text: Text to analyse; it is sentence- and word-tokenized
            with NLTK.

    Returns:
        ``(average_words_per_sentence, word_count, average_syllables_per_word,
        personal_pronouns, average_word_length)``. ``personal_pronouns``
        counts tokens whose lowercase form is one of i/me/my/mine/we/us/our/ours.
        Every value is zero when the text contains no words.
    """
    words = word_tokenize(processed_text)
    sentences = sent_tokenize(processed_text)
    word_count = len(words)

    # Empty / whitespace-only text: report zeros instead of dividing by zero.
    if word_count == 0 or not sentences:
        return 0.0, 0, 0.0, 0, 0.0

    # Count syllables of the words in *this* text. The previous version walked
    # the whole CMU dictionary and counted digit characters in its headwords,
    # producing a value that did not depend on the input at all.
    syllable_count = sum(_syllables_in_word(word) for word in words)

    average_words_per_sentence = word_count / len(sentences)
    average_syllables_per_word = syllable_count / word_count

    pronoun_forms = {'i', 'me', 'my', 'mine', 'we', 'us', 'our', 'ours'}
    personal_pronouns = sum(1 for word in words if word.lower() in pronoun_forms)

    average_word_length = sum(len(word) for word in words) / word_count

    return average_words_per_sentence, word_count, average_syllables_per_word, personal_pronouns, average_word_length

In [None]:
# Function to process URLs
def process_url(url, timeout=30):
    """Download ``url``, extract its text and compute every metric for it.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``urlopen`` as its ``timeout`` so that an
            unresponsive server cannot block the run indefinitely.

    Returns:
        A 14-tuple ``(processed_text, positive_score, negative_score,
        polarity, subjectivity, average_sentence_length,
        percentage_complex_words, fog_index, complex_words_count,
        average_words_per_sentence, word_count, average_syllables_per_word,
        personal_pronouns, average_word_length)``.
        If anything fails, the error is printed and a tuple of 14 ``None``
        values is returned instead, so one bad URL does not abort the batch.
    """
    try:
        # Use the response as a context manager so the connection is always
        # closed, even when read() raises.
        with urlopen(url, timeout=timeout) as response:
            html = response.read()
        soup = BeautifulSoup(html, 'html.parser')
        text = soup.get_text()

        processed_text = process_text(text)
        positive_score, negative_score = calculate_sentiment_scores(processed_text)
        polarity, subjectivity = calculate_polarity_subjectivity(processed_text)
        # NOTE(review): the readability and pronoun metrics below are computed
        # on the cleaned text, from which process_text has already removed
        # punctuation and stopwords; sentence and pronoun counts are likely
        # understated — confirm whether the raw `text` was intended here.
        (average_sentence_length, percentage_complex_words,
         fog_index, complex_words_count) = calculate_readability_scores(processed_text)
        (average_words_per_sentence, word_count, average_syllables_per_word,
         personal_pronouns, average_word_length) = calculate_additional_attributes(processed_text)

        return (
            processed_text, positive_score, negative_score, polarity, subjectivity,
            average_sentence_length, percentage_complex_words, fog_index, complex_words_count,
            average_words_per_sentence, word_count, average_syllables_per_word,
            personal_pronouns, average_word_length,
        )
    except Exception as e:
        # Deliberate best-effort: report the failure and keep the row, filled with Nones.
        print(f"Error processing URL {url}: {e}")
        return (None,) * 14

In [None]:
# Iterate through URLs and extract text and other attributes.
# Column names are listed in the same order as the tuple returned by process_url.
RESULT_COLUMNS = [
    'Processed_text', 'Positive_score', 'Negative_score', 'polarity', 'subjectivity',
    'average_sentence_length', 'percentage_complex_words', 'fog_index', 'complex_words_count',
    'average_words_per_sentence', 'word_count', 'average_syllables_per_word',
    'personal_pronouns', 'average_word_length',
]

scores = df['URL'].apply(process_url)

# process_url returns a 14-tuple on both its success and failure paths, so the
# former per-column `score is not None` check could never trigger and is dropped.
for position, column in enumerate(RESULT_COLUMNS):
    df[column] = [score[position] for score in scores]

In [None]:
# Preview the first rows of the DataFrame with the computed columns
df.head()

In [None]:
# Persist the enriched table; the row index is written as the first column.
df.to_csv("text_analysis_result.csv", index=True)