# Web Scraping and Creating files

In [None]:
import pandas as pd
import numpy as np
# Load the input sheet; the scraping loop in this cell reads its 'URL' and 'URL_ID' columns.
df = pd.read_csv('C:\\Users\\hp\\OneDrive\\Desktop\\Input.xlsx - Sheet1.csv')

import requests
from bs4 import BeautifulSoup
import re
from tqdm.notebook import tqdm
import os

# Download every URL listed in `df` and save its title and article text to
# "<URL_ID>.txt" inside the analysis folder.

# Values that are identical for every page are set up once, before the loop.
base_directory = "C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files"
# open(..., 'w') does not create missing folders, so make sure this one exists.
os.makedirs(base_directory, exist_ok=True)
headings = ["entry-title", "tdb-title-text"]
exclude_class = 'wp-block-preformatted'
article_class_1 = "td-post-content tagdiv-type"
article_class_2 = "tdb-block-inner td-fix-index"
# Without a timeout requests can wait on an unresponsive server forever,
# which would stall the whole run on a single bad URL.
request_timeout = 30  # seconds

for index, row in tqdm(df.iterrows(), total=len(df), desc="Completion"):
    url = row['URL']
    # Collect HTML data from this page and parse it
    response = requests.get(url, timeout=request_timeout)
    parsed_content = BeautifulSoup(response.content, 'html.parser')

    file_path = os.path.join(base_directory, f"{row['URL_ID']}.txt")
    with open(file_path, 'w', encoding="utf-8") as file:
        # Title: use the first <h1> whose class matches one of the known layouts.
        found_heading = None
        for heading_class in headings:
            heading = parsed_content.find("h1", {"class": heading_class})
            if heading:
                found_heading = heading
                break

        if found_heading:
            file.write(found_heading.get_text() + '\n')

        # Article body, first layout. Elements carrying `exclude_class` are
        # removed from the tree before the text is extracted.
        found_article_text_1 = parsed_content.find("div", {"class": article_class_1})
        if found_article_text_1:
            for element in found_article_text_1.find_all(class_=exclude_class):
                element.extract()
            file.write(found_article_text_1.get_text())

        # Article body, second layout: the 15th "tdb-block-inner td-fix-index"
        # div (index 14) is taken as the article container.
        found_article_texts_2 = parsed_content.find_all("div", {"class": article_class_2})
        if len(found_article_texts_2) >= 15:
            found_article_text_2 = found_article_texts_2[14]
            if found_article_text_2:
                for element in found_article_text_2.find_all(class_=exclude_class):
                    element.extract()
                file.write(found_article_text_2.get_text())

    print(f"File {index+1} created")

# Data Analysis

In [None]:
# Removing Stop Words
import nltk
from nltk.tokenize import word_tokenize
import os
from tqdm.notebook import tqdm

# Name suffixes of the stop-word lists: handle_stop_words() reads each entry
# <name> from the file "StopWords_<name>.txt".
stop_words = ["Generic", "Auditor", "Currencies", "DatesandNumbers", "GenericLong", "Names", "Geographic"]

def handle_stop_words(file_name):
    """Strip the custom stop-word lists from one scraped article.

    Reads ``file_name`` from the "Analysis Files" folder, removes every token
    that appears (compared in upper case) in any of the ``StopWords_*.txt``
    lists named by the module-level ``stop_words``, and writes the remaining
    tokens, joined by single spaces, to a file of the same name in
    "Analysis Files Modified". The source file itself is not changed.

    Args:
        file_name: Name of the text file inside the "Analysis Files" folder.
    """
    source_path = f"C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files\\{file_name}"
    with open(source_path, 'r', encoding='utf-8') as source_file:
        filtered_sentence = source_file.read()

    # Apply the lists one after another; each pass re-tokenises the output of
    # the previous pass.
    for sw in stop_words:
        StopWords_list = f"C:\\Users\\hp\\OneDrive\\Desktop\\Stopwords\\StopWords_{sw}.txt"
        with open(StopWords_list, 'r', encoding='latin-1') as StopWords_file:
            stop_words_content = StopWords_file.read().upper()
        stop_words_tokens = set(word_tokenize(stop_words_content))

        words = word_tokenize(filtered_sentence)
        filtered_sentence = ' '.join(
            word for word in words if word.upper() not in stop_words_tokens
        )

    # Write the cleaned text next to the originals, in a separate folder.
    # open(..., 'w') does not create missing folders, so create it if needed.
    modified_dir = "C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files Modified"
    os.makedirs(modified_dir, exist_ok=True)
    new_file_path = f"{modified_dir}\\{file_name}"
    with open(new_file_path, 'w', encoding='utf-8') as file:
        file.write(filtered_sentence)

# Clean every scraped file in the analysis folder.
folder_path = "C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files"
file_list = os.listdir(folder_path)
# enumerate() has no length, so tqdm needs an explicit total to show progress
# as a fraction of the work instead of a bare counter.
for index, file in tqdm(enumerate(file_list), total=len(file_list), desc="Completion"):
    handle_stop_words(file)
    print(f"File {index+1} modified")

In [None]:
# Calculating Positive Score
import nltk
import os
import pandas as pd
from tqdm.notebook import tqdm

def positive_score_calc(file_name):
    """Count the tokens of a cleaned article that appear in the positive word list.

    The comparison is an exact, case-sensitive match between each NLTK token
    and a line of ``positive-words.txt``.

    Args:
        file_name: Name of the text file inside "Analysis Files Modified".

    Returns:
        Number of matching tokens (repeated words are counted each time).
    """
    article_path = f"C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files Modified\\{file_name}"
    with open(article_path, 'r', encoding='utf-8') as article_file:
        article_text = article_file.read()

    # One dictionary entry per line.
    lexicon_path = "C:\\Users\\hp\\OneDrive\\Desktop\\positive-words.txt"
    with open(lexicon_path, 'r', encoding='utf-8') as lexicon_file:
        lexicon = set(lexicon_file.read().splitlines())

    matches = 0
    for token in nltk.word_tokenize(article_text):
        if token in lexicon:
            matches += 1
    return matches

excel_file_path = "C:\\Users\\hp\\OneDrive\\Desktop\\Output Data Structure (1).xlsx"
output_df = pd.read_excel(excel_file_path)

folder_path = "C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files Modified"

# Pair every spreadsheet row with its own text file through URL_ID. Using the
# position in os.listdir() as the row number is unsafe: the listing order is
# arbitrary, and one missing file shifts every later score onto the wrong row.
for index, row in tqdm(output_df.iterrows(), total=len(output_df), desc="Completion"):
    file = f"{row['URL_ID']}.txt"
    if not os.path.isfile(os.path.join(folder_path, file)):
        # No cleaned text exists for this URL_ID; leave its score empty.
        continue

    # Calculate positive score for the current file
    output_df.loc[index, 'POSITIVE SCORE'] = positive_score_calc(file)

output_df.to_excel(excel_file_path, index=False)
output_df

In [None]:
# Negative Score
import nltk
import os
from tqdm.notebook import tqdm

def negative_score_calc(file_name):
    """Count the tokens of a cleaned article that appear in the negative word list.

    The comparison is an exact, case-sensitive match between each NLTK token
    and a line of ``negative-words.txt`` (read as latin-1).

    Args:
        file_name: Name of the text file inside "Analysis Files Modified".

    Returns:
        Number of matching tokens (repeated words are counted each time).
    """
    article_path = f"C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files Modified\\{file_name}"
    with open(article_path, 'r', encoding='utf-8') as article_file:
        article_text = article_file.read()

    # One dictionary entry per line.
    lexicon_path = "C:\\Users\\hp\\OneDrive\\Desktop\\negative-words.txt"
    with open(lexicon_path, 'r', encoding='latin-1') as lexicon_file:
        lexicon = set(lexicon_file.read().splitlines())

    matches = 0
    for token in nltk.word_tokenize(article_text):
        if token in lexicon:
            matches += 1
    return matches

excel_file_path = "C:\\Users\\hp\\OneDrive\\Desktop\\Output Data Structure (1).xlsx"
output_df = pd.read_excel(excel_file_path)

folder_path = "C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files Modified"

# Pair every spreadsheet row with its own text file through URL_ID. Using the
# position in os.listdir() as the row number is unsafe: the listing order is
# arbitrary, and one missing file shifts every later score onto the wrong row.
for index, row in tqdm(output_df.iterrows(), total=len(output_df), desc="Completion"):
    file = f"{row['URL_ID']}.txt"
    if not os.path.isfile(os.path.join(folder_path, file)):
        # No cleaned text exists for this URL_ID; leave its score empty.
        continue

    # Calculate negative score for the current file
    output_df.loc[index, 'NEGATIVE SCORE'] = negative_score_calc(file)

output_df.to_excel(excel_file_path, index=False)
output_df

In [None]:
# Polarity Score
import nltk
import os
from tqdm.notebook import tqdm

excel_file_path = "C:\\Users\\hp\\OneDrive\\Desktop\\Output Data Structure (1).xlsx"
output_df = pd.read_excel(excel_file_path)

# Polarity depends only on columns already in the sheet, so it is computed for
# every row at once instead of looping over a directory listing whose length
# has nothing to do with the number of rows.
pos = output_df['POSITIVE SCORE']
neg = output_df['NEGATIVE SCORE']
# The small constant keeps the denominator non-zero when both scores are 0.
output_df['POLARITY SCORE'] = (pos - neg) / ((pos + neg) + 0.000001)
output_df.to_excel(excel_file_path, index=False)
output_df

In [None]:
# Subjectivity Score
import nltk
import os
from tqdm.notebook import tqdm
from nltk.corpus import stopwords
import string

def total_word_counter(file_name):
    """Return the number of whitespace-separated words in a cleaned article.

    Args:
        file_name: Name of the text file inside "Analysis Files Modified".
    """
    source = f"C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files Modified\\{file_name}"
    with open(source, 'r', encoding='utf-8') as handle:
        # str.split() with no argument splits on any run of whitespace.
        return len(handle.read().split())

def remove_stopwords_and_punctuation(file_name):
    """Return a cleaned article as lower-case words joined by single spaces.

    NLTK English stop words and tokens made up only of punctuation characters
    are dropped.

    Args:
        file_name: Name of the text file inside "Analysis Files Modified".

    Returns:
        The remaining tokens, lower-cased and space-separated.
    """
    file_path = f"C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files Modified\\{file_name}"
    stop_words = set(stopwords.words('english'))
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # string.punctuation is a str, so `token in string.punctuation` would be a
    # substring test and would let multi-character tokens such as "``", "''",
    # "..." or "--" through. Test the characters individually instead.
    punctuation_chars = set(string.punctuation)

    filtered_words = []
    for word in nltk.word_tokenize(text):
        lowered = word.lower()
        if lowered in stop_words:
            continue
        if all(char in punctuation_chars for char in lowered):
            continue
        filtered_words.append(lowered)

    return ' '.join(filtered_words)
    
excel_file_path = "C:\\Users\\hp\\OneDrive\\Desktop\\Output Data Structure (1).xlsx"
output_df = pd.read_excel(excel_file_path)

folder_path = "C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files Modified"

# Pair every spreadsheet row with its own text file through URL_ID. Using the
# position in os.listdir() as the row number is unsafe: the listing order is
# arbitrary, and one missing file shifts every later score onto the wrong row.
for index, row in tqdm(output_df.iterrows(), total=len(output_df), desc="Completion"):
    file = f"{row['URL_ID']}.txt"
    if not os.path.isfile(os.path.join(folder_path, file)):
        # No cleaned text exists for this URL_ID; leave its score empty.
        continue

    # Calculate subjectivity score for the current file
    pos = output_df.loc[index, 'POSITIVE SCORE']
    neg = output_df.loc[index, 'NEGATIVE SCORE']
    # NOTE(review): this loop used to call remove_stopwords_and_punctuation()
    # and throw the result away, so it never influenced the score. The call is
    # removed; confirm whether the denominator should be the cleaned word count.
    total_words = total_word_counter(file)
    # The small constant keeps the denominator non-zero for an empty file.
    subjectivity_Score = (pos+neg)/((total_words)+0.000001)
    output_df.loc[index, 'SUBJECTIVITY SCORE'] = subjectivity_Score
output_df.to_excel(excel_file_path, index=False)
output_df

In [None]:
# Average Sentence Length
import nltk
import os
from tqdm.notebook import tqdm

def total_sentences_counter(file_name):
    """Return the number of sentences NLTK finds in a cleaned article.

    Args:
        file_name: Name of the text file inside "Analysis Files Modified".
    """
    source = f"C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files Modified\\{file_name}"
    with open(source, 'r', encoding='utf-8') as handle:
        article = handle.read()
    return len(nltk.sent_tokenize(article))

def total_word_counter(file_name):
    """Return the number of whitespace-separated words in a cleaned article.

    Args:
        file_name: Name of the text file inside "Analysis Files Modified".
    """
    source = f"C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files Modified\\{file_name}"
    with open(source, 'r', encoding='utf-8') as handle:
        # str.split() with no argument splits on any run of whitespace.
        return len(handle.read().split())

def remove_stopwords_and_punctuation(file_name):
    """Return a cleaned article as lower-case words joined by single spaces.

    NLTK English stop words and tokens made up only of punctuation characters
    are dropped.

    Args:
        file_name: Name of the text file inside "Analysis Files Modified".

    Returns:
        The remaining tokens, lower-cased and space-separated.
    """
    # Imported locally because this cell does not import them itself; without
    # this the function only works if an earlier cell happened to run first.
    import string
    from nltk.corpus import stopwords

    file_path = f"C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files Modified\\{file_name}"
    stop_words = set(stopwords.words('english'))
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # string.punctuation is a str, so `token in string.punctuation` would be a
    # substring test and would let multi-character tokens such as "``", "''",
    # "..." or "--" through. Test the characters individually instead.
    punctuation_chars = set(string.punctuation)

    filtered_words = []
    for word in nltk.word_tokenize(text):
        lowered = word.lower()
        if lowered in stop_words:
            continue
        if all(char in punctuation_chars for char in lowered):
            continue
        filtered_words.append(lowered)

    return ' '.join(filtered_words)

excel_file_path = "C:\\Users\\hp\\OneDrive\\Desktop\\Output Data Structure (1).xlsx"
output_df = pd.read_excel(excel_file_path)

folder_path = "C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files Modified"

# Pair every spreadsheet row with its own text file through URL_ID. Using the
# position in os.listdir() as the row number is unsafe: the listing order is
# arbitrary, and one missing file shifts every later value onto the wrong row.
for index, row in tqdm(output_df.iterrows(), total=len(output_df), desc="Completion"):
    file = f"{row['URL_ID']}.txt"
    if not os.path.isfile(os.path.join(folder_path, file)):
        # No cleaned text exists for this URL_ID; leave its value empty.
        continue

    # Calculate Average Sentence length for the current file.
    # NOTE(review): this loop used to call remove_stopwords_and_punctuation()
    # and throw the result away; the no-op call is removed.
    total_words = total_word_counter(file)
    total_sentences = total_sentences_counter(file)
    if total_sentences == 0:
        # An empty file has no sentences; leave the cell empty.
        continue
    output_df.loc[index, 'AVG SENTENCE LENGTH'] = total_words / total_sentences
output_df.to_excel(excel_file_path, index=False)
output_df

In [None]:
# Syllable Per Word
import nltk
import os
import pandas as pd
from tqdm.notebook import tqdm
from nltk.corpus import cmudict

def count_syllables(word):
    """Return the syllable count of ``word`` according to the CMU dictionary.

    A syllable is counted for every phoneme whose last character is a digit
    (the stress marker). When a word has several pronunciations the largest
    count is used.

    Args:
        word: The word to look up; matching is case-insensitive.

    Returns:
        The syllable count, or 0 when the word is not in the dictionary.
    """
    # cmudict.dict() builds the whole pronunciation table; doing that once per
    # word makes processing a single article extremely slow. Build it on first
    # use and keep it on the function object.
    pronunciations = getattr(count_syllables, "_pronunciations", None)
    if pronunciations is None:
        pronunciations = cmudict.dict()
        count_syllables._pronunciations = pronunciations

    variants = pronunciations.get(word.lower())
    if not variants:
        return 0
    return max(
        sum(1 for phoneme in variant if phoneme[-1].isdigit())
        for variant in variants
    )

def syllable_count_except_es_ed(word):
    """Return the syllable count of ``word``, or 0 if it ends in "es" or "ed".

    The suffix test is case-insensitive; every other word is passed on to
    ``count_syllables``.
    """
    ends_in_es_or_ed = word.lower().endswith(('es', 'ed'))
    return 0 if ends_in_es_or_ed else count_syllables(word)

def get_syllable_count(file_path):
    """Return the total syllable count of the text stored at ``file_path``.

    The text is split with NLTK's word tokenizer and each token is scored by
    ``syllable_count_except_es_ed``.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        article = handle.read()

    total = 0
    for token in nltk.word_tokenize(article):
        total += syllable_count_except_es_ed(token)
    return total

excel_file_path = "C:\\Users\\hp\\OneDrive\\Desktop\\Output Data Structure (1).xlsx"
output_df = pd.read_excel(excel_file_path)

folder_path = "C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files Modified"

# Iterate through DataFrame rows using iterrows()
for index, row in tqdm(output_df.iterrows(), total=len(output_df), desc="Completion"):
    file_name = f"{row['URL_ID']}.txt"
    file_path = os.path.join(folder_path, file_name)

    # Total syllables in the current file
    syllable_score = get_syllable_count(file_path)

    # The column holds syllables *per word*, so the total has to be divided by
    # the number of words (whitespace-separated, as in total_word_counter).
    with open(file_path, 'r', encoding='utf-8') as text_file:
        word_total = len(text_file.read().split())
    if word_total == 0:
        # Empty file: there is no meaningful ratio, leave the cell empty.
        continue

    # Update the 'SYLLABLE PER WORD' column in DataFrame
    output_df.loc[index, 'SYLLABLE PER WORD'] = syllable_score / word_total
# Save the updated DataFrame to Excel
output_df.to_excel(excel_file_path, index=False)
output_df

In [None]:
# Word Count
import nltk
import os
from tqdm.notebook import tqdm
from nltk.corpus import stopwords
import string

def total_word_counter(file_name):
    """Return the number of whitespace-separated words in a cleaned article.

    Args:
        file_name: Name of the text file inside "Analysis Files Modified".
    """
    source = f"C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files Modified\\{file_name}"
    with open(source, 'r', encoding='utf-8') as handle:
        # str.split() with no argument splits on any run of whitespace.
        return len(handle.read().split())

def remove_stopwords_and_punctuation(file_name):
    """Return a cleaned article as lower-case words joined by single spaces.

    NLTK English stop words and tokens made up only of punctuation characters
    are dropped.

    Args:
        file_name: Name of the text file inside "Analysis Files Modified".

    Returns:
        The remaining tokens, lower-cased and space-separated.
    """
    file_path = f"C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files Modified\\{file_name}"
    stop_words = set(stopwords.words('english'))
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # string.punctuation is a str, so `token in string.punctuation` would be a
    # substring test and would let multi-character tokens such as "``", "''",
    # "..." or "--" through. Test the characters individually instead.
    punctuation_chars = set(string.punctuation)

    filtered_words = []
    for word in nltk.word_tokenize(text):
        lowered = word.lower()
        if lowered in stop_words:
            continue
        if all(char in punctuation_chars for char in lowered):
            continue
        filtered_words.append(lowered)

    return ' '.join(filtered_words)
    
excel_file_path = "C:\\Users\\hp\\OneDrive\\Desktop\\Output Data Structure (1).xlsx"
output_df = pd.read_excel(excel_file_path)

folder_path = "C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files Modified"

# Pair every spreadsheet row with its own text file through URL_ID. Using the
# position in os.listdir() as the row number is unsafe: the listing order is
# arbitrary, and one missing file shifts every later count onto the wrong row.
for index, row in tqdm(output_df.iterrows(), total=len(output_df), desc="Completion"):
    file = f"{row['URL_ID']}.txt"
    if not os.path.isfile(os.path.join(folder_path, file)):
        # No cleaned text exists for this URL_ID; leave its count empty.
        continue

    # Calculate total word count for the current file.
    # remove_stopwords_and_punctuation() returns the cleaned text and does not
    # modify the file, so the count must be taken from its return value.
    # Previously the result was discarded and the raw file was counted.
    cleaned_text = remove_stopwords_and_punctuation(file)
    total_words = len(cleaned_text.split())
    output_df.loc[index, 'WORD COUNT'] = total_words
output_df.to_excel(excel_file_path, index=False)
output_df

In [None]:
# Average Word Length
import nltk
import os
import pandas as pd
from tqdm.notebook import tqdm
from nltk.corpus import cmudict

def count_characters(word):
    """Return how many characters of ``word`` are alphabetic."""
    letters = [char for char in word if char.isalpha()]
    return len(letters)

def get_characters_count(text_content):
    """Return the number of alphabetic characters across all tokens of a text.

    The text is split with NLTK's word tokenizer and each token is measured
    with ``count_characters``.
    """
    total = 0
    for token in nltk.word_tokenize(text_content):
        total += count_characters(token)
    return total

excel_file_path = "C:\\Users\\hp\\OneDrive\\Desktop\\Output Data Structure (1).xlsx"
output_df = pd.read_excel(excel_file_path)

folder_path = "C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files Modified"

# Read the content of each file outside the loop
file_contents = {}
for file_name in tqdm(output_df['URL_ID'], desc="Reading Files"):
    file_path = os.path.join(folder_path, f"{file_name}.txt")
    with open(file_path, 'r', encoding='utf-8') as file:
        file_contents[file_name] = file.read()

# Iterate through DataFrame rows using iterrows()
for index, row in tqdm(output_df.iterrows(), total=len(output_df), desc="Completion"):
    text_content = file_contents[row['URL_ID']]

    # The column holds an *average* word length, so the total character count
    # has to be divided by the number of words (whitespace-separated, as in
    # total_word_counter).
    word_total = len(text_content.split())
    if word_total == 0:
        # Empty file: there is no meaningful average, leave the cell empty.
        continue
    characters_count = get_characters_count(text_content)

    # Update the 'AVG WORD LENGTH' column in DataFrame
    output_df.loc[index, 'AVG WORD LENGTH'] = characters_count / word_total

# Save the updated DataFrame to Excel
output_df.to_excel(excel_file_path, index=False)
# Only the displayed copy loses the helper column; errors="ignore" keeps this
# from raising KeyError when the sheet has no 'CHARACTERS COUNT' column.
output_df = output_df.drop(columns=["CHARACTERS COUNT"], errors="ignore")
output_df

In [None]:
# Personal Pronouns
import re
import os
from tqdm.notebook import tqdm

def count_personal_pronouns(file_name):
    """Return how many personal pronouns occur in a cleaned article.

    Counts whole-word, case-insensitive matches of I, me, you, he, him, she,
    her, it, we, they and them. "us" is not in the pattern, so neither the
    pronoun nor the abbreviation "US" is counted.

    Args:
        file_name: Name of the text file inside "Analysis Files Modified".
    """
    source = f"C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files Modified\\{file_name}"
    with open(source, 'r', encoding='utf-8') as handle:
        article = handle.read()

    matcher = re.compile(r'\b(?:I|me|you|he|him|she|her|it|we|they|them)\b', re.IGNORECASE)
    return sum(1 for _ in matcher.finditer(article))

excel_file_path = "C:\\Users\\hp\\OneDrive\\Desktop\\Output Data Structure (1).xlsx"
output_df = pd.read_excel(excel_file_path)

folder_path = "C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files Modified"

# Pair every spreadsheet row with its own text file through URL_ID. Using the
# position in os.listdir() as the row number is unsafe: the listing order is
# arbitrary, and one missing file shifts every later count onto the wrong row.
for index, row in tqdm(output_df.iterrows(), total=len(output_df), desc="Completion"):
    file = f"{row['URL_ID']}.txt"
    if not os.path.isfile(os.path.join(folder_path, file)):
        # No cleaned text exists for this URL_ID; leave its count empty.
        continue
    output_df.loc[index, 'PERSONAL PRONOUNS'] = count_personal_pronouns(file)
output_df.to_excel(excel_file_path, index=False)
output_df

In [None]:
# Average number of words per sentence
import nltk
import os
from tqdm.notebook import tqdm

def total_sentences_counter(file_name):
    """Return the number of sentences NLTK finds in a cleaned article.

    Args:
        file_name: Name of the text file inside "Analysis Files Modified".
    """
    source = f"C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files Modified\\{file_name}"
    with open(source, 'r', encoding='utf-8') as handle:
        article = handle.read()
    return len(nltk.sent_tokenize(article))

def total_word_counter(file_name):
    """Return the number of whitespace-separated words in a cleaned article.

    Args:
        file_name: Name of the text file inside "Analysis Files Modified".
    """
    source = f"C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files Modified\\{file_name}"
    with open(source, 'r', encoding='utf-8') as handle:
        # str.split() with no argument splits on any run of whitespace.
        return len(handle.read().split())

def remove_stopwords_and_punctuation(file_name):
    """Return a cleaned article as lower-case words joined by single spaces.

    NLTK English stop words and tokens made up only of punctuation characters
    are dropped.

    Args:
        file_name: Name of the text file inside "Analysis Files Modified".

    Returns:
        The remaining tokens, lower-cased and space-separated.
    """
    # Imported locally because this cell does not import them itself; without
    # this the function only works if an earlier cell happened to run first.
    import string
    from nltk.corpus import stopwords

    file_path = f"C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files Modified\\{file_name}"
    stop_words = set(stopwords.words('english'))
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # string.punctuation is a str, so `token in string.punctuation` would be a
    # substring test and would let multi-character tokens such as "``", "''",
    # "..." or "--" through. Test the characters individually instead.
    punctuation_chars = set(string.punctuation)

    filtered_words = []
    for word in nltk.word_tokenize(text):
        lowered = word.lower()
        if lowered in stop_words:
            continue
        if all(char in punctuation_chars for char in lowered):
            continue
        filtered_words.append(lowered)

    return ' '.join(filtered_words)

excel_file_path = "C:\\Users\\hp\\OneDrive\\Desktop\\Output Data Structure (1).xlsx"
output_df = pd.read_excel(excel_file_path)

folder_path = "C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files Modified"

# Pair every spreadsheet row with its own text file through URL_ID. Using the
# position in os.listdir() as the row number is unsafe: the listing order is
# arbitrary, and one missing file shifts every later value onto the wrong row.
for index, row in tqdm(output_df.iterrows(), total=len(output_df), desc="Completion"):
    file = f"{row['URL_ID']}.txt"
    if not os.path.isfile(os.path.join(folder_path, file)):
        # No cleaned text exists for this URL_ID; leave its value empty.
        continue

    # Calculate the average number of words per sentence for the current file.
    # NOTE(review): this loop used to call remove_stopwords_and_punctuation()
    # and throw the result away; the no-op call is removed.
    total_words = total_word_counter(file)
    total_sentences = total_sentences_counter(file)
    if total_sentences == 0:
        # An empty file has no sentences; leave the cell empty.
        continue
    output_df.loc[index, 'AVG NUMBER OF WORDS PER SENTENCE'] = total_words / total_sentences
output_df.to_excel(excel_file_path, index=False)
output_df

In [None]:
# Complex word count
import nltk
from nltk.corpus import cmudict
import re
import random

def count_syllables(word):
    """Return the syllable count of ``word`` according to the CMU dictionary.

    A syllable is counted for every phoneme whose last character is a digit
    (the stress marker). When a word has several pronunciations the largest
    count is used.

    Args:
        word: The word to look up; matching is case-insensitive.

    Returns:
        The syllable count, or 0 when the word is not in the dictionary.
    """
    # cmudict.dict() builds the whole pronunciation table; doing that once per
    # word makes processing a single article extremely slow. Build it on first
    # use and keep it on the function object.
    pronunciations = getattr(count_syllables, "_pronunciations", None)
    if pronunciations is None:
        pronunciations = cmudict.dict()
        count_syllables._pronunciations = pronunciations

    variants = pronunciations.get(word.lower())
    if not variants:
        return 0
    return max(
        sum(1 for phoneme in variant if phoneme[-1].isdigit())
        for variant in variants
    )

def count_words_with_syllables(text_content, min_syllables=2):
    """Count the tokens of a text that have more than ``min_syllables`` syllables.

    Tokens ending in "es" or "ed" are skipped before syllables are counted.
    The comparison is strict, so with the default of 2 only words of three or
    more syllables are counted.

    Args:
        text_content: The text to analyse.
        min_syllables: Threshold a token's syllable count must exceed.

    Returns:
        Number of tokens above the threshold.
    """
    suffix = re.compile(r'(es|ed)$')
    complex_total = 0
    for token in nltk.word_tokenize(text_content):
        lowered = token.lower()
        if suffix.search(lowered):
            continue
        if count_syllables(lowered) > min_syllables:
            complex_total += 1
    return complex_total

excel_file_path = "C:\\Users\\hp\\OneDrive\\Desktop\\Output Data Structure (1).xlsx"
output_df = pd.read_excel(excel_file_path)

folder_path = "C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files Modified"

# Compute the real complex-word count for every row. The previous version
# wrote a random placeholder for the first file only and then stopped, which
# also corrupted every metric derived from 'COMPLEX WORD COUNT'.
# Rows are paired with their text file through URL_ID rather than through the
# position in os.listdir(), whose order is arbitrary.
for index, row in tqdm(output_df.iterrows(), total=len(output_df), desc="Completion"):
    file_path = os.path.join(folder_path, f"{row['URL_ID']}.txt")
    if not os.path.isfile(file_path):
        # No cleaned text exists for this URL_ID; leave its count empty.
        continue
    with open(file_path, 'r', encoding='utf-8') as text_file:
        text_content = text_file.read()
    output_df.loc[index, 'COMPLEX WORD COUNT'] = count_words_with_syllables(text_content)
output_df.to_excel(excel_file_path, index=False)
output_df

In [None]:
# Percentage of Complex Words
import nltk
import os
from tqdm.notebook import tqdm

def total_word_counter(file_name):
    """Return the number of whitespace-separated words in a cleaned article.

    Args:
        file_name: Name of the text file inside "Analysis Files Modified".
    """
    source = f"C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files Modified\\{file_name}"
    with open(source, 'r', encoding='utf-8') as handle:
        # str.split() with no argument splits on any run of whitespace.
        return len(handle.read().split())

def remove_stopwords_and_punctuation(file_name):
    """Return a cleaned article as lower-case words joined by single spaces.

    NLTK English stop words and tokens made up only of punctuation characters
    are dropped.

    Args:
        file_name: Name of the text file inside "Analysis Files Modified".

    Returns:
        The remaining tokens, lower-cased and space-separated.
    """
    # Imported locally because this cell does not import them itself; without
    # this the function only works if an earlier cell happened to run first.
    import string
    from nltk.corpus import stopwords

    file_path = f"C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files Modified\\{file_name}"
    stop_words = set(stopwords.words('english'))
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # string.punctuation is a str, so `token in string.punctuation` would be a
    # substring test and would let multi-character tokens such as "``", "''",
    # "..." or "--" through. Test the characters individually instead.
    punctuation_chars = set(string.punctuation)

    filtered_words = []
    for word in nltk.word_tokenize(text):
        lowered = word.lower()
        if lowered in stop_words:
            continue
        if all(char in punctuation_chars for char in lowered):
            continue
        filtered_words.append(lowered)

    return ' '.join(filtered_words)

excel_file_path = "C:\\Users\\hp\\OneDrive\\Desktop\\Output Data Structure (1).xlsx"
output_df = pd.read_excel(excel_file_path)

folder_path = "C:\\Users\\hp\\OneDrive\\Desktop\\Analysis Files Modified"

# Pair every spreadsheet row with its own text file through URL_ID. Using the
# position in os.listdir() as the row number is unsafe: the listing order is
# arbitrary, and one missing file shifts every later value onto the wrong row.
for index, row in tqdm(output_df.iterrows(), total=len(output_df), desc="Completion"):
    file = f"{row['URL_ID']}.txt"
    if not os.path.isfile(os.path.join(folder_path, file)):
        # No cleaned text exists for this URL_ID; leave its value empty.
        continue

    # Calculate the share of complex words for the current file.
    # NOTE(review): this loop used to call remove_stopwords_and_punctuation()
    # and throw the result away; the no-op call is removed.
    total_words = total_word_counter(file)
    if total_words == 0:
        # Checked explicitly: a value read from the DataFrame is typically a
        # NumPy scalar, and dividing one by zero yields inf/nan with a warning
        # instead of raising ZeroDivisionError.
        continue
    complex_words = output_df.loc[index, 'COMPLEX WORD COUNT']
    output_df.loc[index, 'PERCENTAGE OF COMPLEX WORDS'] = complex_words/total_words
output_df.to_excel(excel_file_path, index=False)
output_df

In [None]:
# FOG Index
import nltk
import os
from tqdm.notebook import tqdm
    
excel_file_path = "C:\\Users\\hp\\OneDrive\\Desktop\\Output Data Structure (1).xlsx"
output_df = pd.read_excel(excel_file_path)

# The FOG index depends only on columns already in the sheet, so it is
# computed for every row at once instead of looping over a directory listing
# whose length has nothing to do with the number of rows.
avg_sen_len = output_df['AVG SENTENCE LENGTH']
per_complex_words = output_df['PERCENTAGE OF COMPLEX WORDS']
output_df['FOG INDEX'] = 0.4*(avg_sen_len+per_complex_words)
output_df.to_excel(excel_file_path, index=False)
output_df