In [1]:
import os

In [2]:
%pwd

'd:\\projects\\Data-Extraction-and-NLP\\research'

In [3]:
# This notebook starts in research/, while the project's imports (src....) and
# config paths are relative to the repository root one level up.
# Only step up while still inside research/ so that re-running this cell does
# not climb one directory higher on every execution.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'd:\\projects\\Data-Extraction-and-NLP'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class PostCleaningConfig:
    """Immutable settings for the post-cleaning (scoring) stage.

    Instances are created by ``ConfigurationManager.get_post_cleaning_config``
    and consumed by ``PostCleaning``. ``frozen=True`` makes the fields
    read-only after construction.
    """
    # Directory that holds the cleaned article text files read by
    # PostCleaning.calculate.
    root_dir: Path
    # Text file with one positive-sentiment word per line.
    positive_file_path: Path
    # Text file with one negative-sentiment word per line.
    negative_file_path: Path

In [6]:
from src.DataExtractionAndNLP.constants import *
from src.DataExtractionAndNLP.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Loads the project configuration and builds per-stage config objects.

    Constructing an instance reads the YAML file and makes sure the
    ``artifacts_root`` directory named in it exists.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH):
        """Read the configuration file and create the artifacts root directory.

        Args:
            config_filepath: Location of the YAML configuration file.
                Defaults to the project-wide ``CONFIG_FILE_PATH`` constant.
        """
        self.config = read_yaml(config_filepath)
        create_directories([self.config.artifacts_root])

    def get_post_cleaning_config(self) -> PostCleaningConfig:
        """Build the settings object for the post-cleaning stage.

        Returns:
            A ``PostCleaningConfig`` populated from the ``post_cleaning``
            section of the loaded configuration.
        """
        section = self.config.post_cleaning

        # PostCleaningConfig declares its fields as Path. The values coming
        # out of the YAML config are presumably plain strings (TODO confirm
        # against read_yaml), so coerce them here to match the declared types.
        # Path() is a no-op for values that already are Path objects.
        return PostCleaningConfig(
            root_dir=Path(section.root_dir),
            positive_file_path=Path(section.positive_file_path),
            negative_file_path=Path(section.negative_file_path),
        )

In [8]:
from src.DataExtractionAndNLP.utils.common import get_name
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

class PostCleaning:
    """Computes sentiment and readability metrics for a cleaned article.

    The positive/negative word lists and the directory of cleaned text files
    are taken from the ``PostCleaningConfig`` passed at construction time.
    """

    def __init__(self, config: "PostCleaningConfig"):
        """Store the stage configuration.

        Args:
            config: Settings providing ``root_dir``, ``positive_file_path``
                and ``negative_file_path``.
        """
        self.config = config

    @staticmethod
    def _read_word_list(file_path):
        """Return the set of non-empty, lower-cased lines of a word-list file."""
        # latin-1 maps every possible byte to a character, so decoding cannot
        # fail regardless of how the word list was saved.
        with open(file_path, 'r', encoding='latin-1') as file:
            stripped_lines = (line.strip() for line in file)
            # Tokens are lower-cased before lookup in calculate(), so the
            # lexicon is lower-cased too; blank lines are dropped.
            return {word.lower() for word in stripped_lines if word}

    def create_sets(self):
        """Load the positive and negative sentiment word lists.

        Returns:
            A ``(positive_set, negative_set)`` tuple of lower-cased words.
        """
        positive_set = self._read_word_list(self.config.positive_file_path)
        negative_set = self._read_word_list(self.config.negative_file_path)

        return positive_set, negative_set

    def calculate(self, metrics, data, positive_set, negative_set):
        """Add sentiment and readability scores for one article to ``metrics``.

        Args:
            metrics: Dict that must already contain ``'complex_word_count'``
                and ``'avg_sentence_length'``. It is updated in place.
            data: Article descriptor handed to ``get_name`` to obtain the
                file name of the cleaned text inside ``config.root_dir``.
            positive_set: Lower-cased positive words.
            negative_set: Lower-cased negative words.

        Returns:
            The same ``metrics`` dict, extended with ``positive_score``,
            ``negative_score``, ``polarity_score``, ``subjectivity_score``,
            ``word_count``, ``percentage_complex_words``, ``fog_index`` and
            ``avg_words_per_sentence``.

        Raises:
            KeyError: If ``metrics`` lacks one of the required keys.
            OSError: If the cleaned text file cannot be opened.
        """
        filename = get_name(data)
        output_file = os.path.join(self.config.root_dir, filename)

        # Read the whole file first so the handle is closed before any
        # tokenising work starts.
        with open(output_file, 'r', encoding='utf-8') as file:
            cleaned_text = file.read()

        # Tokenize the cleaned text
        tokens = word_tokenize(cleaned_text)
        word_count = len(tokens)
        sentence_count = len(sent_tokenize(cleaned_text))

        # Calculate positive and negative scores (both are non-negative counts)
        lowered_tokens = [token.lower() for token in tokens]
        positive_score = sum(1 for word in lowered_tokens if word in positive_set)
        negative_score = sum(1 for word in lowered_tokens if word in negative_set)

        # Calculate polarity and subjectivity scores; the small epsilon keeps
        # the denominators non-zero when there are no matches or no words.
        polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)
        subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)

        # Calculating Percentage of Complex Words (stored as a 0-1 ratio);
        # an empty document yields 0.0 instead of a ZeroDivisionError.
        complex_word_count = metrics['complex_word_count']
        percentage_complex_words = complex_word_count / word_count if word_count else 0.0

        # Calculating Fog Index
        avg_sentence_length = metrics['avg_sentence_length']
        fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

        # Calculating Average Words Per Sentence
        # NOTE(review): if the cleaning step removed sentence punctuation,
        # sent_tokenize sees a single sentence and this equals word_count.
        # Confirm against the upstream cleaning stage.
        avg_words_per_sentence = word_count / sentence_count if sentence_count else 0.0

        metrics.update({
            "positive_score": positive_score,
            "negative_score": negative_score,
            "polarity_score": polarity_score,
            "subjectivity_score": subjectivity_score,
            "word_count": word_count,
            "percentage_complex_words": percentage_complex_words,
            "fog_index": fog_index,
            "avg_words_per_sentence": avg_words_per_sentence,
        })

        return metrics

In [9]:
# Sample input for a single article. The three entries look like
# [page URL, HTML tag, class attribute of that tag] -- TODO confirm against
# the extraction stage that produces this list.
data = [
    'https://insights.blackcoffer.com/ml-and-ai-based-insurance-premium-model-to-predict-premium-to-be-charged-by-the-insurance-company/',
    'div',
    'td-post-content tagdiv-type',
]

# Join the space-separated class names with dots.
data[2] = ".".join(data[2].split(" "))

# Metrics already produced for this article by an earlier stage; the
# post-cleaning step adds its own scores to this dict.
metrics = {
    'avg_sentence_length': 17.2090395480226,
    'complex_word_count': 323,
    'syllable_count': 5681,
    'personal_pronouns_count': 2,
    'avg_word_length': 5.573539067629678,
}

In [10]:
# Run the post-cleaning stage end to end for the sample article.
# Errors propagate on their own; wrapping this in `try/except: raise e` would
# change nothing except adding a frame to the traceback.
config = ConfigurationManager()
post_cleaning_config = config.get_post_cleaning_config()
post_cleaning = PostCleaning(config=post_cleaning_config)
positive_set, negative_set = post_cleaning.create_sets()
metrics = post_cleaning.calculate(metrics, data, positive_set, negative_set)

[2024-08-02 12:07:09,388: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-08-02 12:07:09,390: INFO: common: created directory at: artifacts]


In [11]:
metrics

{'avg_sentence_length': 17.2090395480226,
 'complex_word_count': 323,
 'syllable_count': 5681,
 'personal_pronouns_count': 2,
 'avg_word_length': 5.573539067629678,
 'positive_score': 138,
 'negative_score': 45,
 'polarity_score': 0.5081967185344441,
 'subjectivity_score': 0.10077092505464155,
 'word_count': 1816,
 'percentage_complex_words': 0.177863436123348,
 'fog_index': 6.9547611936583795,
 'avg_words_per_sentence': 1816.0}