In [1]:
import os

In [2]:
# Show the kernel's current working directory before the chdir in the next cell.
%pwd

'c:\\Users\\USER\\Documents\\JupyterNB\\ineuron\\Python Advanced\\Sentiment_Analysis_Project\\research'

In [3]:
# Step up from the notebook folder to its parent so that the `src` package
# imports and the relative config paths used in later cells resolve.
# The guard makes the cell idempotent: an unconditional relative chdir would
# climb one directory higher every time the cell is re-run.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
# Show the working directory again to confirm the result of the chdir above.
%pwd

'c:\\Users\\USER\\Documents\\JupyterNB\\ineuron\\Python Advanced\\Sentiment_Analysis_Project'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataProcessingConfig:
    """Immutable bundle of the file-system locations used by the data-processing stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    """

    # Directory for this stage; ConfigurationManager.get_data_processing_config
    # creates it before building the instance.
    root_dir: Path
    # File the Word2Vec model is saved to after training and loaded from
    # when reviews are vectorised.
    word2vec_modl_file: Path
    # File the final {'X', 'y'} data dictionary is saved to.
    final_data_file: Path

In [6]:
from src.sentimentAnalysis.constants import *
from src.sentimentAnalysis.utils.common_functionality import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Load the project's YAML settings and hand out per-stage config objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Read both YAML files and create the top-level artifacts directory.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_dir = self.config.artifacts_root
        create_directories([artifacts_dir])

    def get_data_processing_config(self) -> DataProcessingConfig:
        """Create the data-processing directory and return that stage's settings.

        The values are passed through exactly as they appear in the
        ``data_processing`` section of the loaded configuration.
        NOTE(review): DataProcessingConfig annotates these fields as Path, but
        nothing here converts them - confirm what type read_yaml yields.

        Returns:
            A DataProcessingConfig built from the ``data_processing`` section.
        """
        section = self.config.data_processing

        # The stage's directory must exist before anything is written into it.
        create_directories([section.root_dir])

        return DataProcessingConfig(
            root_dir=section.root_dir,
            word2vec_modl_file=section.word2vec_modl_file,
            final_data_file=section.final_data_file,
        )

In [8]:
from src.sentimentAnalysis.utils.common_functionality import get_size, save_object, load_object, sent_to_vector, text_preprocessing
from src.sentimentAnalysis import logger
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
import numpy as np


In [9]:
class DataProcessing:
    """Train the Word2Vec model and turn the review CSV into model-ready arrays.

    Both public methods are guarded by their output file: when that file
    already exists the work is skipped and only its size is logged.
    """

    def __init__(self, config: DataProcessingConfig):
        """Keep the stage configuration.

        Args:
            config: Supplies ``word2vec_modl_file`` and ``final_data_file``.
        """
        self.config = config

    @staticmethod
    def _load_reviews(data_path):
        """Read the CSV at ``data_path`` and drop incomplete and duplicate rows."""
        return pd.read_csv(data_path).dropna(axis=0).drop_duplicates()

    def process_text(self, data_path):
        """Vectorise every review and save the features and labels.

        The saved object is a dict with two keys: ``'X'`` (the stacked results
        of ``sent_to_vector`` for each review) and ``'y'`` (1 where the
        ``sentiment`` column equals ``'positive'``, otherwise 0).

        Args:
            data_path: CSV file with ``review`` and ``sentiment`` columns.

        The Word2Vec model file must already exist (it is written by
        ``train_word2vec_model``), because it is loaded here.
        """
        if os.path.exists(self.config.final_data_file):
            logger.info(f"File already exists of size: {get_size(Path(self.config.final_data_file))}")
            return

        reviews = self._load_reviews(data_path)
        word2vec_model = load_object(self.config.word2vec_modl_file)

        # Derived columns are kept as separate Series instead of being written
        # back onto the filtered frame, which avoids chained-assignment warnings.
        labels = reviews['sentiment'].eq('positive').astype(int)

        # NOTE(review): text_preprocessing runs before tokenising here, while
        # train_word2vec_model tokenises the raw review text - confirm that
        # this asymmetry is intended.
        tokens = reviews['review'].apply(text_preprocessing).apply(simple_preprocess)
        vectors = tokens.apply(lambda words: sent_to_vector(word2vec_model, words))

        final_data_dict = {
            'X': np.array(vectors.tolist()),
            'y': np.array(labels.tolist()),
        }
        logger.info("Final data created")

        save_object(self.config.final_data_file, final_data_dict)
        logger.info("Final data saved")

    def train_word2vec_model(self, data_path):
        """Train a Word2Vec model on the tokenised reviews and save it.

        Args:
            data_path: CSV file with a ``review`` column.
        """
        if os.path.exists(self.config.word2vec_modl_file):
            logger.info(f"File already exists of size: {get_size(Path(self.config.word2vec_modl_file))}")
            return

        sentences = self._load_reviews(data_path)['review'].apply(simple_preprocess)

        # NOTE(review): with workers=4 and no explicit seed, repeated runs are
        # presumably not bit-for-bit reproducible - confirm whether that matters.
        model = gensim.models.Word2Vec(
            window=10,
            min_count=2,
            workers=4,
        )

        # progress_per only sets how often vocabulary-scan progress is logged;
        # a value of 100 emitted a log line every 100 reviews and flooded the
        # notebook output.
        model.build_vocab(sentences, progress_per=10_000)
        model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
        logger.info("word2vec model training completed")

        save_object(self.config.word2vec_modl_file, model)
        logger.info("Word2Vec model is saved")


In [10]:
try:
    config = ConfigurationManager()

    data_processing_config = config.get_data_processing_config()
    data_processing = DataProcessing(config=data_processing_config)

    # Both steps read the file configured as data_ingestion.local_data_file.
    raw_data_file = config.config.data_ingestion.local_data_file

    # Order matters: process_text loads the model file that
    # train_word2vec_model writes.
    data_processing.train_word2vec_model(raw_data_file)
    data_processing.process_text(raw_data_file)

except Exception:
    # Record the failure with its traceback, then re-raise the original
    # exception unchanged (a bare `raise` keeps the traceback intact).
    logger.exception("Data processing stage failed")
    raise

[2023-07-12 13:08:10,326: INFO: common_functionality: yaml file: config\config.yaml loaded successfully]
[2023-07-12 13:08:10,330: INFO: common_functionality: yaml file: params.yaml loaded successfully]
[2023-07-12 13:08:10,333: INFO: common_functionality: created directory at: artifacts]
[2023-07-12 13:08:10,335: INFO: common_functionality: created directory at: artifacts/data_processing]
[2023-07-12 13:08:29,886: INFO: utils: Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=100, alpha=0.025>', 'datetime': '2023-07-12T13:08:29.886183', 'gensim': '4.3.1', 'python': '3.8.17 (default, Jul  5 2023, 20:44:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'created'}]
[2023-07-12 13:08:29,888: INFO: word2vec: collecting all words and their counts]
[2023-07-12 13:08:29,889: INFO: word2vec: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types]
[2023-07-12 13:08:29,894: INFO: word2vec: PROGRESS: at sentence #100, processed 22370 w