In [29]:
import os

In [32]:
# %pwd

In [31]:
# %ls

In [6]:
# import os
os.chdir('../')

In [33]:
%pwd

'/Users/satwik/Downloads/MLproj/airlines_sentiment_classification'

In [52]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataPreprocessingConfig:
    root_dir: Path
    training_data_path : Path
    training_data_file : Path
    training_cleansed_data : Path
    datasets_dir : Path
 



In [61]:
from pathlib import Path

CONFIG_FILE_PATH = Path("/Users/satwik/Downloads/MLproj/airlines_sentiment_classification/config/config.yaml")
PARAMS_FILE_PATH = Path("/Users/satwik/Downloads/MLproj/airlines_sentiment_classification/params.yaml")

In [36]:
# H = Path("/Users/satwik/Downloads/MLproj/airlines_sentiment_classification/params.yaml")

In [54]:
from src.airlinesSentiment.constants import *
from src.airlinesSentiment.utils.common import read_yaml, create_directories

In [82]:
class ConfigurationManager:
    def __init__(
        self, 
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])
    

    def get_data_preprocessing_config(self) -> DataPreprocessingConfig:
        config = self.config.feature_engineering

        create_directories([config.root_dir])

        data_preprocessing_config = DataPreprocessingConfig(
            root_dir=config.root_dir,
            training_data_path=config.training_data_path,
            training_data_file=config.training_data_file,
            training_cleansed_data=config.training_cleansed_data,
            datasets_dir=config.datasets_dir
        )

        return data_preprocessing_config



In [83]:
import os
from airlinesSentiment import logger
from airlinesSentiment.utils.common import get_size
from torch.utils.data import Dataset
import gdown
import spacy
import torchgen
import logging
import torch
import urllib.request as request
from zipfile import ZipFile
import tensorflow as tf
from nltk.corpus import stopwords
from pathlib import Path
import string
import pandas as pd
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split

logger = logging.getLogger(__name__)

        
class DataPreprocessing: 
    def __init__(self, config: DataPreprocessingConfig):

        self.config = config
        self.nlp = spacy.load('en_core_web_sm')
        self.stop_words = set(stopwords.words('english'))
        self.mapping_labels = {'negative': 0, 'neutral': 1, 'positive': 2}
        self.tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


        self.data = self.load_data()
        self.output_file_path = Path(self.config.training_cleansed_data) / 'cleaned_tweets.csv'
        self.datasets_dir = Path(self.config.datasets_dir)


    def load_data(self) -> pd.DataFrame:
        if not Path(self.config.training_data_file).exists():
            raise FileNotFoundError(f'file not found: {self.config.training_data_file}')
        
        #load the dataset
        data = pd.read_csv(self.config.training_data_file)
        logger.info(f"Dataset loaded from {self.config.training_data_file}")
        return data
    
    def text_process(self) -> None:

        self.data['cleaned_text'] = self.data['text'].apply(self._process_single_text)
        logger.info("Text processing completed")



    def _process_single_text(self, text: str) -> str:

        text = text.lower()
        text = text.translate(str.maketrans('', '', string.punctuation))

        doc = self.nlp(text)
        tokens = [token.lemma_ for token in doc if token.text not in self.stop_words]

        return ' '.join(tokens)
    
    def mapping_labels_func(self) -> None:

        self.data['labels'] = self.data['airline_sentiment'].map(self.mapping_labels)
        logger.info("Labels mapped to numerical values")

    def tokenize_text(self) -> None:
        self.data['tokenized'] = self.data['cleaned_text'].apply(
            lambda x: self.tokenizer(x, padding = 'max_length', truncation=True, max_length=128, return_tensors='pt')
        )
    logger.info("Text tokenization completed")

    def save_data(self) -> None:
        self.data.to_csv(self.output_file_path, index=False)
        logger.info(f"Preprocessed data saved to {self.output_file_path}")

    def train_val_test_split(self, test_size: float = 0.3, val_size: float = 0.5, random_state: int = 42) -> dict:
        train_texts, temp_texts, train_labels, temp_labels = train_test_split(
            self.data['cleaned_text'].to_list(), self.data['labels'].to_list(), test_size=test_size, random_state=random_state
        )
        val_texts, test_texts, val_labels, test_labels = train_test_split(
            temp_texts, temp_labels, test_size=val_size, random_state=random_state
        )
        logger.info("Data split into test, train and validation sets ")
        return {
            'train': {'texts': train_texts, 'labels': train_labels},
            'val': {'texts': val_texts, 'labels': val_labels },
            'test': {'texts': test_texts, 'labels': test_labels}

        }

    def covert_to_tokenized_datasets(self, splits: dict) -> dict:

        #tokenize the texts
        train_encodings = self.tokenizer(splits['train']['texts'], truncation=True, padding=True, max_length=128)
        val_enocdings = self.tokenizer(splits['val']['texts'], truncation=True, padding=True, max_length=128)
        test_encodings = self.tokenizer(splits['test']['texts'], truncation=True, padding=True, max_length=128)

        #create pytorch datasets
        train_dataset = SentimentDataset(train_encodings, splits['train']['labels'])
        val_dataset = SentimentDataset(val_enocdings, splits['val']['labels'])
        test_dataset = SentimentDataset(test_encodings, splits['val']['labels'])

        logger.info("Pytorch datasets created succesffuly.")
        return {
            'train' : train_dataset,
            'val' : val_dataset,
            'test' : test_dataset
        }
    def save_datasets(self, train_dataset, val_dataset, test_dataset):
        #create the directory if the directory doesn't exists
        datasets_dir = Path(self.config.datasets_dir)
        datasets_dir.mkdir(parents=True, exist_ok=True)

        #save the datasets
        torch.save(train_dataset, datasets_dir / "train_dataset.pt")
        torch.save(val_dataset, datasets_dir / "val_dataset.pt")
        torch.save(test_dataset, datasets_dir / "test_dataset.pt")

        logger.info(f"Datasets saved to {datasets_dir}")

[2025-08-05 16:34:50,275: INFO: 3335293507: Text tokenization completed]


In [84]:
try:
    config = ConfigurationManager()
    get_data_pre_config = config.get_data_preprocessing_config()
    data_pre_process = DataPreprocessing(config=get_data_pre_config)

    #prepare the data
    data_pre_process.text_process()
    data_pre_process.mapping_labels_func()
    data_pre_process.tokenize_text()
    

    #save the preprocessed data
    data_pre_process.save_data()

    #splits the dataset into train, validation and test sets
    splits = data_pre_process.train_val_test_split()

    #converts split into PyTorch dataset
    tokenized_datasets = data_pre_process.covert_to_tokenized_datasets(splits)

    # Access the PyTorch datasets 
    train_dataset = tokenized_datasets['train']
    val_dataset = tokenized_datasets['val']
    test_dataset = tokenized_datasets['test']
    logger.info("PyTorch data created successfully")

    datasets = data_pre_process.save_datasets(train_dataset, val_dataset, test_dataset)


except Exception as e:
    logger.error(f"Error during data preprocessing {e}")
    raise e


[2025-08-05 16:34:51,528: INFO: common: yaml file: /Users/satwik/Downloads/MLproj/airlines_sentiment_classification/config/config.yaml loaded successfully]
[2025-08-05 16:34:51,533: INFO: common: yaml file: /Users/satwik/Downloads/MLproj/airlines_sentiment_classification/params.yaml loaded successfully]
[2025-08-05 16:34:51,535: INFO: common: created directory at: artifacts]
[2025-08-05 16:34:51,535: INFO: common: created directory at: artifacts/feature_engineering]
[2025-08-05 16:34:52,133: INFO: 3335293507: Dataset loaded from artifacts/feature_engineering/Tweets.csv]
[2025-08-05 16:36:01,692: INFO: 3335293507: Text processing completed]
[2025-08-05 16:36:01,699: INFO: 3335293507: Labels mapped to numerical values]
[2025-08-05 16:36:08,691: INFO: 3335293507: Preprocessed data saved to artifacts/feature_engineering/cleaned_tweets.csv]
[2025-08-05 16:36:08,700: INFO: 3335293507: Data split into test, train and validation sets ]
[2025-08-05 16:36:10,264: INFO: 3335293507: Pytorch datase

In [17]:
%pwd

'/Users/satwik/Downloads/MLproj/airlines_sentiment_classification'