In [14]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import yaml
import logging

# logging configuration
logger = logging.getLogger('data_ingestion')
logger.setLevel('DEBUG')

console_handler = logging.StreamHandler()
console_handler.setLevel('DEBUG')

file_handler = logging.FileHandler('errors.log')
file_handler.setLevel('ERROR')

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
console_handler.setFormatter(formatter)
file_handler.setFormatter(formatter)

logger.addHandler(console_handler)
logger.addHandler(file_handler)

def load_params(params_path: str) -> dict:
    """Load parameters from a YAML file."""
    try:
        with open(params_path, 'r') as file:
            params = yaml.safe_load(file)
        logger.debug('Parameters retrieved from %s', params_path)
        return params
    except FileNotFoundError:
        logger.error('File not found: %s', params_path)
        raise
    except yaml.YAMLError as e:
        logger.error('YAML error: %s', e)
        raise
    except Exception as e:
        logger.error('Unexpected error: %s', e)
        raise

def load_data(data_url: str) -> pd.DataFrame:
    """Load data from a CSV file."""
    try:
        print("load data not")
        print(data_url)
        df = pd.read_csv(data_url)
        print(df.shape)
        logger.debug('Data loaded from %s', data_url)
        return df
    except pd.errors.ParserError as e:
        logger.error('Failed to parse the CSV file: %s', e)
        raise
    except Exception as e:
        logger.error('Unexpected error occurred while loading the data: %s', e)
        raise

def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Preprocess the data."""
    try:
        df.drop(columns=['tweet_id'], inplace=True)
        final_df = df[df['sentiment'].isin(['happiness', 'sadness'])]
        final_df['sentiment'].replace({'happiness': 1, 'sadness': 0}, inplace=True)
        logger.debug('Data preprocessing completed')
        return final_df
    except KeyError as e:
        logger.error('Missing column in the dataframe: %s', e)
        raise
    except Exception as e:
        logger.error('Unexpected error during preprocessing: %s', e)
        raise

def save_data(train_data: pd.DataFrame, test_data: pd.DataFrame, data_path: str) -> None:
    """Save the train and test datasets."""
    try:
        raw_data_path = os.path.join(data_path, 'raw')
        os.makedirs(raw_data_path, exist_ok=True)
        train_data.to_csv(os.path.join(raw_data_path, "train.csv"), index=False)
        test_data.to_csv(os.path.join(raw_data_path, "test.csv"), index=False)
        logger.debug('Train and test data saved to %s', raw_data_path)
    except Exception as e:
        logger.error('Unexpected error occurred while saving the data: %s', e)
        raise

def main():
    try:
        params = load_params(params_path='../params.yaml')
        test_size = params['data_ingestion']['test_size']
        print(test_size)
        df = load_data(data_url='https://raw.githubusercontent.com/campusx-official/jupyter-masterclass/main/tweet_emotions.csv')
        final_df = preprocess_data(df)
        train_data, test_data = train_test_split(final_df, test_size=test_size, random_state=42)
        save_data(train_data, test_data, data_path='/data/raw')
    except Exception as e:
        logger.error('Failed to complete the data ingestion process: %s', e)
        print(f"Error: {e}")

In [15]:
if __name__ == '__main__':
    main()

2024-07-03 16:48:42,363 - data_ingestion - DEBUG - Parameters retrieved from ../params.yaml
2024-07-03 16:48:42,363 - data_ingestion - DEBUG - Parameters retrieved from ../params.yaml
2024-07-03 16:48:42,363 - data_ingestion - DEBUG - Parameters retrieved from ../params.yaml
2024-07-03 16:48:42,363 - data_ingestion - DEBUG - Parameters retrieved from ../params.yaml
2024-07-03 16:48:42,363 - data_ingestion - ERROR - Failed to complete the data ingestion process: 'data_ingestion'
2024-07-03 16:48:42,363 - data_ingestion - ERROR - Failed to complete the data ingestion process: 'data_ingestion'
2024-07-03 16:48:42,363 - data_ingestion - ERROR - Failed to complete the data ingestion process: 'data_ingestion'
2024-07-03 16:48:42,363 - data_ingestion - ERROR - Failed to complete the data ingestion process: 'data_ingestion'


Error: 'data_ingestion'


In [5]:
df = load_data(data_url='https://raw.githubusercontent.com/campusx-official/jupyter-masterclass/main/tweet_emotions.csv')

load data not
https://raw.githubusercontent.com/campusx-official/jupyter-masterclass/main/tweet_emotions.csv
(40000, 3)


In [6]:
df.shape

(40000, 3)

In [7]:
df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [18]:
os.getcwd()

'C:\\Users\\comp\\Desktop\\Python\\mlops\\sentiment\\notebooks'

In [19]:
from pathlib import Path
path = Path(os.getcwd())
print(path.parent.absolute())

C:\Users\comp\Desktop\Python\mlops\sentiment


In [22]:
path.parent

WindowsPath('C:/Users/comp/Desktop/Python/mlops/sentiment')