In [37]:
import pandas as pd
import numpy as np
import os
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import logging

# Fetch the NLTK resources this notebook relies on: the stopword lists read by
# `stopwords.words('english')` and the WordNet data behind `WordNetLemmatizer`.
# NOTE(review): omw-1.4 is downloaded alongside WordNet — presumably required by
# the installed NLTK version's WordNet reader; confirm before removing.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\praye\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\praye\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\praye\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [38]:
# logging configuration
# Named logger for this notebook. DEBUG on the logger lets every record reach
# the handlers, which then apply their own thresholds.
logger = logging.getLogger('data_transformation')
logger.setLevel('DEBUG')

# `logging.getLogger` returns the same process-wide object on every call, so
# re-running this cell would otherwise stack another pair of handlers on it and
# emit each message multiple times. Detach and close whatever a previous run
# attached before adding fresh handlers.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Console: show everything from DEBUG upwards.
console_handler = logging.StreamHandler()
console_handler.setLevel('DEBUG')

# File: persist only ERROR and above to transformation_errors.log.
file_handler = logging.FileHandler('transformation_errors.log')
file_handler.setLevel('ERROR')

# Both handlers share one timestamped format.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
console_handler.setFormatter(formatter)
file_handler.setFormatter(formatter)

logger.addHandler(console_handler)
logger.addHandler(file_handler)


In [39]:
# Load the raw train/test CSVs.
#
# Messages go through the notebook's configured `logger` (not the bare
# `logging` module, whose root logger has no handlers here and drops INFO
# records), so errors also land in transformation_errors.log.
#
# Every failure is logged and then re-raised: the following cells need
# `train_data` and `test_data`, so stopping here gives a clear error instead of
# a later NameError.
try:
    # Define the data path
    path = os.path.join("..", "data", "raw")
    logger.info(f"Looking for data in path: {path}")

    # Load train data
    train_path = os.path.join(path, "train_data.csv")
    train_data = pd.read_csv(train_path)
    logger.info(f"Train data loaded successfully from: {train_path} (shape: {train_data.shape})")

    # Load test data
    test_path = os.path.join(path, "test_data.csv")
    test_data = pd.read_csv(test_path)
    logger.info(f"Test data loaded successfully from: {test_path} (shape: {test_data.shape})")

except FileNotFoundError as fnf_error:
    logger.error(f"File not found: {fnf_error}")
    raise
except pd.errors.EmptyDataError as ede:
    logger.error(f"Empty file encountered: {ede}")
    raise
except Exception as e:
    logger.error(f"An error occurred while loading the data: {e}")
    raise

In [40]:
# Preview the first rows of the loaded training data.
train_data.head()

Unnamed: 0,content,label
0,welp youtube just gonna let soulless corporati...,0.0
1,again said you are into deep propaganda\nmodi ...,1.0
2,have subscribed here for some fun this series ...,1.0
3,egoistic priyankavadramayawati mamtanaidu will...,-1.0
4,that innocent janata and chatur modi,1.0


In [41]:
def lower_case(df: pd.Series) -> pd.Series:
    """Return the series with every string lower-cased.

    Args:
        df: Series of text values (despite the name, a Series, not a DataFrame).

    Returns:
        The lower-cased series. If the operation fails for any reason, the
        error is printed and the input series is returned unchanged.
    """
    try:
        lowered = df.str.lower()
    except Exception as e:
        print(f"Error in lower_case: {e}")
        return df
    return lowered

def remove_special_characters(df: pd.Series) -> pd.Series:
    """Replace every character that is not an ASCII letter or digit with a space.

    Args:
        df: Series of text values.

    Returns:
        The cleaned series. Whitespace and underscores are also replaced by a
        single space each, since they fall outside ``[a-zA-Z0-9]``. On any
        failure the error is printed and the input series is returned as-is.
    """
    try:
        cleaned = df.str.replace(r"[^a-zA-Z0-9]", " ", regex=True)
    except Exception as e:
        print(f"Error in remove_special_characters: {e}")
        return df
    return cleaned

def remove_extra_spaces(df: pd.Series) -> pd.Series:
    """Collapse each run of whitespace into a single space.

    Args:
        df: Series of text values.

    Returns:
        The series with every whitespace run (spaces, tabs, newlines) replaced
        by one space; leading/trailing runs become a single space rather than
        being stripped. On any failure the error is printed and the input
        series is returned unchanged.
    """
    try:
        collapsed = df.str.replace(r"\s+", " ", regex=True)
    except Exception as e:
        print(f"Error in remove_extra_spaces: {e}")
        return df
    return collapsed

def remove_leading_trailing_spaces(df: pd.Series) -> pd.Series:
    """Strip whitespace from both ends of every string in the series.

    Args:
        df: Series of text values.

    Returns:
        The stripped series. On any failure the error is printed and the input
        series is returned unchanged.
    """
    try:
        stripped = df.str.strip()
    except Exception as e:
        print(f"Error in remove_leading_trailing_spaces: {e}")
        return df
    return stripped

def remove_numbers(df: pd.Series) -> pd.Series:
    """Delete every run of digits from the text.

    Args:
        df: Series of text values.

    Returns:
        The series with digit runs removed (replaced by the empty string, so
        surrounding characters are joined together). On any failure the error
        is printed and the input series is returned unchanged.
    """
    try:
        without_digits = df.str.replace(r"\d+", "", regex=True)
    except Exception as e:
        print(f"Error in remove_numbers: {e}")
        return df
    return without_digits

def remove_stopwords(df: pd.Series) -> pd.Series:
    """Drop English stopwords from every string, keeping a few contrast words.

    The NLTK English stopword list is used, minus 'not', 'but', 'however',
    'no' and 'yet', which are therefore left in the text. Matching is
    case-insensitive. Each text is split on whitespace and the surviving
    tokens are re-joined with single spaces.

    Args:
        df: Series of text values.

    Returns:
        The filtered series. On any failure the error is printed and the input
        series is returned unchanged.
    """
    def _keep_non_stopwords(text, excluded):
        # Compare the lower-cased token but keep its original casing in the output.
        kept_tokens = [token for token in text.split() if token.lower() not in excluded]
        return ' '.join(kept_tokens)

    try:
        preserved = {'not', 'but', 'however', 'no', 'yet'}
        stop_words = set(stopwords.words('english')).difference(preserved)
        return df.apply(lambda text: _keep_non_stopwords(text, stop_words))
    except Exception as e:
        print(f"Error in remove_stopwords: {e}")
        return df

def remove_punctuation(df: pd.Series) -> pd.Series:
    """Delete every character that is neither a word character nor whitespace.

    Args:
        df: Series of text values.

    Returns:
        The series with such characters removed. Underscores are kept because
        ``\\w`` includes them. On any failure the error is printed and the
        input series is returned unchanged.
    """
    try:
        without_punctuation = df.str.replace(r"[^\w\s]", "", regex=True)
    except Exception as e:
        print(f"Error in remove_punctuation: {e}")
        return df
    return without_punctuation

def lemmatize_words(df: pd.Series) -> pd.Series:
    """Lemmatize every whitespace-separated token with NLTK's WordNetLemmatizer.

    Each text is split on whitespace, every token is passed through
    ``lemmatize`` with its default arguments, and the results are re-joined
    with single spaces.

    Args:
        df: Series of text values.

    Returns:
        The lemmatized series. On any failure the error is printed and the
        input series is returned unchanged.
    """
    try:
        lemmatizer = WordNetLemmatizer()

        def _lemmatize_text(text):
            return ' '.join(map(lemmatizer.lemmatize, text.split()))

        return df.apply(_lemmatize_text)
    except Exception as e:
        print(f"Error in lemmatize_words: {e}")
        return df
    
def remove_empty_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Drop rows whose 'content' is empty or whitespace-only.

    Args:
        df: DataFrame with a string 'content' column.

    Returns:
        The rows whose stripped 'content' is not the empty string, with their
        original index labels. On any failure (for example a missing 'content'
        column) the error is printed and the input frame is returned unchanged.
    """
    try:
        is_blank = df['content'].str.strip() == ""
        return df[~is_blank]
    except Exception as e:
        print(f"Error in remove_empty_rows: {e}")
        return df

def remove_urls(text_series: pd.Series) -> pd.Series:
    """Remove URLs from a Pandas Series of text.

    A URL is an ``http://`` or ``https://`` prefix, or a bare ``www.`` prefix,
    followed by everything up to the next whitespace character. The previous
    character-class pattern stopped at characters such as ``#`` and ``~``,
    leaving fragments like ``#section`` behind in the text.

    Args:
        text_series: Series of text values. URLs must still be intact, i.e.
            this has to run before any step that strips ``:``, ``/`` or ``.``.

    Returns:
        The series with each URL replaced by the empty string (surrounding
        whitespace is left in place). On any failure the error is logged and
        the input series is returned unchanged.
    """
    try:
        # \S+ consumes the whole URL, including query strings and fragments.
        url_pattern = r'https?://\S+|\bwww\.\S+'
        return text_series.str.replace(url_pattern, '', regex=True)
    except Exception as e:
        # Use the notebook's named logger so the error reaches its handlers
        # (console and transformation_errors.log) rather than the bare root logger.
        logging.getLogger('data_transformation').error(f"Error in remove_urls: {e}")
        return text_series

In [42]:
# Function for preprocessing a DataFrame's text column
def preprocess_data(data1):
    """Clean the 'content' column of a DataFrame and return the cleaned copy.

    The input frame is not modified. Steps, in order:

    1. Drop rows whose 'content' is missing. Casting them to ``str`` would
       otherwise turn them into the literal token "nan".
    2. Cast 'content' to ``str`` and drop empty / whitespace-only rows.
    3. Lower-case, then remove URLs. This has to happen before special
       characters are stripped: once ':', '/' and '.' have become spaces the
       URL pattern can no longer match.
    4. Collapse whitespace, strip, replace special characters, lemmatize,
       remove digits, remove stopwords and remove punctuation.
    5. Drop rows that became empty as a result of the cleaning.

    Args:
        data1: DataFrame with a 'content' column.

    Returns:
        The preprocessed DataFrame. If an unexpected error occurs it is logged
        and whatever state `data1` had reached is returned (the untouched
        input if the failure happened before the copy was made).
    """
    # Named logger configured by this notebook; the bare root logger has no
    # handlers here, so INFO records sent to it would be dropped.
    log = logging.getLogger('data_transformation')
    try:
        log.info("Starting data1 preprocessing...")

        data1 = data1.copy()

        # Missing text must go before the str cast (see docstring, step 1).
        data1 = data1.dropna(subset=['content'])
        data1 = data1.assign(content=data1['content'].astype(str))
        data1 = remove_empty_rows(data1)

        # Series -> Series cleaning steps, applied in this order.
        text_steps = (
            lower_case,
            remove_urls,
            remove_extra_spaces,
            remove_leading_trailing_spaces,
            remove_special_characters,
            lemmatize_words,
            remove_numbers,
            remove_stopwords,
            remove_punctuation,
        )
        for step in text_steps:
            # assign() builds a new frame, avoiding chained-assignment warnings
            # on the filtered frame returned by remove_empty_rows.
            data1 = data1.assign(content=step(data1['content']))

        # Rows made up only of stopwords, digits, URLs or symbols are empty now.
        data1 = remove_empty_rows(data1)

        log.info("data1 preprocessing completed.")

        return data1
    except Exception as e:
        log.error(f"Error in preprocess_data: {e}")
        return data1

In [43]:
# Apply the preprocessing function to the train data and preview the result
train_processed_data = preprocess_data(train_data)
print("Preprocessed train data:")
train_processed_data.head()

Preprocessed train data:


Unnamed: 0,content,label
0,welp youtube gonna let soulless corporation do...,0.0
1,said deep propaganda modi wrote personal lette...,1.0
2,subscribed fun series world thanks please vote...,1.0
3,egoistic priyankavadramayawati mamtanaidu pull...,-1.0
4,innocent janata chatur modi,1.0


In [44]:
# Apply the same preprocessing function to the test data and preview the result
test_processed_data = preprocess_data(test_data)
print("Preprocessed test data:")
test_processed_data.head()

Preprocessed test data:


Unnamed: 0,content,label
0,anyone think modi strengthened democracy not s...,0.0
1,announcing type scheme make confident modi gonna,1.0
2,shri narendra modi want make every indian capa...,1.0
3,major reason modi youth failure modi,-1.0
4,jimrmodi need not spacehe capable enough deal ...,1.0


In [45]:
# Show any processed test rows whose content still contains a newline
# (an empty result means none remain)
test_processed_data[test_processed_data['content'].str.contains("\n")]

Unnamed: 0,content,label


In [46]:
# Show any processed train rows whose content still contains a newline
# (an empty result means none remain)
train_processed_data[train_processed_data['content'].str.contains("\n")]

Unnamed: 0,content,label


In [47]:
# Count missing values per column in the processed train and test data
train_processed_data.isna().sum(), test_processed_data.isna().sum()

(content    0
 label      0
 dtype: int64,
 content    0
 label      0
 dtype: int64)

In [48]:
# Export the processed data to CSV files

# Make sure the output directory exists; on a fresh checkout ../data/interim
# may be missing and to_csv would fail.
interim_dir = os.path.join("..", "data", "interim")
os.makedirs(interim_dir, exist_ok=True)

## Train data
train_processed_path = os.path.join(interim_dir, "train_processed.csv")
train_processed_data.to_csv(train_processed_path, index=False)
# Log through the notebook's configured logger; the bare root logger drops INFO.
logger.info(f"Processed train data saved to: {train_processed_path}")

## Test data
test_processed_path = os.path.join(interim_dir, "test_processed.csv")
test_processed_data.to_csv(test_processed_path, index=False)
logger.info(f"Processed test data saved to: {test_processed_path}")