In [1]:
# Cell 1 - Import necessary libraries
import pandas as pd
import numpy as np
import os
import sys
from datetime import datetime
import logging

# Add project root to path
# The parent of the current working directory is treated as the project root,
# which is what makes the `src.*` imports below resolvable.
project_root = os.path.abspath(
    os.path.join(os.getcwd(), '..')
)
# Register the root only once so re-running this cell does not grow sys.path.
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from src.utils.cleaning_utils import (
    clean_text,
    standardize_date,
    remove_duplicates,
    validate_data
)

# Configure logging
# FileHandler opens its target file as soon as it is constructed, so the
# directory must exist beforehand; otherwise this cell fails with
# FileNotFoundError on a fresh checkout.
os.makedirs('logs', exist_ok=True)

# Send every INFO-and-above record both to the log file and to the cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/cleaning.log'),
        logging.StreamHandler()
    ]
)
# Logger used by the notebook's own cells.
logger = logging.getLogger(__name__)

In [2]:
# Cell 2 - Load and examine raw data
# Load the data
df = pd.read_csv('telegram_data.csv')

# Display basic information about the dataset
print("Dataset Info:")
# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() would append a stray "None" line to the output.
df.info()

# Per-column count of null cells
print("\nMissing Values:")
print(df.isnull().sum())

# Number of rows that are exact copies of an earlier row
print("\nDuplicate Rows:", df.duplicated().sum())

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   channel     400 non-null    object
 1   date        400 non-null    object
 2   text        262 non-null    object
 3   has_image   400 non-null    bool  
 4   message_id  400 non-null    int64 
 5   image_path  350 non-null    object
dtypes: bool(1), int64(1), object(4)
memory usage: 16.1+ KB
None

Missing Values:
channel         0
date            0
text          138
has_image       0
message_id      0
image_path     50
dtype: int64

Duplicate Rows: 0


In [3]:
# Cell 3 - Clean and transform data
def clean_dataset(df):
    """Run the cleaning pipeline on a copy of ``df`` and return the result.

    The input frame is copied first, so the caller's object is not modified.
    Steps, in order:

    1. apply ``clean_text`` to every value of the 'text' column;
    2. apply ``standardize_date`` to every value of the 'date' column;
    3. pass the frame through ``remove_duplicates``;
    4. keep only the rows selected by the result of ``validate_data``;
    5. stamp the frame with 'cleaned_at' and 'cleaning_version' columns.

    The four helpers come from ``src.utils.cleaning_utils``; their exact rules
    are not visible in this notebook.

    Args:
        df: Raw DataFrame containing at least 'text' and 'date' columns.

    Returns:
        A new DataFrame holding the cleaned rows plus the two metadata columns.
    """
    logger.info("Starting data cleaning process...")

    # Work on a private copy so the raw frame stays available for re-runs
    result = df.copy()

    # Element-wise column transforms: (progress message, column, function)
    column_steps = (
        ("Cleaning text data...", 'text', clean_text),
        ("Standardizing dates...", 'date', standardize_date),
    )
    for message, column, transform in column_steps:
        logger.info(message)
        result[column] = result[column].apply(transform)

    # Frame-level de-duplication
    logger.info("Removing duplicates...")
    result = remove_duplicates(result)

    # validate_data's result is used as a row filter; copying the selection
    # makes the column assignments below act on an independent frame.
    logger.info("Handling missing values...")
    keep_rows = validate_data(result)
    result = result[keep_rows].copy()

    # Provenance metadata: when the cleaning ran and which version of it
    cleaned_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    result['cleaned_at'] = cleaned_at
    result['cleaning_version'] = '1.0'

    return result

# Run the cleaning pipeline on the raw frame loaded in the previous cell
df_cleaned = clean_dataset(df)

# Write the cleaned frame to disk, creating the output directory if it is
# missing; the DataFrame index is not written to the file.
os.makedirs(name='data/cleaned', exist_ok=True)
df_cleaned.to_csv(
    path_or_buf='data/cleaned/cleaned_telegram_data.csv',
    index=False,
)

2025-01-31 11:26:16,683 - __main__ - INFO - Starting data cleaning process...
2025-01-31 11:26:16,688 - __main__ - INFO - Cleaning text data...
2025-01-31 11:26:16,752 - __main__ - INFO - Standardizing dates...
2025-01-31 11:26:17,597 - __main__ - INFO - Removing duplicates...
2025-01-31 11:26:17,602 - src.utils.cleaning_utils - INFO - Removed 52 duplicate entries
2025-01-31 11:26:17,604 - __main__ - INFO - Handling missing values...


In [4]:
# Cell 4 - Data validation and quality checks
def run_quality_checks(df_cleaned):
    """Run data quality checks on cleaned dataset and print a pass/fail report.

    Checks performed:
        - "No missing values": no null cell anywhere in the frame.
        - "No duplicates": no row is an exact copy of another row.
        - "Valid dates": every 'date' value parses as a datetime.
        - "Non-empty text": every 'text' value has at least one
          non-whitespace character (a missing value fails this check).

    Args:
        df_cleaned: DataFrame with at least 'date' and 'text' columns.

    Returns:
        bool: True only if every check passed.
    """
    # errors='coerce' turns an unparseable date into NaT, so a bad value shows
    # up as a failed check instead of aborting the whole report with an exception.
    parsed_dates = pd.to_datetime(df_cleaned['date'], errors='coerce')

    checks = {
        "No missing values": df_cleaned.isnull().sum().sum() == 0,
        "No duplicates": df_cleaned.duplicated().sum() == 0,
        "Valid dates": parsed_dates.notnull().all(),
        "Non-empty text": df_cleaned['text'].str.strip().str.len().gt(0).all()
    }

    print("Data Quality Checks:")
    for check, result in checks.items():
        print(f"{check}: {'✓' if result else '✗'}")

    return all(checks.values())

# Run quality checks
quality_passed = run_quality_checks(df_cleaned)

# Surface a failed report in the log as well; otherwise the overall result
# would be computed and then silently dropped.
if not quality_passed:
    logger.warning("Data quality checks failed; review the cleaned dataset before using it.")

Data Quality Checks:
No missing values: ✗
No duplicates: ✓
Valid dates: ✓
Non-empty text: ✓
