# EthioMart Telegram Data – Pre-processing Notebook

This notebook loads the raw scraped CSV (in `data/raw/`), performs text cleaning and enrichment, and writes a pre-processed version to `data/preprocessed/`.


In [1]:
from pathlib import Path
from datetime import datetime
import pandas as pd
import re, sys, os


In [3]:
def clean_amharic_text(text: str) -> str:
    """Light normalisation for Amharic text. Removes control chars, collapses spaces, keeps common punctuation."""
    text = re.sub(r'[\r\n]+', ' ', str(text))
    text = re.sub(r'[^\w\s።፥፣፤፦፧፡፠]', '', text, flags=re.UNICODE)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [5]:
RAW_DIR = Path('../data/raw')
PRE_DIR = Path('../data/preprocessed')
PRE_DIR.mkdir(parents=True, exist_ok=True)

csv_files = list(RAW_DIR.glob('telegram_data_*.csv'))
if not csv_files:
    sys.exit('No raw CSV files found – ensure you have already scraped data.')
latest = max(csv_files, key=lambda p: p.stat().st_mtime)
print(f'Using raw file: {latest}')
df = pd.read_csv(latest, encoding='utf-8-sig')
df.head()


Using raw file: ..\data\raw\telegram_data_20250623_125228.csv


Unnamed: 0,Channel Title,Channel Username,Message ID,Message,Date,Media Path
0,Shewa Brand,https://t.me/@Shewabrand,3714,የተለያዩ ጫማዎች በፍሬ መምረጥ ማስመረጥ ለምትፈልጉ ደንበኞቻችን አዲስ ነ...,2025-06-22T07:20:07+00:00,photos\@Shewabrand_3714.jpg
1,Shewa Brand,https://t.me/@Shewabrand,3713,NIKE SB FC original 💯 \nSize 40#41#42#43#44#45...,2025-06-21T09:28:21+00:00,photos\@Shewabrand_3713.jpg
2,Shewa Brand,https://t.me/@Shewabrand,3712,ORIGINAL COTTON TUTA💯 original \nSize L#XL#2XL...,2025-06-21T05:05:45+00:00,photos\@Shewabrand_3712.jpg
3,Shewa Brand,https://t.me/@Shewabrand,3711,ZARA CLUB COTTON TISHERTS 💯 original \nSize M#...,2025-06-20T07:57:43+00:00,photos\@Shewabrand_3711.jpg
4,Shewa Brand,https://t.me/@Shewabrand,3710,jordan 1 original 💯 \nSize 40#41#42#43\nMADE I...,2025-06-20T06:15:40+00:00,photos\@Shewabrand_3710.jpg


In [None]:
# Ensure Clean Text column exists / recompute
df['Clean Text'] = df['Clean Text'].fillna('').apply(clean_amharic_text) if 'Clean Text' in df.columns else df['Message'].apply(clean_amharic_text)

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
pre_path = PRE_DIR / f'telegram_data_preprocessed_{timestamp}.csv'
df.to_csv(pre_path, index=False, encoding='utf-8-sig')
print(f'Pre-processed data written to: {pre_path}')
df.head()


Pre-processed data written to: ..\data\preprocessed\telegram_data_preprocessed_20250625_142643.csv


Unnamed: 0,Channel Title,Channel Username,Message ID,Message,Date,Media Path,Clean Text
0,Shewa Brand,https://t.me/@Shewabrand,3714,የተለያዩ ጫማዎች በፍሬ መምረጥ ማስመረጥ ለምትፈልጉ ደንበኞቻችን አዲስ ነ...,2025-06-22T07:20:07+00:00,photos\@Shewabrand_3714.jpg,የተለያዩ ጫማዎች በፍሬ መምረጥ ማስመረጥ ለምትፈልጉ ደንበኞቻችን አዲስ ነ...
1,Shewa Brand,https://t.me/@Shewabrand,3713,NIKE SB FC original 💯 \nSize 40#41#42#43#44#45...,2025-06-21T09:28:21+00:00,photos\@Shewabrand_3713.jpg,NIKE SB FC original Size 404142434445 MADE IN ...
2,Shewa Brand,https://t.me/@Shewabrand,3712,ORIGINAL COTTON TUTA💯 original \nSize L#XL#2XL...,2025-06-21T05:05:45+00:00,photos\@Shewabrand_3712.jpg,ORIGINAL COTTON TUTA original Size LXL2XL3XL4X...
3,Shewa Brand,https://t.me/@Shewabrand,3711,ZARA CLUB COTTON TISHERTS 💯 original \nSize M#...,2025-06-20T07:57:43+00:00,photos\@Shewabrand_3711.jpg,ZARA CLUB COTTON TISHERTS original Size MLXLXX...
4,Shewa Brand,https://t.me/@Shewabrand,3710,jordan 1 original 💯 \nSize 40#41#42#43\nMADE I...,2025-06-20T06:15:40+00:00,photos\@Shewabrand_3710.jpg,jordan 1 original Size 40414243 MADE IN VIETNA...
