In [16]:
# Setup import paths
import sys
import os

# Make the sibling `src/` directory importable. The path is resolved relative
# to the notebook's working directory, so the notebook must be started from its
# own folder. Guarded so that re-running this cell does not keep appending
# duplicate entries to sys.path.
src_dir = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
if src_dir not in sys.path:
    sys.path.append(src_dir)
# Import our modules
import ingestion
import preprocessing

In [None]:
# SCRAPE
# Top-level `await` only works where the kernel supports it (IPython/Jupyter);
# this cell cannot be run as a plain Python script.
# NOTE(review): `ingestion.scrape()` presumably writes the raw scrape to disk
# and returns the path of the file it wrote — confirm in src/ingestion.
output_file = await ingestion.scrape()
print(f"Raw data saved: {output_file}")

In [17]:
from pathlib import Path
import pandas as pd
import numpy as np

# Step 1: Get notebook's current working directory
# (all data paths below are resolved relative to its parent directory)
base_dir = Path.cwd()

# Step 2: Define full file paths
raw_path1 = base_dir.parent / "data/raw/scrapped_data_3.csv"
raw_path2 = base_dir.parent / "data/raw/raw_20250619_150258.csv"

# Step 3: Load files as DataFrames
df1 = pd.read_csv(raw_path1)
df2 = pd.read_csv(raw_path2)

# Step 4: Rename columns so both frames share one schema:
# channel_name / message / timestamp / media_file
df1 = df1.rename(columns={
    'Channel Title': 'channel_name',
    'Message': 'message',
    'Date': 'timestamp',
    'Media Path': 'media_file'
})

# df2 already uses the target names for everything except the channel column.
df2 = df2.rename(columns={'channel': 'channel_name'})

# Step 5: Drop unnecessary columns (errors='ignore' tolerates files that lack them)
df1 = df1.drop(columns=['Channel Username', 'ID'], errors='ignore')
df2 = df2.drop(columns=['sender_id'], errors='ignore')

# Step 6: Tag source and fill in missing views for df1
# The tag is derived from the path so the two can never drift apart.
df1["source_file"] = raw_path1.name
df2["source_file"] = raw_path2.name

# NOTE: the values generated here are synthetic placeholders, not real view
# counts; rows carrying them can be identified through `source_file`.
# A dedicated RandomState(42) yields the same numbers as seeding the global
# NumPy RNG with 42, but leaves the global RNG state untouched for later cells.
# The guard keeps real data intact should the file ever ship a `views` column.
if "views" not in df1.columns:
    df1["views"] = np.random.RandomState(42).randint(100, 4000, size=len(df1))

# Step 7: Merge
merged_df = pd.concat([df1, df2], ignore_index=True, sort=False)

# Step 8: Clean
# astype(str) turns missing messages into the 3-character string 'nan', which
# the length filter below removes together with other very short messages.
merged_df['message'] = merged_df['message'].astype(str).str.strip()
merged_df = (
    merged_df[merged_df['message'].str.len() > 5]
    .drop_duplicates(subset=["message", "timestamp"])
    .reset_index(drop=True)
)

# Step 9: Save
output_path = base_dir.parent / "data/raw/merged_cleaned.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
merged_df.to_csv(output_path, index=False)

print(" Merged and cleaned dataset saved to:", output_path)
print(" Final shape:", merged_df.shape)
print(" Columns:", merged_df.columns.tolist())


 Merged and cleaned dataset saved to: /home/nurye/Desktop/10_Academy/week_4/Amharic-E-commerce-Data-Extractor/data/raw/merged_cleaned.csv
 Final shape: (20779, 6)
 Columns: ['channel_name', 'message', 'timestamp', 'media_file', 'source_file', 'views']


In [19]:
# PREPROCESS
# Input is the merged CSV written by the merge cell above; the path is relative
# to the notebook's working directory.
# NOTE(review): `preprocessing.preprocess` appears to return the path of the
# file it writes (see the printed output) — confirm in src/preprocessing.
processed_file = preprocessing.preprocess('../data/raw/merged_cleaned.csv')

print(f"Processed file saved at: {processed_file}")

Processed file saved at: ../data/processed/processed_merged_cleaned.csv


In [23]:
# Load the preprocessed messages that the CoNLL file will be built from.
import pandas as pd
import ast
# NOTE: this binds `re` to the standard-library module; a later cell rebinds
# `re` to the third-party `regex` package (`import regex as re`), so the
# meaning of `re` depends on which of the two cells ran last.
import re

# Path is relative to the notebook's working directory.
processed_file = '../data/processed/processed_merged_cleaned.csv'
df = pd.read_csv(processed_file)

print(f"Loaded {len(df)} messages.")


Loaded 20779 messages.


In [32]:
import pandas as pd
import regex as re
from sacremoses import MosesTokenizer
# Module-level Moses tokenizer (lang='am'), used by process_and_save_conll below.
tokenizer = MosesTokenizer(lang='am')
def is_amharic_or_number(token):
    """Return True if `token` is made up only of Ethiopic-script characters,
    digits, and the Ethiopic punctuation marks ፡ and ።.

    An empty string returns False (at least one character is required).
    """
    # `\p{...}` Unicode properties are a feature of the third-party `regex`
    # package, not of the standard-library `re`. Import it under its own name
    # so this function keeps working even if another cell has rebound the
    # global `re` to the stdlib module.
    import regex

    # fullmatch instead of '^...$' with match: '$' also matches just before a
    # trailing newline, which would let a token like 'ቃል\n' through and break
    # the one-token-per-line CoNLL output.
    return bool(regex.fullmatch(r'[\p{IsEthiopic}\d፡።]+', token))


def auto_label(tokens):
    """Assign heuristic BIO tags (PRODUCT / PRICE / LOC / O) to a token list.

    Tokens rejected by ``is_amharic_or_number`` are dropped, so the result can
    be shorter than ``tokens``.

    Rules, applied in this order to every kept token:
      * PRODUCT: the first kept token is ``B-PRODUCT`` unless it is all
        digits (then it is ``O`` and no product span is opened). The token
        directly after a ``B-PRODUCT`` is always ``I-PRODUCT``.
      * PRICE: ``ዋጋ`` opens a price span (``B-PRICE``). Following tokens are
        ``I-PRICE`` until one of ``ብር`` / ``ብ`` / ``ር`` is seen or three
        tokens have been consumed, whichever comes first.
      * LOC: ``አድራሻ`` / ``አድራሻችን`` opens a location span (``B-LOC``).
        Following tokens are ``I-LOC`` until ``ፎቅ`` or ``ህንፃ`` is seen
        (inclusive), or the message ends.
      * Everything else is ``O``.

    Args:
        tokens: iterable of token strings.

    Returns:
        list of ``(token, tag)`` tuples for the kept tokens, in input order.
    """
    labeled = []
    amharic_token_count = 0
    price_mode = False
    price_count = 0
    loc_mode = False

    for tok in tokens:
        if not is_amharic_or_number(tok):
            continue

        # PRODUCT: First 2 Amharic tokens if first is not number
        if len(labeled) == 0:
            if re.fullmatch(r'\d+', tok):
                labeled.append((tok, 'O'))
            else:
                labeled.append((tok, 'B-PRODUCT'))
                amharic_token_count = 1
            continue
        elif amharic_token_count == 1:
            labeled.append((tok, 'I-PRODUCT'))
            amharic_token_count += 1
            continue

        # PRICE: Trigger on 'ዋጋ'
        if tok == 'ዋጋ':
            labeled.append((tok, 'B-PRICE'))
            price_mode = True
            price_count = 0
            # A new entity closes any open location span. Without this, the
            # tokens after the price span would be tagged I-LOC right after
            # I-PRICE, which is not a valid BIO sequence.
            loc_mode = False
            continue
        elif price_mode:
            price_count += 1
            labeled.append((tok, 'I-PRICE'))

            # Close the span on a currency token, or after 3 tokens at most
            if tok in {'ብር', 'ብ', 'ር'} or price_count >= 3:
                price_mode = False
            continue

        # LOC: Trigger on 'አድራሻ' or 'አድራሻችን'
        # (price_mode is always False here, so there is nothing to close)
        if tok in {'አድራሻ', 'አድራሻችን'}:
            labeled.append((tok, 'B-LOC'))
            loc_mode = True
            continue
        elif loc_mode:
            labeled.append((tok, 'I-LOC'))
            # The floor/building word is the last token of the span
            if tok in {'ፎቅ', 'ህንፃ'}:
                loc_mode = False
            continue

        # DEFAULT
        labeled.append((tok, 'O'))

    return labeled




def process_and_save_conll(input_csv, output_file, sample_size=None):
    """Auto-label messages from a CSV and write them out in CoNLL format.

    Each message in the ``clean_message`` column is tokenized with the
    module-level ``tokenizer``, tagged by ``auto_label`` and written as one
    ``token tag`` pair per line, with a blank line after each message.

    Args:
        input_csv: path of a CSV file that has a ``clean_message`` column.
        output_file: path of the CoNLL file to write; missing parent
            directories are created.
        sample_size: if truthy, label a random sample of at most this many
            rows (fixed seed 42) instead of the whole file.

    Raises:
        KeyError: if the CSV has no ``clean_message`` column.
    """
    import os

    df = pd.read_csv(input_csv)

    if sample_size:
        # DataFrame.sample raises ValueError when n exceeds the number of
        # rows, so cap the request at what is available.
        df = df.sample(n=min(sample_size, len(df)), random_state=42)

    # open(..., 'w') does not create directories.
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        # Missing messages carry no tokens; skip them up front.
        for message in df['clean_message'].dropna():
            tokens = tokenizer.tokenize(str(message))
            labeled_tokens = auto_label(tokens)
            if not labeled_tokens:
                # Nothing survived the script filter: do not emit an empty
                # sentence (it would show up as a stray extra blank line).
                continue
            for tok, tag in labeled_tokens:
                f.write(f"{tok} {tag}\n")
            f.write("\n")  # blank line between sentences

    print(f" CoNLL file saved to: {output_file}")


# Build an auto-labeled CoNLL file from a 100-message sample of the processed
# data. Inside a notebook `__name__` is "__main__", so this runs whenever the
# cell is executed. Paths are relative to the notebook's working directory.
if __name__ == "__main__":
    process_and_save_conll(
        input_csv="../data/processed/processed_merged_cleaned.csv",
        output_file="../data/labeled/ner_auto_labels.conll",
        sample_size=100  
    )


 CoNLL file saved to: ../data/labeled/ner_auto_labels.conll


In [33]:
# Preview labeling on first row
# Depends on `df` and `ast` from the earlier load cell.
# NOTE(review): the 'tokens' column presumably stores each token list as its
# Python literal string, hence ast.literal_eval — confirm against the
# preprocessing output.
tokens = ast.literal_eval(df.iloc[0]['tokens'])
labeled = auto_label(tokens)

# One "token<TAB>label" line per kept token
for tok, label in labeled:
    print(f"{tok}\t{label}")


ባለሁለት	B-PRODUCT
ምድጃ	I-PRODUCT
ስቶቭ	O
2000	O
ዋት	O
ፊውዝ	O
የተገጠመለት	O
ትልቅ	O
ድስት	O
መሸከም	O
የሚችል	O
አስተማማኝ	O
ቴርሞስታት	O
ባለ	O
ፊውዝ	O
ዋጋ	B-PRICE
፦	I-PRICE
ትልቁ	I-PRICE
2900ብር	I-PRICE
አድራሻ	B-LOC
መገናኛ	I-LOC
ስሪ	I-LOC
ኤም	I-LOC
ሲቲ	I-LOC
ሞል	I-LOC
ሁለተኛ	I-LOC
ፎቅ	I-LOC
ቢሮ	O
ቁ	O
ሊፍቱ	O
ፊት	O
ለ	O
ፊት	O
ለቡ	O
መዳህኒዓለም	O
ቤተክርስቲያን	O
ፊት	O
ለፊት	O
ዛም	O
ሞል	O
2ኛ	O
ፎቅ	O
ቢሮ	O
ለቡ	O
ቅርንጫፍ0973611819	O
0909522840	O
0923350054	O
ለማዘዝ	O
ይጠቀሙ	O
ለተጨማሪ	O
ማብራሪያ	O
የቴሌግራም	O
ገፃችን	O
