In [None]:
import pandas as pd
import re
import numpy as np

class PhoneFraudParser:
    """Clean scraped phone listings and derive a price-vs-market ratio.

    The raw CSV is loaded into ``self.df`` by the constructor.
    ``run_pipeline`` adds derived columns to that frame and returns a
    trimmed, renamed view of the result.
    """

    # Capacities (in GB) recognised in listing text. The ``(?<!\d)`` guard
    # stops the tail of a longer number from matching (e.g. "2128gb" must not
    # be read as 128 GB).
    _STORAGE_PATTERN = re.compile(
        r'(?<!\d)(16|32|64|128|256|512|1024)\s*(?:gb|giga|g)\b'
    )

    # Raw columns that ``run_pipeline`` reads, directly or through the rename.
    _REQUIRED_COLUMNS = (
        'post_id', 'title', 'description', 'price',
        'model_raw', 'condition', 'num_photos', 'publisher',
    )

    def __init__(self, raw_data_path):
        """Load the raw listings CSV into ``self.df``.

        Args:
            raw_data_path: Path (or any other input accepted by
                ``pandas.read_csv``) of the scraped data.
        """
        # NOTE(review): read_csv infers dtypes, so a bare cell such as "1.500"
        # is parsed as the float 1.5 before clean_price_value ever sees it.
        # If the source uses '.' as a thousands separator, consider reading
        # the price column as text - confirm against the scraper output.
        self.df = pd.read_csv(raw_data_path)

    def extract_storage(self, text):
        """Return the first storage capacity (GB) mentioned in ``text``.

        Args:
            text: Listing title or description. Non-string values (NaN, None,
                numbers) are treated as "no information".

        Returns:
            int capacity in GB (one of 16, 32, 64, 128, 256, 512, 1024), or
            ``None`` when no capacity followed by gb/giga/g is found.
        """
        if not isinstance(text, str):
            return None

        # NOTE(review): the first match wins, so text like "16gb ram 256gb"
        # yields 16 - verify whether RAM figures appear in the listings.
        match = self._STORAGE_PATTERN.search(text.lower())
        return int(match.group(1)) if match else None

    def clean_price_value(self, price_str):
        """Convert a raw price cell to a float.

        Args:
            price_str: Raw price; may be text (e.g. ``"Rp 1.500.000"``), an
                already-numeric value, or missing.

        Returns:
            The price as ``float``, or ``None`` when the cell is missing,
            non-finite, or contains no digits.
        """
        if pd.isna(price_str):
            return None

        # Values that are already numeric must not go through the text path:
        # str(1500.0) is "1500.0", and stripping the '.' would turn it into
        # 15000.0. bool is excluded so True/False keep yielding None.
        is_number = isinstance(price_str, (int, float, np.integer, np.floating))
        if is_number and not isinstance(price_str, bool):
            value = float(price_str)
            # abs() mirrors the text path below, which discards sign characters.
            return abs(value) if np.isfinite(value) else None

        # Text path: keep digits only. This treats '.' (and every other
        # symbol) as formatting, i.e. it assumes prices are whole numbers.
        digits = re.sub(r'[^\d]', '', str(price_str))
        if not digits:
            return None
        return float(digits)

    def run_pipeline(self):
        """Derive cleaned price, memory size and price ratio for every listing.

        Side effects:
            Adds/overwrites ``clean_price``, ``memory_size``,
            ``market_median`` and ``price_ratio`` on ``self.df``.

        Returns:
            DataFrame with columns ``post_id``, ``phone_model``,
            ``memory_size``, ``condition``, ``number_of_photos``,
            ``publisher_name``, ``price_ratio``, ``clean_price``.

        Raises:
            KeyError: if ``self.df`` lacks any of the required raw columns.
        """
        # Fail before mutating self.df, with a message naming what is missing.
        missing = [c for c in self._REQUIRED_COLUMNS if c not in self.df.columns]
        if missing:
            raise KeyError(f"raw data is missing required column(s): {missing}")

        # to_numeric guarantees a float column even when every price is None
        # (apply would otherwise leave an object column that median rejects).
        self.df['clean_price'] = pd.to_numeric(
            self.df['price'].apply(self.clean_price_value), errors='coerce'
        )

        # Storage size: prefer the title, fall back to the description only
        # for rows where the title gave nothing, and use 0 for "unknown".
        memory = pd.to_numeric(
            self.df['title'].apply(self.extract_storage), errors='coerce'
        )
        mask_missing_memory = memory.isna()
        if mask_missing_memory.any():
            from_description = pd.to_numeric(
                self.df.loc[mask_missing_memory, 'description'].apply(self.extract_storage),
                errors='coerce',
            )
            memory = memory.fillna(from_description)
        self.df['memory_size'] = memory.fillna(0)

        group_cols = ['model_raw', 'memory_size', 'condition']

        # Median price of comparable listings. Rows whose model or condition
        # is NaN are left out of the groups by groupby's default and therefore
        # get a NaN median.
        self.df['market_median'] = self.df.groupby(group_cols)['clean_price'].transform('median')

        # A zero median makes the division yield +/-inf (or NaN for 0/0);
        # treat every undefined ratio like a missing one and fall back to the
        # neutral value 1.0.
        ratio = self.df['clean_price'] / self.df['market_median']
        self.df['price_ratio'] = ratio.replace([np.inf, -np.inf], np.nan).fillna(1.0)

        final_df = self.df.rename(columns={
            'model_raw': 'phone_model',
            'num_photos': 'number_of_photos',
            'publisher': 'publisher_name'
        })

        final_columns = [
            'post_id',
            'phone_model',
            'memory_size',
            'condition',
            'number_of_photos',
            'publisher_name',
            'price_ratio',
            'clean_price'
        ]

        return final_df[final_columns]

In [None]:
# Path of the scraped listings CSV, relative to the notebook's working directory.
file_path = 'raw_scraped_data.csv'

# Load the raw data and run the cleaning / feature pipeline.
parser = PhoneFraudParser(file_path)
clean_data = parser.run_pipeline()

# Preview the first ten processed rows (display is supplied by IPython/Jupyter).
display(clean_data.head(10))