In [7]:
import pandas as pd
import numpy as np
import re

class PhoneFraudParser:
    """Turn free-text phone listing summaries into a scored, tabular dataset.

    The input CSV must have a ``Summary`` column whose cells are multi-line
    text blocks made of ``Label: value`` lines plus an ``Attributes:`` line
    followed by a ``|``-separated attribute line.  ``clean()`` extracts the
    structured fields, compares each price with the median of comparable
    listings and derives a heuristic ``fraud_flag`` (1 = suspicious).

    Typical use: ``PhoneFraudParser(path).load().clean().get_clean_df()``.
    """

    # Labels under which the seller's rating count may appear.  The second
    # entry is the UTF-8-read-as-cp1252 form of the first; both are accepted
    # so that correctly and incorrectly decoded data parse the same way.
    NUM_RATINGS_LABELS = ("Număr ratinguri", "NumÄƒr ratinguri")

    # Substring of the account creation date that marks a "new" account.
    NEW_ACCOUNT_YEAR = "2023"

    # Minimum accumulated score at which a listing is flagged.
    FRAUD_THRESHOLD = 2

    # Column order of the frame built from the parsed rows.
    _COLUMNS = (
        "phone_model",
        "memory_size",
        "condition",
        "number_of_photos",
        "publisher_rating",
        "publisher_num_ratings",
        "publisher_join_date",
        "clean_price",
    )

    def __init__(self, csv_path: str):
        """Remember the CSV location; nothing is read until ``load()``."""
        self.csv_path = csv_path
        self.df_raw = None
        self.df_clean = None

    @staticmethod
    def extract(block, field):
        """Return the text following ``"<field>:"`` on its line in *block*.

        *field* is matched literally (regex metacharacters are escaped).
        Returns ``""`` when the label is absent or *block* is not a string,
        e.g. a NaN produced by an empty CSV cell.
        """
        if not isinstance(block, str):
            return ""
        match = re.search(rf"{re.escape(field)}:(.*)", block)
        return match.group(1).strip() if match else ""

    @staticmethod
    def extract_price(price_str):
        """Convert a price string such as ``"1 600 lei"`` to a float.

        Whitespace, currency text and thousands separators are ignored.  A
        trailing ``,`` or ``.`` followed by one or two digits is treated as
        the decimal part (``"1 599,99 lei"`` -> ``1599.99``); a separator
        followed by three digits is a thousands separator
        (``"6.500 lei"`` -> ``6500.0``).  Returns ``0.0`` when no digits are
        present or the input is not a string.
        """
        if not isinstance(price_str, str) or not price_str:
            return 0.0

        compact = re.sub(r"\s+", "", price_str)

        # Only the last number in the string can carry the decimal part.
        decimal = re.search(r"(?<=\d)[.,](\d{1,2})\D*$", compact)
        if decimal:
            whole = re.sub(r"\D", "", compact[: decimal.start()])
            return float(f"{whole}.{decimal.group(1)}")

        digits = re.sub(r"\D", "", compact)
        return float(digits) if digits else 0.0

    @staticmethod
    def extract_memory(text):
        """Return the first ``<number> gb`` amount in *text* (case-insensitive).

        Returns ``0`` when nothing matches or *text* is not a string.
        """
        if not isinstance(text, str):
            return 0
        m = re.search(r"(\d+)\s*gb", text.lower())
        return int(m.group(1)) if m else 0

    @staticmethod
    def extract_rating(text):
        """Return the score from an ``"<x> / 5"`` rating string.

        Both ``4.7`` and ``4,7`` are accepted.  Returns ``0.0`` when *text*
        is not a string or holds no well-formed number before ``/ 5``.
        """
        if not isinstance(text, str):
            return 0.0
        # The look-behind keeps malformed input like "4.5.6 / 5" from being
        # read as a trailing fragment ("5.6").
        m = re.search(r"(?<![\d.,])(\d+(?:[.,]\d+)?)\s*/\s*5", text)
        return float(m.group(1).replace(",", ".")) if m else 0.0

    @staticmethod
    def extract_num_ratings(text):
        """Return the first integer found in *text*, or ``0`` if none."""
        if not isinstance(text, str):
            return 0
        m = re.search(r"(\d+)", text)
        return int(m.group(1)) if m else 0

    @staticmethod
    def parse_attributes(block):
        """Pull model and condition out of a ``|``-separated attribute line.

        Returns a ``(model, condition)`` tuple taken from the ``Model:`` and
        ``Stare:`` segments; an element is ``None`` when its segment is
        missing, and both are ``None`` when *block* is not a string.
        """
        model = None
        condition = None

        if not isinstance(block, str):
            return model, condition

        for part in (p.strip() for p in block.split("|")):
            if "Model:" in part:
                model = part.replace("Model:", "").strip()
            if "Stare:" in part:
                condition = part.replace("Stare:", "").strip()

        return model, condition

    def load(self):
        """Read the CSV at ``csv_path`` into ``df_raw`` and return ``self``."""
        self.df_raw = pd.read_csv(self.csv_path)
        return self

    def _parse_block(self, block):
        """Extract one output row (a dict keyed by ``_COLUMNS``) from a summary."""
        title = self.extract(block, "Title")
        description = self.extract(block, "Description")
        photos_raw = self.extract(block, "Photos")

        # First label variant that yields a value wins.
        num_ratings_raw = next(
            (
                value
                for value in (
                    self.extract(block, label)
                    for label in self.NUM_RATINGS_LABELS
                )
                if value
            ),
            "",
        )

        # The attribute line sits on the line after the "Attributes:" label.
        attr_match = (
            re.search(r"Attributes:\s*\n(.*)", block)
            if isinstance(block, str)
            else None
        )
        attributes_raw = attr_match.group(1).strip() if attr_match else ""
        model, condition = self.parse_attributes(attributes_raw)

        # Prefer the capacity from the title; fall back to the description.
        memory = self.extract_memory(title) or self.extract_memory(description)

        return {
            "phone_model": model,
            "memory_size": memory,
            "condition": condition,
            # isdecimal() accepts exactly the characters int() can parse.
            "number_of_photos": int(photos_raw) if photos_raw.isdecimal() else 0,
            "publisher_rating": self.extract_rating(self.extract(block, "Rating")),
            "publisher_num_ratings": self.extract_num_ratings(num_ratings_raw),
            "publisher_join_date": self.extract(block, "Cont OLX creat"),
            "clean_price": self.extract_price(self.extract(block, "Price")),
        }

    @staticmethod
    def _price_ratio(df):
        """Return each price divided by the median price of its peer group.

        Peers share ``phone_model``, ``memory_size`` and ``condition``.
        Prices of 0 (unknown) are ignored when computing the median.  When no
        median is available (no priced peer, or a missing group key) the
        divisor is 1, so the ratio equals the raw price.
        """
        known_price = df["clean_price"].replace(0, np.nan)
        median_price = (
            known_price.groupby(
                [df["phone_model"], df["memory_size"], df["condition"]]
            )
            .transform("median")
            # ``nan or 0`` evaluates to nan (NaN is truthy), so the fallback
            # has to be applied explicitly.
            .fillna(0)
        )
        return df["clean_price"] / median_price.replace(0, 1)

    def _fraud_score(self, df):
        """Sum the heuristic penalty points for every row of *df*."""
        ratio = df["price_ratio"]
        photos = df["number_of_photos"]

        # Far below the peer median is the strongest price signal.
        score_price = (
            (ratio < 0.50) * 3
            + ratio.between(0.50, 0.70, inclusive="left") * 2
            + ratio.between(0.70, 0.90, inclusive="left") * 1
        )
        score_seller = (
            (df["publisher_num_ratings"] == 0) * 3
            + (df["publisher_rating"] < 4.0) * 1
        )
        score_photos = (photos == 0) * 3 + (photos == 1) * 2 + (photos == 2) * 1
        score_new_account = (
            df["publisher_join_date"]
            .astype(str)
            .str.contains(self.NEW_ACCOUNT_YEAR, regex=False)
            * 1
        )
        # Condition text containing "nou" combined with a clearly low price.
        score_condition = (
            df["condition"].fillna("").str.lower().str.contains("nou")
            & (ratio < 0.80)
        ) * 1
        score_memory = (df["memory_size"] == 0) * 1 + (df["memory_size"] > 512) * 1
        score_missing_model = df["phone_model"].isna() * 1
        score_low_price = (df["clean_price"] < 300) * 1

        return (
            score_price
            + score_seller
            + score_photos
            + score_new_account
            + score_condition
            + score_memory
            + score_missing_model
            + score_low_price
        )

    def clean(self):
        """Parse ``df_raw["Summary"]`` into ``df_clean`` and return ``self``.

        Loads the CSV first if ``load()`` has not been called.  The result
        has the columns in ``_COLUMNS`` followed by ``price_ratio`` and
        ``fraud_flag`` (1 when the score reaches ``FRAUD_THRESHOLD``).
        """
        if self.df_raw is None:
            self.load()

        rows = [self._parse_block(block) for block in self.df_raw["Summary"]]
        df = pd.DataFrame(rows, columns=list(self._COLUMNS))

        if df.empty:
            # Nothing to group or score; still expose the full column set.
            df["price_ratio"] = pd.Series(dtype=float)
            df["fraud_flag"] = pd.Series(dtype=int)
            self.df_clean = df
            return self

        df["price_ratio"] = self._price_ratio(df)
        df["fraud_flag"] = (
            self._fraud_score(df) >= self.FRAUD_THRESHOLD
        ).astype(int)

        self.df_clean = df
        return self

    def get_clean_df(self):
        """Return the frame produced by ``clean()`` (``None`` before that)."""
        return self.df_clean


In [8]:
# Parse the scraped OLX listings, score them, and persist the cleaned table.
parser = PhoneFraudParser(
    "/home/haidau_rares/projects/fraud_detection_iphones/olx_ads_data.csv"
)
parser.load()
parser.clean()

df_clean = parser.get_clean_df()
df_clean.to_csv("clean_data.csv", index=False)

# Last expression of the cell: shows the first rows as the cell output.
df_clean.head()


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Unnamed: 0,phone_model,memory_size,condition,number_of_photos,publisher_rating,publisher_num_ratings,publisher_join_date,clean_price,price_ratio,fraud_flag
0,iPhone 1st gen,256,Utilizat,3,4.7,29,octombrie 2023,6500.0,1.420765,0
1,iPhone 1st gen,512,Utilizat,2,5.0,55,iulie 2020,7300.0,1.0,0
2,iPhone 13,128,Utilizat,5,4.7,29,octombrie 2023,1600.0,1.0,0
3,iPhone 12 Mini,128,Utilizat,5,4.7,29,octombrie 2023,700.0,0.823529,1
4,iPhone 13 Pro,256,Utilizat,5,4.7,29,octombrie 2023,2100.0,1.02439,0
