In [None]:
import pandas as pd
import ast
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Read the raw scrape CSV from the working directory into a module-level DataFrame.
df = pd.read_csv("amazon_scrape_data.csv")

def clean_delivery(col):
    try:
        return str(col)[2:15]
    except:
        return "Info not available"

def change_datatype(df):
    """
    Cast the price, rating and review columns to float and parse the
    scrape date column as datetimes.

    The frame is modified in place and the same object is returned.
    """
    for numeric_col in ('extracted_price', 'rating', 'reviews'):
        as_float = df[numeric_col].astype(float)
        df[numeric_col] = as_float

    parsed_dates = pd.to_datetime(df['scrape_date'])
    df['scrape_date'] = parsed_dates
    return df

def parse_specs(spec_str):
    """
    Extract the four tracked spec fields from one raw ``specs`` value.

    ``spec_str`` is normally the string form of a Python dict literal; an
    already-parsed dict is accepted as well. The result is always a
    ``pd.Series`` indexed by display_size, ram, disk_size and
    operating_system. Keys absent from the dict, and every key when the
    input cannot be parsed or is not a dict, are set to None.
    """
    DEFAULT_KEYS = ['display_size', 'ram', 'disk_size', 'operating_system']

    if isinstance(spec_str, dict):
        spec_dict = spec_str
    else:
        try:
            spec_dict = ast.literal_eval(spec_str)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Malformed text or a non-string value such as NaN: treat the
            # row as having no specs instead of aborting the whole apply().
            spec_dict = None

    # A valid literal that is not a dict (list, number, ...) has no fields.
    if not isinstance(spec_dict, dict):
        return pd.Series({key: None for key in DEFAULT_KEYS})

    return pd.Series({key: spec_dict.get(key) for key in DEFAULT_KEYS})

def clean_spec_values(df):
    """
    Replace missing-looking spec values with "Info not available".

    For display_size, ram, disk_size and operating_system, the textual
    tokens '-', 'None', 'none', '', 'nan' and 'NaN' as well as real
    missing values (None / NaN) become the placeholder. Other values are
    left unchanged. The frame is modified in place and returned.
    """
    placeholder = "Info not available"
    missing_tokens = ['-', 'None', 'none', '', 'nan', 'NaN']

    for col in ['display_size', 'ram', 'disk_size', 'operating_system']:
        # Replace with the placeholder directly: replace(list, None) is
        # version-dependent in pandas (older releases ignore the explicit
        # None and forward-fill from the previous row instead).
        df[col] = df[col].replace(missing_tokens, placeholder).fillna(placeholder)
    return df

def data_cleaning(df):
    """
    Run the full cleaning pipeline on a raw scrape frame.

    Steps: keep the ten relevant columns, expand ``specs`` into four spec
    columns, normalise missing spec values, cast numeric/date columns,
    shorten delivery text, mean-impute numeric columns, fill remaining
    missing text values with "Info not available" and drop duplicate
    ``asin`` rows (first occurrence wins).

    Side effect: the result is written to 'cleaned_Data.csv' in the
    working directory (with the DataFrame index as the first column).

    Returns the cleaned DataFrame. Raises KeyError if a required column
    is missing from ``df``.
    """
    # Work on a copy of the column subset so the caller's frame is never
    # modified by the assignments below.
    df = df[['rating', 'reviews', 'extracted_price', 'asin', 'title', 'link_clean', 'thumbnail', 'delivery', 'scrape_date', 'specs']].copy()

    # Step 1: Parse specs
    specs_df = df['specs'].apply(parse_specs)
    df = pd.concat([df.drop(columns=['specs']), specs_df], axis=1)

    # Step 2: Clean spec fields
    df = clean_spec_values(df)

    # Step 3: Convert to correct data types
    df = change_datatype(df)

    # Step 4: Clean delivery info
    # Shorten only real values and fill the placeholder afterwards; filling
    # first would let clean_delivery slice the placeholder to "fo not availa".
    df['delivery'] = df['delivery'].map(clean_delivery, na_action='ignore').fillna('Info not available')

    # Step 5: Impute only numerical columns
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    imputer = SimpleImputer(strategy='mean')
    df[numerical_cols] = imputer.fit_transform(df[numerical_cols])

    # Step 6: Fill missing categorical columns with 'Info not available'
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    df[categorical_cols] = df[categorical_cols].fillna("Info not available")

    # Step 7: Handle duplicates (keep the first row seen for each asin)
    df = df.drop_duplicates(subset=["asin"], keep="first")

    df.to_csv('cleaned_Data.csv')
    return df

