In [1]:
import pandas as pd

In [2]:
df=pd.read_csv('amazon_laptop_Data2025-08-01.csv')

In [18]:
import pandas as pd
import ast
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

def clean_delivery(col):
    return col[2:15]

def change_datatype(df):
    df['extracted_price'] = df['extracted_price'].astype(float)
    df['rating'] = df['rating'].astype(float)
    df['reviews'] = df['reviews'].astype(float)
    df['scrape_date'] = pd.to_datetime(df['scrape_date'])
    return df

def parse_specs(spec_str):
    DEFAULT_KEYS = ['display_size', 'ram', 'disk_size', 'operating_system']
    try:
        spec_dict = ast.literal_eval(spec_str)
        return pd.Series({key: spec_dict.get(key) for key in DEFAULT_KEYS})
    except:
        return pd.Series({key: None for key in DEFAULT_KEYS})

def clean_spec_values(df):
    """
    Clean and unify inconsistent spec values (like None, '-', '16', etc.)
    """
    for col in ['display_size', 'ram', 'disk_size', 'operating_system']:
        df[col] = df[col].replace(['-', 'None', 'none', '', 'nan', 'NaN'], None)
        df[col] = df[col].fillna("Info not available")
    return df

def data_cleaning(df):
    df = df[['rating', 'reviews', 'extracted_price', 'asin', 'title', 'link_clean', 'thumbnail', 'delivery', 'scrape_date', 'specs']]

    # Step 1: Parse specs
    specs_df = df['specs'].apply(parse_specs)
    df = pd.concat([df.drop(columns=['specs']), specs_df], axis=1)

    # Step 2: Clean spec fields
    df = clean_spec_values(df)

    # Step 3: Convert to correct data types early
    df = change_datatype(df)

    # Step 4: Clean delivery column
    # df['delivery'] = df['delivery'].apply(clean_delivery)

    # Step 5: Impute only numerical columns
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

    transformer = ColumnTransformer(
        transformers=[
            ('num_imputer', SimpleImputer(strategy='mean'), numerical_cols)
        ],
        remainder='passthrough'
    )

    transformed_data = transformer.fit_transform(df)

    # Fix column names properly
    passthrough_cols = df.columns.difference(numerical_cols).tolist()
    new_cols = numerical_cols + passthrough_cols
    df_imputed = pd.DataFrame(transformed_data, columns=new_cols, index=df.index)

    # Step 6: Final null check
    print("Nulls after cleaning:")
    print(df_imputed.isnull().sum())
    print("-" * 100)

    df_imputed.to_csv('cleaned_data.csv', index=False)
    return df_imputed


In [16]:
x=data_cleaning(df)

Nulls after cleaning:
rating              0
reviews             0
extracted_price     0
asin                0
delivery            0
disk_size           0
display_size        0
link_clean          8
operating_system    0
ram                 0
scrape_date         0
thumbnail           0
title               0
dtype: int64
----------------------------------------------------------------------------------------------------


In [12]:
x['ram'].value_counts()

ram
-           147
16 GB        84
32 GB        64
4 GB         25
8 GB         21
20 GB        17
40 GB        16
None         14
4             9
36 GB         8
64 GB         5
24 GB         5
16            4
16.00 GB      3
12 GB         3
0 GB          2
0 TB          1
24            1
12            1
32.00 GB      1
16.0 GB       1
Name: count, dtype: int64

In [17]:
pd.read_csv('cleaned_data.csv').isnull().sum()

rating              0
reviews             0
extracted_price     0
asin                0
delivery            0
disk_size           0
display_size        0
link_clean          8
operating_system    0
ram                 0
scrape_date         0
thumbnail           0
title               0
dtype: int64