In [93]:
!pip install pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [94]:
# Import necessary libraries
import pandas as pd
import os

# Read the CSV file
# Read all motorcycle CSV files
csv_files = [f for f in os.listdir('../data/formatted/') if f.startswith('motorcycle') and f.endswith('.csv')]
dfs = []
for file in csv_files:
    df_temp = pd.read_csv(f'../data/formatted/{file}')
    dfs.append(df_temp)
    
# Combine all dataframes
df = pd.concat(dfs, ignore_index=True)

# Display first few rows to verify data loading
print("Dataset shape:", df.shape)
df.head()


Dataset shape: (3265, 22)


Unnamed: 0,Reference Price (Yuan),Brand,Model,Production Method,Origin,Engine,Max Horsepower (Ps),Max Power/Speed (kW/rpm),Max Torque/Speed (N·m/rpm),Transmission,...,Dimensions (LxWxH mm),Seat Height (mm),Curb Weight (kg),Fuel Tank Capacity (L),Max Speed (km/h),Official Average Fuel Consumption (L/100km),Measured Average Fuel Consumption (L/100km),Range (km),Warranty,Available Colors
0,,,,,,,,,,,...,,,,,,,,,,
1,Currently no quotation available,titanium,pedal,,,single cylinder four stroke water-cooled 150cc,15.8,11.6/8500,,,...,1970x770x1150,765.0,136.0,8.0,105.0,-,,-,,
2,Currently no quotation available,titanium,pedal,,,single cylinder four stroke water-cooled 150cc,15.8,11.6/8500,,CVT continuously variable transmission automat...,...,1920x740x1120,785.0,130.0,8.0,100.0,-,,-,,Titanium Extreme Silver
3,Currently no quotation available,titanium,pedal,,,single cylinder four stroke water-cooled 300cc,25.2,18.5/8250,,CVT continuously variable transmission automat...,...,1900x790x1200,775.0,150.0,11.0,118.0,-,,-,,Dark Night Black
4,Currently no quotation available,titanium,", street car,",,,inline dual cylinder water-cooled 250cc,32.6,24/9500,,-,...,2100x805x1060,710.0,168.0,13.0,120.0,-,,-,,Titanium Extreme Silver


In [95]:
# Clean string data by removing leading/trailing whitespace and special characters
def clean_string(value):
    if pd.isna(value) or not isinstance(value, str):
        return value
    # Remove leading/trailing whitespace
    value = value.strip()
    # Remove leading/trailing commas, dots, and other common special characters
    value = value.strip('.,;:\"\'')
    # Replace multiple spaces with single space
    value = ' '.join(value.split())
    return value

# Apply cleaning to all string (object) columns
for column in df.select_dtypes(include=['object']).columns:
    df[column] = df[column].apply(clean_string)
    
    # Print a sample of changes made (first 5 changes)
    mask = df[column] != df[column].apply(lambda x: x if pd.isna(x) else x.strip())
    if mask.any():
        print(f"\nCleaning results for {column}:")
        changes = pd.DataFrame({
            'Before': df[column][mask],
            'After': df[column].apply(lambda x: x if pd.isna(x) else x.strip())[mask]
        })
        print(changes.head())

# Remove any rows that are completely empty (all values are NaN)
initial_rows = len(df)
df = df.dropna(how='all')
rows_removed = initial_rows - len(df)
if rows_removed > 0:
    print(f"\nRemoved {rows_removed} completely empty rows")

print("\nCleaning complete!")



Cleaning results for Reference Price (Yuan):
    Before After
0      NaN   NaN
38     NaN   NaN
120    NaN   NaN
121    NaN   NaN
123    NaN   NaN

Cleaning results for Brand:
    Before After
0      NaN   NaN
123    NaN   NaN
124    NaN   NaN
125    NaN   NaN
127    NaN   NaN

Cleaning results for Model:
    Before After
0      NaN   NaN
60     NaN   NaN
106    NaN   NaN
107    NaN   NaN
108    NaN   NaN

Cleaning results for Production Method:
  Before After
0    NaN   NaN
1    NaN   NaN
2    NaN   NaN
3    NaN   NaN
4    NaN   NaN

Cleaning results for Origin:
  Before After
0    NaN   NaN
1    NaN   NaN
2    NaN   NaN
3    NaN   NaN
4    NaN   NaN

Cleaning results for Engine:
   Before After
0     NaN   NaN
23    NaN   NaN
44    NaN   NaN
77    NaN   NaN
78    NaN   NaN

Cleaning results for Max Power/Speed (kW/rpm):
   Before After
0     NaN   NaN
23    NaN   NaN
44    NaN   NaN
63    NaN   NaN
77    NaN   NaN

Cleaning results for Transmission:
   Before After
0     NaN   NaN
1

In [96]:
# Get total number of rows in the dataset
print(f"\nTotal number of rows: {len(df)}")

# Calculate and display number of NaN values for each column
nan_counts = df.isna().sum()
print("\nNumber of NaN values per column:")
print(nan_counts)

# Calculate percentage of NaN values
nan_percentages = (df.isna().sum() / len(df)) * 100
print("\nPercentage of NaN values per column:")
print(nan_percentages.round(2))



Total number of rows: 3265

Number of NaN values per column:
Reference Price (Yuan)                          539
Brand                                            42
Model                                           218
Production Method                              2246
Origin                                         2215
Engine                                          494
Max Horsepower (Ps)                            1010
Max Power/Speed (kW/rpm)                        999
Max Torque/Speed (N·m/rpm)                     3265
Transmission                                    472
ABS                                             704
CBS                                               0
Dimensions (LxWxH mm)                           690
Seat Height (mm)                               1434
Curb Weight (kg)                               1111
Fuel Tank Capacity (L)                          459
Max Speed (km/h)                               1343
Official Average Fuel Consumption (L/100km)     459
Me

In [97]:
# Drop columns with more than 90% NaN values
threshold = 0.9  # 90% threshold
columns_to_drop = nan_percentages[nan_percentages > 90].index
df = df.drop(columns=columns_to_drop)

# Remove rows where price is NaN
price_col = 'Reference Price (Yuan)'
initial_rows = len(df)
df = df.dropna(subset=[price_col])
rows_removed = initial_rows - len(df)
print(f"\nRemoved {rows_removed} rows with missing price values")

# Replace brand values less than 3 characters with 'Not Specified'
df.loc[df['Brand'].fillna('').str.len() < 3, 'Brand'] = 'Not Specified'

# Capitalize model values
df['Model'] = df['Model'].str.capitalize()


print("\nColumns dropped due to >90% missing values:")
print(columns_to_drop.tolist())

print("\nRemaining columns:")
print(df.columns.tolist())



Removed 539 rows with missing price values

Columns dropped due to >90% missing values:
['Max Torque/Speed (N·m/rpm)', 'Measured Average Fuel Consumption (L/100km)', 'Warranty']

Remaining columns:
['Reference Price (Yuan)', 'Brand', 'Model', 'Production Method', 'Origin', 'Engine', 'Max Horsepower (Ps)', 'Max Power/Speed (kW/rpm)', 'Transmission', 'ABS', 'CBS', 'Dimensions (LxWxH mm)', 'Seat Height (mm)', 'Curb Weight (kg)', 'Fuel Tank Capacity (L)', 'Max Speed (km/h)', 'Official Average Fuel Consumption (L/100km)', 'Range (km)', 'Available Colors']


In [98]:
# Get detailed info about remaining columns
print("Detailed analysis of remaining features:")
for column in full_df.columns:
    nan_pct = (full_df[column].isna().sum() / len(full_df)) * 100
    unique_vals = full_df[column].nunique()
    dtype = full_df[column].dtype
    
    print(f"\n{column}:")
    print(f"Missing values: {nan_pct:.2f}%")
    print(f"Data type: {dtype}")
    print(f"Number of unique values: {unique_vals}")
    if dtype in ['object', 'category']:
        print("Top 5 most common values:")
        print(full_df[column].value_counts().head())
    elif dtype in ['int64', 'float64']:
        print(f"Mean: {full_df[column].mean():.2f}")
        print(f"Median: {full_df[column].median():.2f}")

Detailed analysis of remaining features:

Reference Price (Yuan):
Missing values: 16.51%
Data type: object
Number of unique values: 681
Top 5 most common values:
Reference Price (Yuan)
Currently no quotation available    791
7980                                 46
6980                                 41
8980                                 28
23800                                23
Name: count, dtype: int64

Brand:
Missing values: 1.29%
Data type: object
Number of unique values: 281
Top 5 most common values:
Brand
Haojue       108
Zongshen      96
Guangyang     95
BIMOTA        85
Benelli       79
Name: count, dtype: int64

Model:
Missing values: 6.68%
Data type: object
Number of unique values: 15
Top 5 most common values:
Model
pedal                    1234
street car                521
Cruising Crown Prince     239
Off road                  190
Retro                     171
Name: count, dtype: int64

Production Method:
Missing values: 68.79%
Data type: object
Number of unique values:

In [99]:
# Create copy of dataframe for imputation
df_imputed = df.copy()

# 1. Identify price-related columns and tech spec columns
price_columns = [col for col in df_imputed.select_dtypes(include=['int64', 'float64']).columns 
                 if 'price' in col.lower()]
tech_spec_columns = [col for col in df_imputed.select_dtypes(include=['int64', 'float64']).columns 
                    if col not in price_columns]

# 2. Impute prices with median and create flag columns
for col in price_columns:
    # Create flag column before imputation
    df_imputed[f'{col}_is_imputed'] = df_imputed[col].isna()
    # Perform imputation
    df_imputed[col] = df_imputed[col].fillna(df_imputed[col].median())

# 3. Impute technical specifications with median and create flag columns
for col in tech_spec_columns:
    # Create flag column before imputation
    df_imputed[f'{col}_is_imputed'] = df_imputed[col].isna()
    # Perform imputation
    df_imputed[col] = df_imputed[col].fillna(df_imputed[col].median())

# 4. Impute categorical columns with 'Not Specified'
categorical_columns = df_imputed.select_dtypes(include=['object']).columns
for col in categorical_columns:
    df_imputed[col] = df_imputed[col].fillna('Not Specified')

# Verify results
print("\nRemaining missing values after imputation:")
print(df_imputed.isna().sum())

# Show the new flag columns
flag_columns = [col for col in df_imputed.columns if col.endswith('_is_imputed')]
print("\nNew imputation flag columns:")
print(flag_columns)

# Show sample of rows where values were imputed
for col in price_columns + tech_spec_columns:
    imputed_rows = df_imputed[df_imputed[f'{col}_is_imputed']]
    if len(imputed_rows) > 0:
        print(f"\nSample of imputed values for {col}:")
        print(imputed_rows[[col, f'{col}_is_imputed']].head())


Remaining missing values after imputation:
Reference Price (Yuan)                         0
Brand                                          0
Model                                          0
Production Method                              0
Origin                                         0
Engine                                         0
Max Horsepower (Ps)                            0
Max Power/Speed (kW/rpm)                       0
Transmission                                   0
ABS                                            0
CBS                                            0
Dimensions (LxWxH mm)                          0
Seat Height (mm)                               0
Curb Weight (kg)                               0
Fuel Tank Capacity (L)                         0
Max Speed (km/h)                               0
Official Average Fuel Consumption (L/100km)    0
Range (km)                                     0
Available Colors                               0
Max Horsepower (Ps)_is_im

In [100]:
df_imputed.head()

Unnamed: 0,Reference Price (Yuan),Brand,Model,Production Method,Origin,Engine,Max Horsepower (Ps),Max Power/Speed (kW/rpm),Transmission,ABS,...,Fuel Tank Capacity (L),Max Speed (km/h),Official Average Fuel Consumption (L/100km),Range (km),Available Colors,Max Horsepower (Ps)_is_imputed,Seat Height (mm)_is_imputed,Curb Weight (kg)_is_imputed,Fuel Tank Capacity (L)_is_imputed,Max Speed (km/h)_is_imputed
1,Currently no quotation available,titanium,Pedal,Not Specified,Not Specified,single cylinder four stroke water-cooled 150cc,15.8,11.6/8500,Not Specified,comes,...,8.0,105.0,-,-,Not Specified,False,False,False,False,False
2,Currently no quotation available,titanium,Pedal,Not Specified,Not Specified,single cylinder four stroke water-cooled 150cc,15.8,11.6/8500,CVT continuously variable transmission automat...,standard,...,8.0,100.0,-,-,Titanium Extreme Silver,False,False,False,False,False
3,Currently no quotation available,titanium,Pedal,Not Specified,Not Specified,single cylinder four stroke water-cooled 300cc,25.2,18.5/8250,CVT continuously variable transmission automat...,standard,...,11.0,118.0,-,-,Dark Night Black,False,False,False,False,False
4,Currently no quotation available,titanium,Street car,Not Specified,Not Specified,inline dual cylinder water-cooled 250cc,32.6,24/9500,-,standard,...,13.0,120.0,-,-,Titanium Extreme Silver,False,False,False,False,False
5,29900,Chaite,Sports car,Not Specified,Jiangmen Changhua Group Co,inline dual cylinder four stroke water-cooled ...,53.8,39.6/8500,international 6-speed,standard,...,15.0,195.0,3.6,400,"Ice White, Track Blue",False,False,False,False,False


In [101]:
# Save the preprocessed dataframe to checkpoints folder
df_imputed.to_csv('../data/checkpoints/imputed_full_motorcycle_data.csv', index=False)
print("Preprocessed data saved to checkpoints/preprocessed_data.csv")


Preprocessed data saved to checkpoints/preprocessed_data.csv


In [102]:
# Load the brochure motorcycle data
df_brochure = pd.read_csv('../data/formatted/brochure_motorcycle.csv')

# Drop columns with all missing values
df_brochure_cleaned = df_brochure.dropna(axis=1, how='all')

# Show info about dropped columns
dropped_cols = set(df_brochure.columns) - set(df_brochure_cleaned.columns)
print("\nColumns dropped due to all missing values:")
print(dropped_cols)

# Save the cleaned brochure dataframe
df_brochure_cleaned.to_csv('../data/checkpoints/full_brochure_motorcycle_data.csv', index=False)
print("\nCleaned brochure data saved to checkpoints/cleaned_brochure_motorcycle_data.csv")




Columns dropped due to all missing values:
{'Unnamed: 42', 'Unnamed: 54', 'Tire Size(Front/Rear)', 'Container loading （CKD', 'Consumption (L/100km)', 'Unnamed: 34', 'Unnamed: 56', '(mm)', 'Wheel rim Size(Front/Rear)', 'Unnamed: 71', 'Unnamed: 82', 'Unnamed: 52', 'Unnamed: 87', 'Unnamed: 64', 'Unnamed: 45', 'Unnamed: 40', 'Unnamed: 36', 'Unnamed: 85', 'Unnamed: 49', 'parts）(PCS)', 'Unnamed: 26', 'Unnamed: 12', 'Minimum ground clearance', 'Unnamed: 62', 'Unnamed: 6', 'Container loading （SKD', 'Unnamed: 38', 'Unnamed: 19', 'Unnamed: 10', 'Unnamed: 28', 'Unnamed: 58', 'Unnamed: 76', 'Unnamed: 78', 'Unnamed: 60', 'Unnamed: 83', 'Unnamed: 75', 'Unnamed: 17', 'Unnamed: 24', 'Unnamed: 32', 'Unnamed: 4', 'Unnamed: 15', 'Unnamed: 47', 'Unnamed: 2', 'Unnamed: 30', 'Unnamed: 68', 'Unnamed: 66', 'Unnamed: 8', 'Economical Fuel', 'Unnamed: 73', 'Unnamed: 22', 'Unnamed: 80', 'packaging）(SET)'}

Cleaned brochure data saved to checkpoints/cleaned_brochure_motorcycle_data.csv
