In [5]:

import pandas as pd

# Location of the prepared dataset CSV
prepared_dataset_path = r'D:\Salman work folder\ImumAi_Data_Analyst_Test\cleaned_car_golf_dataset.csv'
car_golf_prepared = pd.read_csv(prepared_dataset_path)

# Show the heading followed by the first rows as a quick check of the load
print("Prepared Dataset Loaded:", car_golf_prepared.head(), sep="\n")


Prepared Dataset Loaded:
   source_id                                              title  \
0  286354023  Volkswagen Golf -Black Weeks-Fahrzeug, täglich...   
1  312826845  Volkswagen Golf Var. VII 2.0 TDI DSG GTD Sport...   
2  311592214  Volkswagen Golf VII Sportsvan Van/Lounge 2.0 T...   
3  312569111  Volkswagen Golf  Sportsvan 1.2 TSI Lounge AAC ...   
4  299066532  Volkswagen Golf 7 Sportsvan Allstar 1.6 TDI BM...   

             category        make model  power color  capacity  \
0           Small Car  volkswagen  golf    123   NaN      1395   
1  Cabriolet/Roadster  volkswagen  golf    181   NaN      1968   
2  Cabriolet/Roadster  volkswagen  golf    148   NaN      1968   
3           Small Car  volkswagen  golf    109   NaN      1197   
4           Small Car  volkswagen  golf    109   NaN      1598   

             transmission    fuel  ... price_gross  \
0          Manual gearbox  Petrol  ...       12140   
1  Automatic transmission  Diesel  ...       22950   
2  Automatic 

In [6]:
#Identifying Suspicious Rows

#The checks below are logical checks based on our understanding of the data.

#Check: Missing or Incorrect Engine Type
#A row is selected when its engine_type is null (flagged during data preparation).

# Select every row whose engine_type is null
suspicious_engine_type = car_golf_prepared.loc[car_golf_prepared['engine_type'].isna()]

print("\nSuspicious Engine Type Rows Detected:")
print(suspicious_engine_type.loc[:, ['title', 'engine_type']])



Suspicious Engine Type Rows Detected:
                                                   title engine_type
0      Volkswagen Golf -Black Weeks-Fahrzeug, täglich...         NaN
5      Volkswagen Golf VII Sportsvan Comfortline 1.6T...         NaN
13     Volkswagen Golf VII DSG HIGHLINE SPORT LED*NAV...         NaN
20     Volkswagen Golf Bluetooth Navi Klima Einparkhilfe         NaN
23     Volkswagen Golf VII Sportsvan DSG KLIMA Navi p...         NaN
...                                                  ...         ...
77721  Volkswagen Golf Ko. 1.5TSI IQ DRIVE Navi/ Sitz...         NaN
77723  Volkswagen GOLF VII VARIANT COMFORTLINE 150PS ...         NaN
77725  Volkswagen Golf VII 2,0TDI Variant Highline BM...         NaN
77731  Volkswagen Golf VII Variant Highline Navi AHK ...         NaN
77738  Volkswagen Golf VIII Active LED Navi ACC  Fern...         NaN

[18447 rows x 2 columns]


In [7]:
#Check: Missing or Incorrect Category
#A row is selected when its category is null (flagged during data preparation).

# Select every row whose category is null
suspicious_category = car_golf_prepared.loc[car_golf_prepared['category'].isna()]

print("\nSuspicious Category Rows Detected:")
print(suspicious_category.loc[:, ['title', 'category']])




Suspicious Category Rows Detected:
                                                   title category
144    Volkswagen Golf VIII Variant Style 2.0 TDI *Bu...      NaN
168    Volkswagen Golf Plus 1.6 Euro 5 LIFE nur 39.65...      NaN
191    Volkswagen Golf Var.1.4 TSI Allstar NAVI PDC G...      NaN
334    Volkswagen Golf VII Sportsvan Highline 1,4 TSI...      NaN
335    Volkswagen Golf 7 VII Sportsvan 2.0 TDI Highli...      NaN
...                                                  ...      ...
77126                 Volkswagen VOLKSWAGEN Golf 2.0 TDI      NaN
77350  Volkswagen VOLKSWAGEN Golf 1.4 TSI 140 CV DSG ...      NaN
77517   Volkswagen Golf GTI 7.5 GTI DSG TCR ACC - VIRTUA      NaN
77621              Volkswagen Golf 2.0 TDI CUP *NAVI*PDC      NaN
77727  Volkswagen Golf VIII Move 2.0 TDI DSG Bluetoot...      NaN

[874 rows x 2 columns]


In [8]:
#Check: Contradictions in Transmission
#A row is selected when the title contains "DSG" while transmission contains "Manual"
#(e.g., Manual gearbox vs. "DSG" in title).

# Both conditions treat missing values as "no match" (na=False)
suspicious_transmission = car_golf_prepared.loc[
    car_golf_prepared['title'].str.contains("DSG", na=False)
    & car_golf_prepared['transmission'].str.contains("Manual", na=False)
]

print("\nSuspicious Transmission Rows Detected:")
print(suspicious_transmission.loc[:, ['title', 'transmission']])



Suspicious Transmission Rows Detected:
                                                   title    transmission
836    Volkswagen Golf 2.0 TDI DSG Klimatronic,Leder,...  Manual gearbox
1196   Volkswagen Golf VII 1,2 TSI DSG Allstar (PDCpl...  Manual gearbox
1260   Volkswagen Golf Variant VIII Life 2.0 TDI DSG ...  Manual gearbox
2795   Volkswagen Golf Variant 2.0l TDI DSG PDC LED N...  Manual gearbox
3306   Volkswagen Golf VIII 2.0 TSI DSG R 4Motion NAV...  Manual gearbox
...                                                  ...             ...
72080  Volkswagen Golf VIII EDITION 50 1.5 eTSI DSG K...  Manual gearbox
76294  Volkswagen VOLKSWAGEN - Golf - 2.0 TSI DSG 5p....  Manual gearbox
76739  Volkswagen Golf 1.5 eTSI Style DSG LED NAVI AH...  Manual gearbox
77509    Volkswagen Golf 1.4 TGI DSG 5p. Business 4 Free  Manual gearbox
77531  Volkswagen VOLKSWAGEN Golf Variant 1.6 TDI 115...  Manual gearbox

[130 rows x 2 columns]


In [9]:
#Contradictions in Fuel
#A row is selected when the title contains "Hybrid" while fuel contains "Petrol"
#(e.g., Petrol vs. "Hybrid" in title).

# Both conditions treat missing values as "no match" (na=False)
suspicious_fuel = car_golf_prepared.loc[
    car_golf_prepared['title'].str.contains("Hybrid", na=False)
    & car_golf_prepared['fuel'].str.contains("Petrol", na=False)
]

print("\nSuspicious Fuel Rows Detected:")
print(suspicious_fuel.loc[:, ['title', 'fuel']])



Suspicious Fuel Rows Detected:
                                                   title                 fuel
2487   Volkswagen Golf Variant Life 1.5 eTSI (Mild-Hy...  Petrol, E10-enabled
2626   Volkswagen Golf VIII Lim. Style eHybrid DSG~LE...               Petrol
2915   Volkswagen Golf Variant 1.5 eTSI STYLE MildHyb...               Petrol
3703                Volkswagen Golf VIII 1.4 eHybrid GTE               Petrol
3871   Volkswagen Golf VIII 1.5 eTSI R-Line BlackStyl...               Petrol
...                                                  ...                  ...
68069  Volkswagen Golf VIII 1.5 eTSI Style Hybrid MAT...               Petrol
70373  Volkswagen Golf GTE 1,5 l eHybrid DSG HUD LED ...               Petrol
74818  Volkswagen Golf  GTE eHybrid Pan/Kam/Keyl/HUD/...               Petrol
76903  Volkswagen Golf Style eHybrid IQ-Matrix/Kamera...               Petrol
77337  Volkswagen Golf GTE eHybrid DSG AHK DCC Navi A...  Petrol, E10-enabled

[64 rows x 2 columns]


In [11]:
#Invalid or Missing Power
#Rows where power is missing but mentioned in title.
#Rows where power does not match valid ranges.

import re  # Import the regular expressions library

# Rows whose power is 0 although the title contains "<digits> PS" (any letter case)
suspicious_missing_power = car_golf_prepared.loc[
    car_golf_prepared['power'].eq(0)
    & car_golf_prepared['title'].str.contains(r'\d+\s?PS', flags=re.IGNORECASE, na=False)
]

print("\nRows with Missing Power but Mentioned in Title:")
print(suspicious_missing_power.loc[:, ['title', 'power']])

# Accepted power values (adjust as needed).
# range() excludes its upper bound, so this covers the integers 50..399.
valid_power_range = range(50, 400)

# Rows whose power is not one of the accepted values
suspicious_invalid_power = car_golf_prepared.loc[
    ~car_golf_prepared['power'].isin(valid_power_range)
]

print("\nRows with Invalid Power Values:")
print(suspicious_invalid_power.loc[:, ['title', 'power']])




Rows with Missing Power but Mentioned in Title:
                                                   title  power
13841  Volkswagen Golf GTI 2,0 l TSI OPF 195 kW (265 ...      0

Rows with Invalid Power Values:
                                                   title  power
719    Volkswagen VOLKSWAGEN Golf 5p 1.6 tdi highline...      0
912    Volkswagen VOLKSWAGEN Golf sportsvan 2.0 tdi h...      0
1021   Volkswagen Golf VIII 1,4  GTE eHybrid/IQ/PRO/A...     25
2534   Volkswagen e-Golf 7.5 100 Kw LED/ECC/ACC/NAVI/...      0
2562      Volkswagen Golf VIII 1.4TSI DSG Style +Plug in      0
...                                                  ...    ...
75346  Volkswagen Golf VII Lim R BMT 4Motion/Komplett...    503
76460  Volkswagen Golf VII R 4M. 409 PS MEGA AUSSTA.*...    404
77428  Volkswagen VOLKSWAGEN Golf 1.0 tsi evo life 11...      0
77557  Volkswagen VOLKSWAGEN Golf 5p 1.5 tgi business...      0
77558           Volkswagen VOLKSWAGEN E-golf 5p del 2019      0

[463 rows x 2 columns

In [12]:
#Combining Suspicious Rows
#All rows flagged by the individual checks are gathered into one file for manual review.

# Stack the flagged subsets on top of each other ...
suspicious_rows = pd.concat(
    [
        suspicious_engine_type,
        suspicious_category,
        suspicious_transmission,
        suspicious_fuel,
        suspicious_missing_power,
        suspicious_invalid_power,
    ]
)
# ... and keep a single copy of rows that are fully identical
suspicious_rows = suspicious_rows.drop_duplicates()

print(f"\nTotal Suspicious Rows Detected: {len(suspicious_rows)}")

# Write the combined rows to disk.
# index=False: the row labels of the DataFrame are not written to the file.
suspicious_rows.to_csv(
    r'D:\Salman work folder\ImamAi\suspicious_rows_combined.csv',
    index=False,
)
print("\nSuspicious rows have been saved to 'suspicious_rows_combined.csv'")




Total Suspicious Rows Detected: 19582

Suspicious rows have been saved to 'suspicious_rows_combined.csv'


In [1]:
import pandas as pd

# Read the combined suspicious rows back from disk
suspicious_rows_path = r'D:\Salman work folder\ImamAi\suspicious_rows_combined.csv'  # Update with your file path
suspicious_rows = pd.read_csv(filepath_or_buffer=suspicious_rows_path)

In [2]:
##Step 1: Cleaning engine_type
#Logic:
#Infer engine_type from title using keywords (TDI, TSI, eTSI, etc.).
#Capacity was intended as a secondary criterion when no clear engine type is mentioned (not implemented in the function below).
# Function to infer engine type
def infer_engine_type(row):
    """Infer a missing ``engine_type`` from keywords in the listing title.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys ``'engine_type'`` and ``'title'``.

    Returns
    -------
    The existing ``engine_type`` when it is not null. Otherwise the label of
    the first keyword found in the title, or the unchanged (null) value when
    the title is not a string or no keyword matches.

    Keyword matching is case-sensitive.
    """
    # Local import so the function does not depend on a module-level `import re`.
    import re

    current = row['engine_type']
    if not pd.isnull(current):
        return current

    title = row['title']
    if not isinstance(title, str):
        # A missing title (NaN) cannot be searched; leave the value untouched.
        return current

    if 'TDI' in title:
        return 'TDI'
    # 'eTSI' contains the substring 'TSI', so it has to be tested first;
    # otherwise every eTSI title is labelled plain 'TSI'.
    if 'eTSI' in title:
        return 'eTSI'
    if 'TSI' in title:
        return 'TSI'
    if 'eHybrid' in title:
        return 'eTSI'
    if 'GTD' in title:
        return 'GTD'
    if 'GTI' in title:
        return 'GTI'
    if 'e-Golf' in title:
        return 'e-Golf'
    # Only a stand-alone "R" counts: a plain substring test would also hit
    # words such as "SPORT", and the hyphen exclusion skips "R-Line".
    if re.search(r'(?<![\w-])R(?![\w-])', title):
        return 'R'
    return current

# Run the engine-type inference once per row and store the result
suspicious_rows['engine_type'] = suspicious_rows.apply(infer_engine_type, axis='columns')



In [3]:
##Step 2: Cleaning category
#Logic: Infer category from title using keywords (Variant, Sportsvan, etc.). A default of Small Car when no other category is evident is described here, but the function below does not apply it.

# Function to infer category
def infer_category(row):
    """Infer a missing ``category`` from keywords in the listing title.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys ``'category'`` and ``'title'``.

    Returns
    -------
    The existing ``category`` when it is not null. Otherwise
    ``'Cabriolet/Roadster'`` for titles containing ``'Sportsvan'``,
    ``'Small Car'`` for titles containing ``'Variant'``, and the unchanged
    (null) value when the title is not a string or contains neither keyword.

    NOTE(review): rows matching neither keyword keep a null category; no
    general default is applied here. Confirm whether a default is intended,
    and whether the keyword-to-category mapping is the desired one.
    """
    current = row['category']
    if not pd.isnull(current):
        return current

    title = row['title']
    if not isinstance(title, str):
        # A missing title (NaN) cannot be searched; leave the value untouched.
        return current

    if 'Sportsvan' in title:
        return 'Cabriolet/Roadster'
    if 'Variant' in title:
        return 'Small Car'
    return current

# Run the category inference once per row and store the result
suspicious_rows['category'] = suspicious_rows.apply(infer_category, axis='columns')


In [4]:
#Step 3: Resolving Contradictions in transmission
#Logic:
#If title mentions DSG, set transmission to Automatic transmission.

# Function to resolve transmission contradictions
def resolve_transmission(row):
    """Correct a "Manual" transmission when the title mentions DSG.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys ``'title'`` and ``'transmission'``.

    Returns
    -------
    ``'Automatic transmission'`` when the title contains ``'DSG'`` and the
    current transmission text contains ``'Manual'``; otherwise the unchanged
    ``transmission`` value. Matching is case-sensitive.
    """
    title = row['title']
    # A missing title (NaN) is not a string and cannot contradict anything.
    if isinstance(title, str) and 'DSG' in title and 'Manual' in str(row['transmission']):
        return 'Automatic transmission'
    return row['transmission']

# Run the transmission correction once per row and store the result
suspicious_rows['transmission'] = suspicious_rows.apply(resolve_transmission, axis='columns')


In [5]:
#Step 4: Resolving Contradictions in fuel
#Logic:If title mentions Hybrid, set fuel to Hybrid (petrol/electric).

# Function to resolve fuel contradictions
def resolve_fuel(row):
    """Correct a "Petrol" fuel value when the title mentions Hybrid.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys ``'title'`` and ``'fuel'``.

    Returns
    -------
    ``'Hybrid (petrol/electric)'`` when the title contains ``'Hybrid'`` and
    the current fuel text contains ``'Petrol'``; otherwise the unchanged
    ``fuel`` value. Matching is case-sensitive.
    """
    title = row['title']
    # A missing title (NaN) is not a string and cannot contradict anything.
    if isinstance(title, str) and 'Hybrid' in title and 'Petrol' in str(row['fuel']):
        return 'Hybrid (petrol/electric)'
    return row['fuel']

# Run the fuel correction once per row and store the result
suspicious_rows['fuel'] = suspicious_rows.apply(resolve_fuel, axis='columns')



In [6]:
#Step 5: Cleaning power
#Logic:
#Infer power from title if mentioned.
#Remove invalid power values outside the valid range (50–400 HP).

import re

# Function to infer power from title
def infer_power(row):
    """Fill a zero ``power`` value from a "<digits> PS" mention in the title.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys ``'power'`` and ``'title'``.

    Returns
    -------
    The integer preceding "PS" in the title when ``power`` equals 0 and the
    title contains such a mention; otherwise the unchanged ``power`` value.
    """
    if row['power'] == 0:
        title = row['title']
        # A missing title (NaN) is not a string and cannot be searched.
        if isinstance(title, str):
            # Case-insensitive, like the detection step that flags these rows,
            # so "150ps" / "150 Ps" are recovered as well as "150 PS".
            match = re.search(r'(\d+)\s?PS', title, flags=re.IGNORECASE)
            if match:
                return int(match.group(1))
    return row['power']

# Run the power inference once per row and store the result
suspicious_rows['power'] = suspicious_rows.apply(infer_power, axis='columns')

# Keep only the rows whose power is one of the accepted values.
# range() excludes its upper bound, so this covers the integers 50..399.
valid_power_range = range(50, 400)
suspicious_rows = suspicious_rows.loc[suspicious_rows['power'].isin(valid_power_range)]


In [7]:
#Step 6: Saving Cleaned Suspicious Rows
#With the cleaning logic applied, write the rows to disk so they can be merged with the main dataset.

# Destination file for the cleaned suspicious rows
cleaned_suspicious_rows_path = r'D:\Salman work folder\ImumAi_Data_Analyst_Test\cleaned_suspicious_rows.csv'  # Updated save path
# index=False: the row labels of the DataFrame are not written to the file
suspicious_rows.to_csv(path_or_buf=cleaned_suspicious_rows_path, index=False)

print(f"Cleaned suspicious rows saved to {cleaned_suspicious_rows_path}")



Cleaned suspicious rows saved to D:\Salman work folder\ImamAi\cleaned_suspicious_rows.csv


In [8]:
import pandas as pd

# Path to the prepared dataset
# NOTE(review): the first cell of this notebook loads this file from the
# 'ImumAi_Data_Analyst_Test' folder instead - confirm which copy is current.
prepared_dataset_path = r'D:\Salman work folder\ImamAi\cleaned_car_golf_dataset.csv'
car_golf_prepared = pd.read_csv(prepared_dataset_path)

# Path to the cleaned suspicious rows.
# This must be the file written by the "Saving Cleaned Suspicious Rows" step,
# which saves into the 'ImumAi_Data_Analyst_Test' folder; reading from the old
# 'ImamAi' folder would pick up a stale file from an earlier run.
cleaned_suspicious_rows_path = r'D:\Salman work folder\ImumAi_Data_Analyst_Test\cleaned_suspicious_rows.csv'
suspicious_rows = pd.read_csv(cleaned_suspicious_rows_path)

# DataFrame.update aligns the two frames on their index labels. Both frames
# were just read from CSV and therefore carry a fresh 0..N-1 index, and the
# cleaned rows were written with index=False, so those positions do not
# identify the same listing in both files. Updating on the default index would
# overwrite the first len(suspicious_rows) rows of the prepared dataset with
# unrelated data. Align on the listing identifier instead.
# Assumes source_id identifies a listing in both files - TODO confirm.
key_column = 'source_id'
original_columns = list(car_golf_prepared.columns)

# update() needs unique labels on the incoming frame; keep the last cleaned
# version of any listing that appears more than once.
cleaned_by_id = (
    suspicious_rows
    .drop_duplicates(subset=key_column, keep='last')
    .set_index(key_column)
)
prepared_by_id = car_golf_prepared.set_index(key_column)

# Only non-null values of the cleaned rows overwrite the prepared dataset;
# listings absent from the cleaned file are left untouched.
prepared_by_id.update(cleaned_by_id)

# Restore source_id as a regular column and the original column order
car_golf_prepared = prepared_by_id.reset_index()[original_columns]

# Saving the final cleaned dataset
final_cleaned_dataset_path = r'D:\Salman work folder\ImumAi_Data_Analyst_Test\final_cleaned_car_golf_dataset.csv'
car_golf_prepared.to_csv(final_cleaned_dataset_path, index=False)

print(f"Final cleaned dataset saved to {final_cleaned_dataset_path}")


Final cleaned dataset saved to D:\Salman work folder\ImamAi\final_cleaned_car_golf_dataset.csv
