In [4]:
#Step 1: Loading and Inspecting the Dataset
#We will begin by understanding the structure of the suspicious rows dataset.

import pandas as pd

# Loading the suspicious combined rows dataset
suspicious_rows_path = r'D:\Salman work folder\ImumAi_Data_Analyst_Test\suspicious_rows_combined.csv'  # Update with your file path
suspicious_rows = pd.read_csv(suspicious_rows_path)

# Inspect the dataset
print("Dataset Shape:", suspicious_rows.shape)
print("\nDataset Info:")
print(suspicious_rows.info())
print("\nFirst Few Rows of the Dataset:")
print(suspicious_rows.head())


Dataset Shape: (19582, 22)

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19582 entries, 0 to 19581
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   source_id              19582 non-null  int64  
 1   title                  19582 non-null  object 
 2   category               18708 non-null  object 
 3   make                   19551 non-null  object 
 4   model                  19582 non-null  object 
 5   power                  19582 non-null  int64  
 6   color                  17298 non-null  object 
 7   capacity               19582 non-null  int64  
 8   transmission           19574 non-null  object 
 9   fuel                   19457 non-null  object 
 10  construction_year      19582 non-null  object 
 11  price_net              7306 non-null   float64
 12  price_gross            19582 non-null  int64  
 13  added                  19582 non-null  object 
 14  mileage     

In [5]:
#Step 2: Checking for Missing Values
#Identify which columns have missing or invalid data in the suspicious rows dataset.

# Checking for missing values
print("\nMissing Values in Suspicious Rows Dataset:")
print(suspicious_rows.isnull().sum())

# Percentage of missing values
print("\nPercentage of Missing Values:")
missing_percentage = suspicious_rows.isnull().mean() * 100
print(missing_percentage.sort_values(ascending=False))



Missing Values in Suspicious Rows Dataset:
source_id                    0
title                        0
category                   874
make                        31
model                        0
power                        0
color                     2284
capacity                     0
transmission                 8
fuel                       125
construction_year            0
price_net                12276
price_gross                  0
added                        0
mileage                      0
url                          0
extracted_engine_type    18441
engine_type_validated    18441
corrected_category         874
engine_type              18441
engine_type_missing          0
category_missing             0
dtype: int64

Percentage of Missing Values:
engine_type              94.173220
engine_type_validated    94.173220
extracted_engine_type    94.173220
price_net                62.690226
color                    11.663773
category                  4.463283
corrected_category  

In [6]:
#Step 3: Analyzing Problematic Fields
#Engine Type:
#Checking for rows with missing or invalid engine types.
#Assessing patterns based on the title and capacity.


print("\nRows with Missing Engine Type:")
missing_engine_type = suspicious_rows[suspicious_rows['engine_type'].isnull()]
print(missing_engine_type[['title', 'capacity', 'engine_type']].head())

print("\nUnique Engine Types:")
print(suspicious_rows['engine_type'].unique())



Rows with Missing Engine Type:
                                               title  capacity engine_type
0  Volkswagen Golf -Black Weeks-Fahrzeug, täglich...      1395         NaN
1  Volkswagen Golf VII Sportsvan Comfortline 1.6T...      1598         NaN
2  Volkswagen Golf VII DSG HIGHLINE SPORT LED*NAV...      1968         NaN
3  Volkswagen Golf Bluetooth Navi Klima Einparkhilfe      1395         NaN
4  Volkswagen Golf VII Sportsvan DSG KLIMA Navi p...      1197         NaN

Unique Engine Types:
[nan 'TDI' 'TSI' 'R' 'GTI' 'GTD' 'eTSI' 'e-Golf']


In [7]:
#Category:
#Reviewing rows with missing or mismatched category.
#Using title as a clue to infer categories.


print("\nRows with Missing Category:")
missing_category = suspicious_rows[suspicious_rows['category'].isnull()]
print(missing_category[['title', 'category']].head())

print("\nUnique Categories:")
print(suspicious_rows['category'].unique())



Rows with Missing Category:
                                                title category
13  Volkswagen Golf Plus 1.6 Euro 5 LIFE nur 39.65...      NaN
35  Volkswagen Golf VII Variant Lounge BMT,NAVI,SH...      NaN
36  Volkswagen Volkswagen Golf 1.4 TGI Business Bl...      NaN
37  Volkswagen Golf VII Sportsvan 1.2 Allstar NAVI...      NaN
38  Volkswagen Golf Sports.VII Sound BMT AHK StHz....      NaN

Unique Categories:
['Small Car' 'Cabriolet/Roadster' nan]


In [8]:
#Transmission:
#Looking for contradictions in transmission (e.g., "Manual gearbox" vs. "DSG").
#Assessing how often these contradictions occur.

# Contradictions in Transmission
print("\nContradictions in Transmission (e.g., DSG in title, Manual gearbox in data):")
contradictory_transmission = suspicious_rows[
    (suspicious_rows['transmission'].str.contains('Manual', na=False)) &
    (suspicious_rows['title'].str.contains('DSG', na=False))
]
print(contradictory_transmission[['title', 'transmission']].head())

print("\nUnique Transmission Types:")
print(suspicious_rows['transmission'].unique())



Contradictions in Transmission (e.g., DSG in title, Manual gearbox in data):
                                                  title    transmission
2340  Volkswagen Golf GTE 1.4 DSG AHK+Navi+BT+Digita...  Manual gearbox
2365  Volkswagen Golf GTE 1.4 DSG AHK+Navi+BT+Digita...  Manual gearbox
6448  Volkswagen VOLKSWAGEN GOLF 7 VARIANT 1.6TDI DS...  Manual gearbox
6688  Volkswagen VOLKSWAGEN GOLF 7 VARIANT 1.6TDI DS...  Manual gearbox
7041  Volkswagen VOLKSWAGEN GOLF 7 VARIANT 1.6TDI DS...  Manual gearbox

Unique Transmission Types:
['Manual gearbox' 'Automatic transmission' 'automatic' nan
 'Semi-automatic' 'Automatic']


In [9]:
#Fuel:
#Identifying contradictions between fuel and title (e.g., Petrol vs. Hybrid in title).
#Counting the frequency of each type of contradiction.

# Contradictions in Fuel
print("\nContradictions in Fuel (e.g., Hybrid in title, Petrol in data):")
contradictory_fuel = suspicious_rows[
    (suspicious_rows['fuel'].str.contains('Petrol', na=False)) &
    (suspicious_rows['title'].str.contains('Hybrid', na=False))
]
print(contradictory_fuel[['title', 'fuel']].head())

print("\nUnique Fuel Types:")
print(suspicious_rows['fuel'].unique())



Contradictions in Fuel (e.g., Hybrid in title, Petrol in data):
                                                  title                 fuel
679   Volkswagen Golf VIII Lim. Style eHybrid DSG~LE...               Petrol
926                Volkswagen Golf VIII 1.4 eHybrid GTE               Petrol
1641  Volkswagen Golf VIII 1,4TSI eHybrid GTE DSG Na...               Petrol
1811  Volkswagen Golf VIII 1.4 GTE eHybrid NAV / LED...  Petrol, E10-enabled
2233  Volkswagen Golf VIII eHybrid GTE DSG NAVI ACC ...  Petrol, E10-enabled

Unique Fuel Types:
['Petrol' 'Diesel' 'Petrol, E10-enabled' 'Natural Gas'
 'Hybrid (petrol/electric), E10-enabled, Plug-in hybrid' 'Other'
 'Diesel, Biodiesel Suitable' 'Hybrid (petrol/electric), Plug-in hybrid'
 nan 'Hybrid (petrol/electric)' 'Natural Gas, E10-enabled' 'Electric'
 'Hybrid (petrol/electric), E10-enabled' 'Diesel, E10-enabled' 'LPG'
 'Diesel, Biodiesel Suitable, Suitable for Vegetable Oil'
 'Hybrid (diesel/electric)']


In [10]:
#Power:
#Analyzing missing or invalid power values.
#Use title to infer potential power values.

# Missing Power Values
print("\nRows with Missing Power:")
missing_power = suspicious_rows[suspicious_rows['power'] == 0]
print(missing_power[['title', 'power']].head())

# Invalid Power Values
print("\nRows with Invalid Power Values:")
valid_power_range = range(50, 400)  # Adjust based on domain knowledge
invalid_power = suspicious_rows[~suspicious_rows['power'].isin(valid_power_range)]
print(invalid_power[['title', 'power']].head())

print("\nPower Value Distribution:")
print(suspicious_rows['power'].describe())




Rows with Missing Power:
                                                  title  power
661      Volkswagen Golf VIII 1.4TSI DSG Style +Plug in      0
1616  Volkswagen eGolf-+LED Licht+ACC+RDC+PDC v+h+bl...      0
3436  Volkswagen VOLKSWAGEN Golf 5p 1.4 tgi trendlin...      0
4724    Volkswagen VOLKSWAGEN Golf 1.4 GTE dsg del 2021      0
5141  Volkswagen Golf VII Variant 1,4 Cup~92KW~Navi~...      0

Rows with Invalid Power Values:
                                                 title  power
206  Volkswagen Golf VIII 1,4  GTE eHybrid/IQ/PRO/A...     25
661     Volkswagen Golf VIII 1.4TSI DSG Style +Plug in      0
827  Volkswagen Golf VIII 1,4  GTE eHybrid/LED-PLUS...     25
849  Volkswagen Golf GTE VII 1.4 DSG  Navi LED PDC ...     16
885  Volkswagen Golf GTE VIII 1.4 eHybrid DSG Black...     20

Power Value Distribution:
count    19582.000000
mean       131.843632
std         53.931633
min          0.000000
25%        109.000000
50%        121.000000
75%        148.000000
max       