In [3]:
import pandas as pd 
# Load the vehicles dataset from the CSV export.
vehicle_df = pd.read_csv("rows_drop_1_vehicles.csv")
print(vehicle_df)

# Sort the vehicles dataset according to its 'Index' column.
# The sorted frame is assigned back to `vehicle_df` so that the following steps
# operate on the sorted data; it used to be stored only under the near-identical
# name `vehicles_df`, which the code that follows here does not read.
vehicle_df = vehicle_df.sort_values(by='Index')
# Alias kept in case another cell still refers to the old name.
vehicles_df = vehicle_df
print(vehicle_df)

# Outlier detection in 'Vehicle Model Year'.
# Vehicles made before 1960 are considered out of date and not fit to use, and
# model years after 2025 are considered invalid; both are flagged as outliers.
MIN_MODEL_YEAR = 1960
MAX_MODEL_YEAR = 2025
model_year = vehicle_df['Vehicle Model Year']
# NOTE: a missing (NaN) year satisfies neither comparison, so such rows are kept.
year_outliers = vehicle_df[(model_year < MIN_MODEL_YEAR) | (model_year > MAX_MODEL_YEAR)]
# A bare `len(...)` in the middle of a cell is not displayed, so print the count.
print("Number of 'Vehicle Model Year' outliers:", len(year_outliers))

# Removal of the 'Vehicle Model Year' outliers (rows are dropped by index label).
print(year_outliers.index)
vehicle_df = vehicle_df.drop(year_outliers.index)
print(vehicle_df)

# Show the column labels, then display how many columns the frame has
# (the second axis of `shape` is the column count).
print(vehicle_df.columns)
vehicle_df.shape[1]

       Index  Public Vehicle Number     Status Vehicle Make Vehicle Model  \
0       1286                  12009   RESERVED    CHEVROLET       EXPRESS   
1       2095                  12248   INACTIVE     MERCEDES      SPRINTER   
2       7950                  13527   INACTIVE     VAN HOOL         TD925   
3       9359                  13528   INACTIVE     VAN HOOL         TD925   
4       9441                  12025   INACTIVE     MERCEDES      SPRINTER   
...      ...                    ...        ...          ...           ...   
12649  16576                    998  VIOLATION       TOYOTA         CAMRY   
12650  16583                   3810     ACTIVE       TOYOTA          RAV4   
12651  16590                   4690  VIOLATION       TOYOTA         CAMRY   
12652  16592                   2831     ACTIVE       TOYOTA         PRIUS   
12653  16596                   1509     ACTIVE       TOYOTA        SIENNA   

       Vehicle Model Year Vehicle Color Vehicle Fuel Source  \
0           

17

In [4]:
# All the variables are categorical variables, hence a search for possible
# outliers in the qualitative variables is carried out.

# Keep only the columns whose dtype is object, category or bool.
qualitative_dtypes = ['object', 'category', 'bool']
qualitative_variables = vehicle_df.select_dtypes(include=qualitative_dtypes)
print(qualitative_variables)

# Outlier check for 'Status': list its distinct values for visual inspection.
unique_status = vehicle_df.loc[:, 'Status'].unique()
print("Unique values in 'Status':", unique_status)


# Outlier check for 'Vehicle Make': list its distinct values for visual inspection.
unique_make = vehicle_df.loc[:, 'Vehicle Make'].unique()
print("Unique values in 'Vehicle Make':", unique_make)
# Hard-coded collection of vehicle makes (presumably copied from the printed
# unique values -- confirm it still matches the data). Sorting it alphabetically
# puts near-duplicate spellings next to each other.
sort_names = set([
    'CHEVROLET', 'MERCEDES', 'VAN HOOL', 'VPG', 'DODGE', 'TOYOTA', 'FORD',
    'CADILLAC', 'FREIGHLINE', 'CHAMPION', 'THOMAS', 'FREIGHTLINER', 'MCI',
    'ALEXANDER DENNIS LTD', 'GILLIG', 'INTL', 'LEYLAND', 'ALEXDER DENNIS LTD',
    'IC', 'DOUBLE SHUFLE', 'ISUZU', 'SPEC-CONSTED', 'TESLA', 'KIA', 'TUK TUK',
    'MAINSTREET', 'LUCID', 'POLARIS', 'BMW', 'PRECISION', 'CHRYSLER', 'MERCURY',
    'LINCOLN', 'GMC', 'ALEXANDER', 'VOLVO', 'HONDA', 'HYUNDAI', 'NISSAN', 'AUDI',
    'LEXUS', 'INFINITI', 'JEEP', 'MOVITRON', 'ORION', 'ANKAI', 'MASERATI', 'MAZDA',
    'CHARLESTON', 'NEOPLAN', 'GENISIS', 'OSHKOSH', 'BLUEBIRD', 'VOLKSWAGEN',
    'BOYERTOWN', 'MITSUBISHI', 'MOBILITY VENTURE', 'LAND ROVER', 'TURTLE TOP',
    'SUBARU', 'ACURA', 'SCION', 'Dodge', 'Ford', 'PONTIAC', 'BMX', 'BARTH', 'JAGUAR',
    'CINDERELLA', 'TROYER', 'KIMBALL', 'YODER', 'MARTIN', 'SCHWARTZ', 'MILLER',
    'ANDERSON', 'SCHROCK', 'STUDEBAKER',
])
# Build the alphabetically ordered list in two steps: copy, then sort in place.
sorted_list = list(sort_names)
sorted_list.sort()
print(sorted_list)

# A few misspelled / inconsistently cased values were identified in
# 'Vehicle Make', so they are normalised here.
# Each key is the variant found in the data and each value is the canonical
# spelling (upper-case, like every other make). The mapping used to point the
# other way for three entries, turning e.g. the correct 'ALEXANDER DENNIS LTD'
# into the misspelled 'ALEXDER DENNIS LTD'.
# NOTE(review): 'BMX' and 'GENISIS' also look like typos (BMW / GENESIS?) --
# confirm against the source data before adding them.
corrections = {
    'Ford': 'FORD',
    'ALEXDER DENNIS LTD': 'ALEXANDER DENNIS LTD',
    'FREIGHLINE': 'FREIGHTLINER',
    'Dodge': 'DODGE'
}

# Correct the misspelled entries in the 'Vehicle Make' column
vehicle_df['Vehicle Make'] = vehicle_df['Vehicle Make'].replace(corrections)
print(vehicle_df['Vehicle Make'].unique())

# Detecting outliers for Vehicle Model: list its distinct values for inspection.
unique_model = vehicle_df['Vehicle Model'].unique()
# The label names the column actually inspected (it used to say 'Vehicle Make').
print("Unique values in 'Vehicle Model':", unique_model)
# No misspelled, incorrect or inconsistent values were identified.

# Outlier check for 'Vehicle Fuel Source': list its distinct values for inspection.
unique_source = vehicle_df.loc[:, 'Vehicle Fuel Source'].unique()
print("Unique values in 'Vehicle Fuel Source':", unique_source)
# Result of the inspection: no misspelled, incorrect or inconsistent values.

# Outlier check for 'Wheelchair Accessible': list its distinct values for inspection.
unique_access = vehicle_df.loc[:, 'Wheelchair Accessible'].unique()
print("Unique values in 'Wheelchair Accessible':", unique_access)
# Result of the inspection: no misspelled, incorrect or inconsistent values.

          Status Vehicle Make Vehicle Model Vehicle Color Vehicle Fuel Source  \
0       RESERVED    CHEVROLET       EXPRESS         BLACK          Bio-Diesel   
1       INACTIVE     MERCEDES      SPRINTER        SILVER          Bio-Diesel   
2       INACTIVE     VAN HOOL         TD925           RED          Bio-Diesel   
3       INACTIVE     VAN HOOL         TD925           RED          Bio-Diesel   
4       INACTIVE     MERCEDES      SPRINTER         BLACK          Bio-Diesel   
...          ...          ...           ...           ...                 ...   
12649  VIOLATION       TOYOTA         CAMRY         WHITE              Hybrid   
12650     ACTIVE       TOYOTA          RAV4    GRAY/WHITE              Hybrid   
12651  VIOLATION       TOYOTA         CAMRY         WHITE              Hybrid   
12652     ACTIVE       TOYOTA         PRIUS         WHITE              Hybrid   
12653     ACTIVE       TOYOTA        SIENNA         WHITE              Hybrid   

      Wheelchair Accessible

In [5]:
# Outlier check for 'City': list its distinct values for visual inspection.
unique_city = vehicle_df.loc[:, 'City'].unique()
print("Unique values in 'City':", unique_city)
# Hard-coded collection of city names (presumably copied from the printed
# unique values -- confirm it still matches the data). Sorting it alphabetically
# puts near-duplicate spellings next to each other.
string_city = set([
    'CHICAGO', 'ELMHURST', 'ORLAND PARK', 'ST. CHARLES', 'SKOKIE', 'FRANKFORT',
    'ELGIN', 'SHOREWOOD', 'TINLEY PARK', 'LYNWOOD', 'CHIICAGO', 'LOVES PARK',
    'BARRINGTON', 'NEW LENOX', 'DOLTON', 'CHICAGO RIDGE', 'BARTLETT',
    'EVERGREEN PARK', 'ARLINGTON HEIGHTS', 'BROOKFIELD', 'EARLVILLE',
    'SCHILLER PARK', 'CHGO', 'PLAINFIELD', 'DEERFIELD', 'OAKBROOK', 'NORTHBROOK',
    'BENSENVILLE', 'ELK GROVE VILLAGE', 'JOHNSBURG', 'NORRIDGE', 'DESPLAINES',
    'DES PLAINES', 'ITASCA', 'CHCAGO', 'GURNEE', 'METTAWA',
])
# Build the alphabetically ordered list in two steps: copy, then sort in place.
sorted_city = list(string_city)
sorted_city.sort()
print(sorted_city)

# A few misspelled values were identified in 'City', so they are corrected here.
# Each key is the variant found in the data and each value is the canonical
# spelling. The city is spelled 'DES PLAINES' (two words), so the run-together
# 'DESPLAINES' is the variant that gets replaced; the mapping used to point the
# other way round.
corrections2 = {
    'CHCAGO': 'CHICAGO',
    'CHGO': 'CHICAGO',
    'CHIICAGO': 'CHICAGO',
    'DESPLAINES': 'DES PLAINES'
}

# Correct the misspelled entries in the 'City' column
vehicle_df['City'] = vehicle_df['City'].replace(corrections2)
print(vehicle_df['City'].unique())

#Detecting outliers for Color
#unique_color = vehicle_df['Vehicle Color'].unique()
#print("Unique values in 'Vehicle Color':", unique_color)

# Hard-coded collection of colour values, sorted alphabetically so that
# near-duplicate spellings end up next to each other.
# A comma was missing between 'WHITE' and 'GREEN'; Python concatenates adjacent
# string literals, so the set contained the bogus value 'WHITEGREEN' and was
# missing both real colours.
string_color = {
    'BLACK', 'SILVER', 'RED', 'WHITE', 'GREEN', 'BLUE/WHITE', 'BURGUNDY', 'ORANGE',
    'BLACK/WHITE/RED/BLUE', 'MAROON', 'RED/CREAM', 'WHITE/RED', 'YELLOW/BLUE',
    'RED/BLUE', 'RED/WHITE', 'RED/GREEN', 'WHITE/BLUE', 'RED/WHITE/BLUE',
    'WHITE/ORANGE', 'DARK GREEN', 'YELLOW', 'WHITE/BLACK/STRIPES', 'BLUE',
    'CREAM/GREEN', 'SILVER/BLACK', 'WHITE/BLUE/STRIPES', 'BLACK/WHITE',
    'WHITE/STARS/STRIPES', 'RED/BLUE/WHITE', 'WHITE/RED/BLACK', 'GOLD',
    'PINK/WHITE', 'CREAM/BLUE', 'WHITE/RED STRIPES', 'BLACK/GREEN',
    'WHITE/BLUE/RED', 'GRAY', 'WHITE/RED/BLUE', 'LAVENDER', 'BROWN', 'WHITE/BLACK',
    'DARK GRAY', 'RED/WHITE/CHECKER', 'SILVER/GRAY', 'CREAM/BLACK/RED STRIPES',
    'WHITE/BLUE LETTERS', 'WHITE/SILVER', 'WHITE/YELLOW', 'BLACK/RED', 'BEIGE',
    'ORANGE/WHITE', 'GRAY/BLACK', 'PURPLE', 'TAN/RED', 'GRAY/BLACK/WHITE',
    'WHITE/ORANGE STRIPE/BLUE LETTERS', 'ORANGE/BLUE', 'TAN/RED/BLACK',
    'GRAY/WHITE', 'WHITE/RED/STRIPES', 'TAN', 'BLUE/WHITE/BLACK',
    'WHITE/BLACK/GOLD', 'DARK BLUE', 'WHITE/PURPLE', 'BLUE/RED', 'WHITE/GRAY',
    'NONE', 'BLACK/RED/WHITE/BLUE', 'TURQUOISE', 'GREEN/WHITE/BLACK',
    'WHITE/SILVER/BLUE STRIPES', 'RED/BLACK/YELLOW STRIPE', 'NAVY BLUE',
    'WHITE/GREEN', 'BLACK/WHITE/STRIPES', 'WHITE/BLACK/RED', 'RED/BLACK', 'SAGE',
    'CREAM', 'YELLOW/BLACK',
}
sorted_color = sorted(string_color)
print(sorted_color)

#No misspelled, incorrect or inconsistent values were identified.

Unique values in 'City': ['CHICAGO' 'ELMHURST' 'ORLAND PARK' 'ST. CHARLES' 'SKOKIE' 'FRANKFORT'
 'ELGIN' 'SHOREWOOD' 'TINLEY PARK' 'LYNWOOD' 'CHIICAGO' 'LOVES PARK'
 'BARRINGTON' 'NEW LENOX' 'DOLTON' 'CHICAGO RIDGE' 'BARTLETT'
 'EVERGREEN PARK' 'ARLINGTON HEIGHTS' 'BROOKFIELD' 'EARLVILLE'
 'SCHILLER PARK' 'CHGO' 'PLAINFIELD' 'DEERFIELD' 'OAKBROOK' 'NORTHBROOK'
 'BENSENVILLE' 'ELK GROVE VILLAGE' 'JOHNSBURG' 'NORRIDGE' 'DESPLAINES'
 'DES PLAINES' 'ITASCA' 'CHCAGO' 'GURNEE' 'METTAWA']
['ARLINGTON HEIGHTS', 'BARRINGTON', 'BARTLETT', 'BENSENVILLE', 'BROOKFIELD', 'CHCAGO', 'CHGO', 'CHICAGO', 'CHICAGO RIDGE', 'CHIICAGO', 'DEERFIELD', 'DES PLAINES', 'DESPLAINES', 'DOLTON', 'EARLVILLE', 'ELGIN', 'ELK GROVE VILLAGE', 'ELMHURST', 'EVERGREEN PARK', 'FRANKFORT', 'GURNEE', 'ITASCA', 'JOHNSBURG', 'LOVES PARK', 'LYNWOOD', 'METTAWA', 'NEW LENOX', 'NORRIDGE', 'NORTHBROOK', 'OAKBROOK', 'ORLAND PARK', 'PLAINFIELD', 'SCHILLER PARK', 'SHOREWOOD', 'SKOKIE', 'ST. CHARLES', 'TINLEY PARK']
['CHICAGO' 'ELMHURST'