In [1]:
import pandas as pd
import numpy as np 

In [2]:
atm=pd.read_csv(r"C:\Users\CVR\Downloads\COE\Automobile.csv")

In [3]:
atm

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,number_of_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,168,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,168,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,168,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845
197,-1,95,volvo,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045
198,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485
199,-1,95,volvo,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106,4800,26,27,22470


In [4]:
atm.dtypes

symboling                int64
normalized_losses        int64
make                    object
fuel_type               object
aspiration              object
number_of_doors         object
body_style              object
drive_wheels            object
engine_location         object
wheel_base             float64
length                 float64
width                  float64
height                 float64
curb_weight              int64
engine_type             object
number_of_cylinders     object
engine_size              int64
fuel_system             object
bore                   float64
stroke                 float64
compression_ratio      float64
horsepower               int64
peak_rpm                 int64
city_mpg                 int64
highway_mpg              int64
price                    int64
dtype: object

**1)Handling missing values**

In [5]:
 #dentify missing values in normalized_losses, bore, stroke, horsepower, 
peak_rpm, price.
atm.isna()

SyntaxError: invalid syntax (3351219376.py, line 2)

In [None]:
atm.isna().sum()

In [None]:
#no missing values but if mising values then 
#filling null values with mean/median /mode
#horsepower-mean
#price-median
#number_of_doors,fuel_type -mode
atm['horsepower'].fillna(atm['horsepower'].mean(), inplace=True)


In [None]:
atm['price'].fillna(atm['price'].median(), inplace=True)


In [None]:
atm['number_of_doors'].fillna(atm['number_of_doors'].mode(), inplace=True)


In [None]:
atm['fuel_type'].fillna(atm['fuel_type'].mode(), inplace=True)

In [None]:
#drop records if too many missing lines 
#atm.dropna()

In [None]:
atm.dtypes

**2)string and categorical data preprocessing**

In [None]:
#Convert categorical data into lowercase and remove special characters 
cols = ['make', 'fuel_type', 'aspiration', 'number_of_doors', 'body_style',
                    'drive_wheels', 'engine_location', 'engine_type', 'number_of_cylinders',
                    'fuel_system']
for column in cols:
    atm[column] = atm[column].str.lower().str.replace(r'[^a-z0-9\s]', '', regex=True)


In [None]:
atm

In [None]:
for column in cols:
    print(column)
    print(atm[column].unique())

In [None]:
#standardize number_of_cylinders:
#words to numbers
atm['number_of_cylinders']

In [None]:
nums={'four':4, 'six':6 ,'five':5, 'three':3, 'twelve':12, 'two':2, 'eight':8}
atm['number_of_cylinders']=atm['number_of_cylinders'].replace(nums)
atm['number_of_cylinders']
#converting words to integers

In [None]:
#standardize fuel_system categories
fuel_system_cat={
'mpfi': 'Multi-Point Fuel Injection',
    'mfi': 'Multi-Point Fuel Injection',
    'spfi': 'Single-Point Fuel Injection',
    'spdi': 'Single-Point Fuel Injection',
    '2bbl': 'Two-Barrel Carburetor',
    '4bbl': 'Four-Barrel Carburetor',
    'idi': 'Indirect Fuel Injection',
    '1bbl': 'One-Barrel Carburetor'
}

# Standardize the 'fuel_system' column using the mapping dictionary
atm['fuel_system'] = atm['fuel_system'].replace(fuel_system_cat)

# Check the updated DataFrame
print(atm['fuel_system'].unique()) 


In [None]:
atm.dtypes

**3)Numeric Feature Procesing**

In [None]:
#convert "horsepower" & "peak_rpm" to ints

atm['horsepower']=atm['horsepower'].astype(np.int16)
atm['peak_rpm']=atm['peak_rpm'].astype(np.int16)

In [None]:
atm['horsepower'].dtypes

In [None]:
atm['peak_rpm'].dtypes

In [None]:
atm['horsepower']

In [None]:
#standardize units
atm['horsepower']=atm['horsepower']*0.7457
atm['horsepower']


**4)Feature Engineering on Numeric Columns**

In [None]:
#create new features
#Power-to-weight ratio = horsepower / curb_weight. 
atm['Power-to-weight ratio']=atm['horsepower']/atm['curb_weight']
atm['Power-to-weight ratio']

In [None]:
atm

In [None]:
#Engine efficiency = horsepower / engine_size.
atm['Engine efficiency']=atm['horsepower']/atm['engine_size']
atm['Engine efficiency']

In [None]:
#Fuel efficiency = city_mpg / highway_mpg. 
atm['Fuel efficiency']=atm['city_mpg']/atm['highway_mpg']
atm['Fuel efficiency']

In [None]:
atm['compression_ratio']

In [None]:
cats=[]
for value in atm['compression_ratio']:
    if value<9:
        cats.append('Low')
    elif 9<= value <=11:
        cats.append('Medium')
    else:
        cats.append('High')
atm['compression_ratio']=cats
atm['compression_ratio']

**5)Handling Date or Range Values**

In [None]:

#Extract ranges from normalized_losses and create bins (e.g., Low, Medium, High 
#loss categories).
atm['normalized_losses']

In [None]:
atm['normalized_losses'].unique()


In [None]:
ranges=[]
for value in atm['normalized_losses']:
    if 95 <= value >=120:
        ranges.append('Low')
    elif 121 <= value >=145:
        ranges.append('Medium')
    else:
        ranges.append('High')
               
atm['normalized_losses']=ranges
atm['normalized_losses']


In [None]:
atm

**6)Encoding Categorical Features**

In [6]:
#Label Encoding for fuel_type, aspiration, drive_wheels, etc. 
#One-Hot Encoding for body_style, engine_type, and fuel_system.

from sklearn.preprocessing import LabelEncoder

In [9]:
cat_cols = ['fuel_type', 'aspiration', 'drive_wheels']

label_encoder = LabelEncoder()
for column in cat_cols:
    atm[column] = label_encoder.fit_transform(atm[column])

print(atm.head())


   symboling  normalized_losses         make  fuel_type  aspiration  \
0          3                168  alfa-romero          1           0   
1          3                168  alfa-romero          1           0   
2          1                168  alfa-romero          1           0   
3          2                164         audi          1           0   
4          2                164         audi          1           0   

  number_of_doors   body_style  drive_wheels engine_location  wheel_base  ...  \
0             two  convertible             2           front        88.6  ...   
1             two  convertible             2           front        88.6  ...   
2             two    hatchback             2           front        94.5  ...   
3            four        sedan             1           front        99.8  ...   
4            four        sedan             0           front        99.4  ...   

   engine_size  fuel_system  bore  stroke compression_ratio horsepower  \
0          1

In [10]:

atm_encoded = pd.get_dummies(atm, columns=['body_style', 'engine_type', 'fuel_system'], drop_first=True)
print(atm_encoded.head())

   symboling  normalized_losses         make  fuel_type  aspiration  \
0          3                168  alfa-romero          1           0   
1          3                168  alfa-romero          1           0   
2          1                168  alfa-romero          1           0   
3          2                164         audi          1           0   
4          2                164         audi          1           0   

  number_of_doors  drive_wheels engine_location  wheel_base  length  ...  \
0             two             2           front        88.6   168.8  ...   
1             two             2           front        88.6   168.8  ...   
2             two             2           front        94.5   171.2  ...   
3            four             1           front        99.8   176.6  ...   
4            four             0           front        99.4   176.6  ...   

   engine_type_ohcf  engine_type_ohcv  engine_type_rotor fuel_system_2bbl  \
0             False             False  

**7)Data Scaling & Normalization**

In [None]:
#Scale numerical features (length, width, height, curb_weight, engine_size) 
#using Min-Max Scaling or Standardization. 

In [12]:
atm.dtypes

symboling                int64
normalized_losses        int64
make                    object
fuel_type                int64
aspiration               int64
number_of_doors         object
body_style              object
drive_wheels             int64
engine_location         object
wheel_base             float64
length                 float64
width                  float64
height                 float64
curb_weight              int64
engine_type             object
number_of_cylinders     object
engine_size              int64
fuel_system             object
bore                   float64
stroke                 float64
compression_ratio      float64
horsepower               int64
peak_rpm                 int64
city_mpg                 int64
highway_mpg              int64
price                    int64
dtype: object

In [16]:
colus=['length','width','height','curb_weight','engine_size']
for col in colus:
    print(col)
    print("******")
    print("min:",atm[col].min())
    print("max:",atm[col].min())
    

length
******
min: 141.1
max: 141.1
width
******
min: 60.3
max: 60.3
height
******
min: 47.8
max: 47.8
curb_weight
******
min: 1488
max: 1488
engine_size
******
min: 61
max: 61
