In [None]:
# Data sources (UCI Machine Learning Repository, Automobile data set):
# https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data
# https://archive.ics.uci.edu/static/public/10/data.csv

In [None]:
# Column names (note: symboling is listed last in this ordering):
# normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,symboling

In [None]:
import pandas as pd

# column name - 
columns = [
    "symboling","normalized-losses","make","fuel-type","aspiration","num-of-doors","body-style","drive-wheels","engine-location","wheel-base","length","width","height","curb-weight","engine-type","num-of-cylinders","engine-size","fuel-system","bore","stroke","compression-ratio","horsepower","peak-rpm","city-mpg","price"
    
]

# load url 
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
df = pd.read_csv(url,names=columns,na_values="?",sep=",",skipinitialspace=True,engine='python')

#df.to_csv("auto_mobile.csv")



In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.isna().sum()

In [None]:
df.info()

In [None]:
data_unique = ['make','fuel-type','aspiration','num-of-doors','bore','body-style','drive-wheels','engine-location','num-of-cylinders','fuel-system']
for col in data_unique:
    print(f'{col} : {df[col].unique()}')

In [None]:
df.isna().sum()

In [None]:
# handling missing value 
mean_fuel_system = df['fuel-system'].mean()
df['fuel-system'].fillna(mean_fuel_system,inplace=True)
print(df['fuel-system'].head())

In [None]:
mean_price = df['price'].mean()
df['price'].fillna(mean_price,inplace=True)
print(df['price'].head())

In [None]:
bore = df['bore'].mean()
df['bore'].fillna(bore,inplace=True)
print(df['bore'].head())

In [None]:
# symboling
symboling = df['symboling'].mean()
df['symboling'].fillna(symboling,inplace=True)
print(df['symboling'].head())

In [None]:
# horsepower
horsepower = df['horsepower'].mean()
df['horsepower'].fillna(horsepower,inplace=True)
print(df['horsepower'].head())

In [None]:
# compression-ratio
compression_ratio = df['compression-ratio'].mean()
df['compression-ratio'].fillna(compression_ratio,inplace=True)
print(df['compression-ratio'].head())

In [None]:
mode_aspiration = df['aspiration'].mode()[0]
df['aspiration'].fillna(mode_aspiration,inplace=True)
print(df['aspiration'].head())

In [None]:
df.dtypes


In [None]:
df.isna().sum()

In [None]:
df.to_csv("auto_mobile.csv")

In [None]:
# outlier Detection and handling 
# IQR method & Z - score

df.describe()

In [None]:
# IQR
q1 = df['horsepower'].quantile(0.25)
q3 = df['horsepower'].quantile(0.75)
iqr = q3-q1
lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr
outliers = df[(df['horsepower'] < lower) | (df['horsepower'] > upper)]
print(outliers[['make','horsepower']])
print(lower)
print(upper)
print(iqr)

In [None]:
# Z - score method 
%pip install scipy

In [None]:
from scipy import stats
import numpy as np
df['zscore'] = stats.zscore(df['horsepower'])
z_outliers = df[np.abs(df['zscore']) > 3]
print(z_outliers[['make','horsepower']])

In [None]:
df.head()

In [None]:
df.columns

In [None]:
# Encoding categorical Data (label Encoding one - hot )
df_onehot = pd.get_dummies(df , columns=['drive-wheels'],prefix='drive')
df_onehot.head()

In [None]:
# step 2 label (for ordinal has order) manual
cyl_map = {'four':4,'five':5,'two':2}
df['aspiration_encoded'] = df['aspiration'].map(cyl_map)
print(df[['aspiration','aspiration_encoded']])

In [None]:
df['num-of-cylinders'].tail()

In [None]:
df.columns

In [None]:
df.head()

In [None]:
# Data 3 Data Scalling & Normalization(min-max ,Standardization)
# min-max scaling to (0,1)
df['wheel-base-minmax'] = (df['wheel-base'] - df['wheel-base'].min()) / (df['wheel-base'].max() - df['wheel-base'].min())
# print(df[['wheel-base','wheel-base-minmax']].head())
print(df['wheel-base-minmax'])

In [None]:
# standardization (mean=0 , std=1) for wheel-base
df['wheel-base-std'] = (df['wheel-base'] - df['wheel-base'].mean()) / df['wheel-base'].std()
print(df['wheel-base-std'])

In [None]:
df.head()

In [None]:
df.dtypes

In [None]:
# feature Engineering & feature selection 
# step 1 - feature selection (correlation)
corr_matrix = df.corr(numeric_only=True)
print(corr_matrix['price'].sort_values(ascending=False))

In [None]:
# manual feature selection
features = ['engine-size','curb-weight','horsepower','width','length','wheel-base']
X = df[features]
y = df['price']

#print(X.head())
print(y.head())

In [None]:
df.dtypes

In [None]:
# add new feature - avg mpg 
df['avg-mpg'] = (df['city-mpg'] + df['peak-rpm']) / 2
print(df['avg-mpg'].head())


In [62]:
# handling imbalance data (Smote,undersampling,oversampling)
# Binary label: 1 when a row's price is above the mean price, otherwise 0.
mean_price = df['price'].mean()
df['high_price'] = df['price'].gt(mean_price).astype(int)
# class sizes show how unbalanced the two labels are
print(df['high_price'].value_counts().head())

high_price
0    131
1     74
Name: count, dtype: int64


In [63]:
# undersampling (reduce majority class)
majority = df[df['high_price'] == 0]
minority = df[df['high_price'] == 1]
# Draw as many majority rows as there are minority rows. The fixed random_state
# makes the draw repeatable, so a re-run yields the same balanced subset.
majority_under = majority.sample(len(minority), random_state=42)
df_under = pd.concat([majority_under,minority],axis=0)
print(df_under['high_price'].value_counts())

high_price
0    74
1    74
Name: count, dtype: int64
