### Perform various data preprocessing techniques like handling missing data and feature scaling.

#### step 1: Start by importing the necessary Python libraries for data preprocessing.


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

#### Step 2: Load the Automobile dataset into a Pandas DataFrame.

In [None]:
# Colab-only: open the browser upload dialog so the CSV can be copied into the
# runtime's working directory. The call's return value is kept in `uploaded`.
from google.colab import files
uploaded = files.upload()

Saving Automobile.csv to Automobile.csv


#### Step 3: Take a quick look at the data to understand its structure and identify any missing values or anomalies.

In [None]:
# Load the uploaded CSV, using its first column as the row index
# (the preview output shows that column is the car name).
DF = pd.read_csv("Automobile.csv",index_col=0)
# Show the first rows to check the column layout and spot missing values.
display(DF.head())

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
chevrolet chevelle malibu,18.0,8.0,307.0,130.0,3504.0,12.0,70,usa
buick skylark 320,15.0,8.0,350.0,165.0,3693.0,11.5,70,usa
plymouth satellite,18.0,8.0,318.0,150.0,3436.0,11.0,70,usa
amc rebel sst,16.0,8.0,304.0,150.0,3433.0,12.0,70,usa
ford torino,17.0,,302.0,140.0,3449.0,10.5,70,usa


#### The method isnull() checks each element in the DataFrame (or Series) to see if it is NaN (Not a Number) or None (missing value).
It returns a DataFrame (or Series) of the same shape as the input, with Boolean values:
#### True: The value is null (NaN or None).
#### False: The value is not null.

In [None]:
# Count the missing values in every column. `sum` has to be called: without the
# parentheses the cell only displays the bound method object, not the counts.
DF.isnull().sum()

#### Step 4: Handle Missing Data
#### Option 1: If the dataset is large and only a small percentage of the data is missing, you can remove the rows with missing values using dropna(subset=[...], inplace=True).


In [None]:
# Keep only the rows that have a value in "cylinders". Rebinding DF to the
# filtered frame leaves DF holding the same rows as an in-place drop would.
DF = DF.dropna(subset=["cylinders"])

#### Option 2: If removing data isn't ideal, you can impute the missing values with the mean, median, or most frequent value, e.g. df["col"] = df["col"].fillna(df["col"].mean()).

In [None]:
# Summarise the frame: number of rows, per-column non-null counts and dtypes.
# Any column whose non-null count is below the row count still has missing values.
DF.info()

<class 'pandas.core.frame.DataFrame'>
Index: 395 entries, chevrolet chevelle malibu to chevy s-10
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           395 non-null    float64
 1   cylinders     395 non-null    float64
 2   displacement  392 non-null    float64
 3   horsepower    383 non-null    float64
 4   weight        393 non-null    float64
 5   acceleration  392 non-null    float64
 6   model_year    395 non-null    int64  
 7   origin        395 non-null    object 
dtypes: float64(6), int64(1), object(1)
memory usage: 27.8+ KB


#### Step 5: Feature Scaling


<img src="https://i.postimg.cc/G21gMYnF/f.png" alt="Image Description" width="500">









#### Option 1: This method scales the data to have a mean of 0 and a standard deviation of 1.
### StandardScaler()

In [None]:
# Numeric columns to standardise; the non-numeric "origin" column is not listed
# and is therefore left untouched.
c = [
    "mpg",
    "cylinders",
    "displacement",
    "horsepower",
    "weight",
    "acceleration",
    "model_year",
]
SC1 = StandardScaler()
# Learn the per-column statistics first, then overwrite the columns in DF with
# their standardised values.
SC1.fit(DF[c])
DF[c] = SC1.transform(DF[c])
DF.head()

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
chevrolet chevelle malibu,-0.708724,1.507879,1.099204,0.678741,0.639265,-1.317432,-1.63972,usa
buick skylark 320,-1.09293,1.507879,1.51295,1.598805,0.862859,-1.500747,-1.63972,usa
plymouth satellite,-0.708724,1.507879,1.205046,1.204492,0.558819,-1.684061,-1.63972,usa
amc rebel sst,-0.964862,1.507879,1.070338,1.204492,0.55527,-1.317432,-1.63972,usa
ford galaxie 500,-1.09293,1.507879,2.273089,2.466294,1.629464,-2.05069,-1.63972,usa


#### Option 2: This method scales the data to a fixed range, usually between 0 and 1.
###  MinMaxScaler()

In [None]:
# Numeric columns to rescale; the non-numeric "origin" column is not listed
# and is therefore left untouched.
c = [
    "mpg",
    "cylinders",
    "displacement",
    "horsepower",
    "weight",
    "acceleration",
    "model_year",
]
SC2 = MinMaxScaler()
# Learn each column's range first, then overwrite the columns in DF with the
# rescaled values. The scaler is fitted on whatever DF currently holds, so any
# earlier in-place scaling of these columns is what gets rescaled here.
SC2.fit(DF[c])
DF[c] = SC2.transform(DF[c])
DF.head()

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
chevrolet chevelle malibu,0.239362,1.0,0.617571,0.456522,0.53615,0.238095,0.0,usa
buick skylark 320,0.159574,1.0,0.728682,0.646739,0.589736,0.208333,0.0,usa
plymouth satellite,0.239362,1.0,0.645995,0.565217,0.51687,0.178571,0.0,usa
amc rebel sst,0.18617,1.0,0.609819,0.565217,0.516019,0.238095,0.0,usa
ford galaxie 500,0.159574,1.0,0.932817,0.826087,0.773462,0.119048,0.0,usa


####  Step 6: Separate the dataset into features (X) and target (y) variables. The target is usually the column you want to predict.

In [None]:
# Feature matrix: the numeric columns other than the target. The target must not
# appear among the features: it would leak the value being predicted into X and
# produce a duplicated column when X and Y are concatenated for export.
X = DF[["mpg", "displacement", "horsepower", "weight", "acceleration", "model_year"]]
# Target vector.
# NOTE(review): "cylinders" is kept from the original cell; confirm it is the
# column this lab intends to predict.
Y = DF["cylinders"]
Y.head()

Unnamed: 0_level_0,cylinders
name,Unnamed: 1_level_1
chevrolet chevelle malibu,1.0
buick skylark 320,1.0
plymouth satellite,1.0
amc rebel sst,1.0
ford galaxie 500,1.0



### Step 7: After preprocessing, save the cleaned and scaled dataset to a new CSV file


In [None]:
# Lab-1 Activities

# Perform data preprocessing for Automobile.csv

# i. Delete the column horsepower since it has a few missing values

# ii. Impute missing values with the median

# iii. Apply min-max scaling and standardization on Automobile.csv and provide the reasoning for which feature scaling method makes more sense for this dataset.

# Combine features and target for export. If the target column is also present
# in X, drop it from the feature side first so the CSV does not end up with two
# columns of the same name.
final = pd.concat([X.drop(columns=[Y.name], errors="ignore"), Y], axis=1)
# index=False leaves the row index (the car names) out of the written file.
final.to_csv("Pre.csv", index=False)

In [None]:
# Colab-only: hand the exported CSV from the runtime to the browser so it can be
# saved on the local machine.
from google.colab import files
files.download("Pre.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>