<a href="https://colab.research.google.com/github/RayOfLight07/Elements_of_AI_ML/blob/main/Experiment_07.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Implement the feature scaling and one-hot encoding data-preprocessing
techniques on the dataset imported in Lab 4 or on any other dataset.

# **Upload the Dataset**

In [None]:
from google.colab import files
import pandas as pd

# Upload the dataset through Colab's file picker; the returned mapping is
# keyed by the uploaded file names.
uploaded = files.upload()

# If the picker is dismissed without choosing a file the mapping is empty and
# indexing its keys would raise an opaque IndexError, so fail with a clear message.
if not uploaded:
    raise ValueError("No file was uploaded. Re-run this cell and select a CSV file.")

# Load the dataset from the first uploaded file
file_name = next(iter(uploaded))
df = pd.read_csv(file_name)

# Display the first few rows of the dataset
print("\nOriginal Dataset:")
print(df.head())


Saving sample_invoice_data.csv to sample_invoice_data.csv

Original Dataset:
  Invoice ID     Branch       City Customer type  Gender Product line  \
0   INV-8332  Bangalore     Mumbai       Premium    Male    Furniture   
1   INV-1309     Mumbai      Delhi       Premium  Female    Furniture   
2   INV-7661      Delhi    Chennai       Regular  Female    Groceries   
3   INV-5227      Delhi  Bangalore       Regular    Male  Electronics   
4   INV-2877     Mumbai     Mumbai       Premium  Female  Electronics   

   Unit price  Quantity        Date   Time Payment  Tax 18%    Total  \
0      605.47         3  2023-09-17  14:43     UPI   326.95  2143.36   
1       97.27         5  2023-12-10  13:13    Cash    87.54   573.89   
2     1268.79         2  2023-03-25  11:26    Card   456.76  2994.34   
3      542.51         1  2023-02-15  17:24     UPI    97.65   640.16   
4     1354.46         4  2023-06-08  13:03     UPI   975.21  6393.05   

   gross income  
0        326.95  
1         87.54

# **Identify Numerical and Categorical Columns**

In [None]:
# Columns holding continuous/count values (candidates for feature scaling)
numerical_columns = [
    'Unit price',
    'Quantity',
    'Tax 18%',
    'Total',
    'gross income',
]

# Columns holding discrete labels (candidates for one-hot encoding)
categorical_columns = [
    'Branch',
    'City',
    'Customer type',
    'Gender',
    'Product line',
    'Payment',
]

# Echo both lists so later cells can be checked against them
print(f"\nNumerical Columns: {numerical_columns}")
print(f"\nCategorical Columns: {categorical_columns}")



Numerical Columns: ['Unit price', 'Quantity', 'Tax 18%', 'Total', 'gross income']

Categorical Columns: ['Branch', 'City', 'Customer type', 'Gender', 'Product line', 'Payment']


# **Feature Scaling**

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Work on a copy so the original frame `df` is left untouched
df_scaled = df.copy()

# Columns handled by each scaler
standard_columns = ['Unit price', 'Quantity']
minmax_columns = ['Total', 'gross income']

# Fit each scaler ONCE on all of its columns. Both scalers work column by
# column, so the scaled values are the same as fitting one column at a time,
# but the fitted scaler now keeps the parameters of every column. Re-fitting
# the same instance per column would leave it holding only the last column's
# statistics, which breaks any later transform()/inverse_transform().
scaler_standard = StandardScaler()
standardized_values = scaler_standard.fit_transform(df[standard_columns])
for position, column in enumerate(standard_columns):
    df_scaled[f'{column} Standardized'] = standardized_values[:, position]

# Apply MinMaxScaler to 'Total' and 'gross income'
scaler_minmax = MinMaxScaler()
normalized_values = scaler_minmax.fit_transform(df[minmax_columns])
for position, column in enumerate(minmax_columns):
    df_scaled[f'{column} Normalized'] = normalized_values[:, position]

# Show each original column next to its scaled counterpart
print("\nDataset after Feature Scaling:")
print(df_scaled[['Unit price', 'Unit price Standardized', 'Quantity', 'Quantity Standardized', 'Total', 'Total Normalized', 'gross income', 'gross income Normalized']].head())



Dataset after Feature Scaling:
   Unit price  Unit price Standardized  Quantity  Quantity Standardized  \
0      605.47                -0.735081         3               0.211604   
1       97.27                -1.463764         5               1.622295   
2     1268.79                 0.216019         2              -0.493742   
3      542.51                -0.825357         1              -1.199088   
4     1354.46                 0.338857         4               0.916949   

     Total  Total Normalized  gross income  gross income Normalized  
0  2143.36          0.222280        326.95                 0.222275  
1   573.89          0.052940         87.54                 0.052936  
2  2994.34          0.314097        456.76                 0.314092  
3   640.16          0.060091         97.65                 0.060087  
4  6393.05          0.680805        975.21                 0.680801  


# **One-Hot Encoding**

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Dense output, and the first category of every feature is dropped to avoid
# the dummy variable trap.
encoder = OneHotEncoder(drop='first', sparse_output=False)

# Learn the categories, then encode the categorical columns
categorical_frame = df[categorical_columns]
encoder.fit(categorical_frame)
encoded_categories = encoder.transform(categorical_frame)

# Wrap the encoded array in a DataFrame that shares the index of `df`
encoded_columns = encoder.get_feature_names_out(categorical_columns)
encoded_df = pd.DataFrame(
    data=encoded_categories,
    index=df.index,
    columns=encoded_columns,
)

print("\nOne-Hot Encoded Columns:")
print(encoded_df.head())


One-Hot Encoded Columns:
   Branch_Delhi  Branch_Mumbai  City_Chennai  City_Delhi  City_Mumbai  \
0           0.0            0.0           0.0         0.0          1.0   
1           0.0            1.0           0.0         1.0          0.0   
2           1.0            0.0           1.0         0.0          0.0   
3           1.0            0.0           0.0         0.0          0.0   
4           0.0            1.0           0.0         0.0          1.0   

   Customer type_Regular  Gender_Male  Product line_Clothing  \
0                    0.0          1.0                    0.0   
1                    0.0          0.0                    0.0   
2                    1.0          0.0                    0.0   
3                    1.0          1.0                    0.0   
4                    0.0          0.0                    0.0   

   Product line_Electronics  Product line_Furniture  Product line_Groceries  \
0                       0.0                     1.0                    

# **Combine Scaled and Encoded Data**

In [None]:
# Append the one-hot encoded columns to the right of the scaled frame
frames_to_join = [df_scaled, encoded_df]
df_processed = pd.concat(objs=frames_to_join, axis="columns")

# Preview the combined result
print("\nProcessed Dataset with Feature Scaling and One-Hot Encoding:")
print(df_processed.head())



Processed Dataset with Feature Scaling and One-Hot Encoding:
  Invoice ID     Branch       City Customer type  Gender Product line  \
0   INV-8332  Bangalore     Mumbai       Premium    Male    Furniture   
1   INV-1309     Mumbai      Delhi       Premium  Female    Furniture   
2   INV-7661      Delhi    Chennai       Regular  Female    Groceries   
3   INV-5227      Delhi  Bangalore       Regular    Male  Electronics   
4   INV-2877     Mumbai     Mumbai       Premium  Female  Electronics   

   Unit price  Quantity        Date   Time  ... City_Delhi  City_Mumbai  \
0      605.47         3  2023-09-17  14:43  ...        0.0          1.0   
1       97.27         5  2023-12-10  13:13  ...        1.0          0.0   
2     1268.79         2  2023-03-25  11:26  ...        0.0          0.0   
3      542.51         1  2023-02-15  17:24  ...        0.0          0.0   
4     1354.46         4  2023-06-08  13:03  ...        0.0          1.0   

   Customer type_Regular  Gender_Male  Product l

# **Save and Download the Processed Dataset**

In [None]:
import os

# Destination of the processed dataset
output_file_name = "/mnt/data/processed_invoice_data.csv"

# to_csv raises "OSError: Cannot save file into a non-existent directory"
# when the parent folder is missing, so create it first.
os.makedirs(os.path.dirname(output_file_name), exist_ok=True)

# Save the processed dataset (without the index column)
df_processed.to_csv(output_file_name, index=False)

# Provide download link
from google.colab import files
files.download(output_file_name)


OSError: Cannot save file into a non-existent directory: '/mnt/data'

In [None]:
import os
from google.colab import files

# Where the processed dataset is written
output_file_name = "/mnt/data/processed_invoice_data.csv"

# Make sure the target folder exists before writing into it
output_directory = os.path.dirname(output_file_name)
os.makedirs(output_directory, exist_ok=True)

# Write the CSV without the index column
df_processed.to_csv(output_file_name, index=False)

# Hand the saved file to the browser for download
files.download(output_file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>