# Dealing With Categorical Values

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np

## Importing Dataset

In [2]:
# Read the salary CSV into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer="/content/Salary_Dataset.csv")

In [3]:
# Preview up to the first 40 rows of the loaded data.
dataset.head(n=40)

Unnamed: 0,country,Salary,YearsExperience,Purchased
0,Dubai,39343.0,1.1,No
1,Canada,46205.0,1.3,Yes
2,Canada,37731.0,1.5,No
3,Canada,43525.0,2.0,No
4,USA,39891.0,2.2,No
5,Dubai,56642.0,2.9,No
6,Canada,60150.0,3.0,Yes
7,Australia,54445.0,3.2,No
8,Dubai,64445.0,3.2,Yes
9,Dubai,57189.0,3.7,No


In [4]:
# Plain-text dump of the country column, index labels included.
print(dataset["country"])

0         Dubai
1        Canada
2        Canada
3        Canada
4           USA
5         Dubai
6        Canada
7     Australia
8         Dubai
9         Dubai
10        Dubai
11          USA
12        Dubai
13       Canada
14    Australia
15          USA
16          USA
17       Canada
18    Australia
19       Canada
20       Canada
21    Australia
22    Australia
23    Australia
24        Dubai
25    Australia
26        Dubai
27          USA
28          USA
29       Canada
Name: country, dtype: object


In [5]:
# Same column again, this time rendered without the index labels.
print(dataset.country.to_string(index=False))

    Dubai
   Canada
   Canada
   Canada
      USA
    Dubai
   Canada
Australia
    Dubai
    Dubai
    Dubai
      USA
    Dubai
   Canada
Australia
      USA
      USA
   Canada
Australia
   Canada
   Canada
Australia
Australia
Australia
    Dubai
Australia
    Dubai
      USA
      USA
   Canada


# Let's Perform Encoding

In [6]:
# The pd.get_dummies() function in pandas converts categorical variables into dummy/indicator variables.

## Applying One-Hot Encoding

<h3> First, do it with pandas </h3>

In [7]:
# One-hot encode a categorical column with pandas.get_dummies.
import pandas as pd

# Sample data: a single categorical column holding 30 country labels.
data = {
    'country': [
        'Dubai', 'Canada', 'Canada', 'Canada', 'USA', 'Dubai', 'Canada', 'Australia', 'Dubai', 'Dubai',
        'Dubai', 'USA', 'Dubai', 'Canada', 'Australia', 'USA', 'USA', 'Canada', 'Australia', 'Canada',
        'Canada', 'Australia', 'Australia', 'Australia', 'Dubai', 'Australia', 'Dubai', 'USA', 'USA', 'Canada',
    ]
}

df = pd.DataFrame(data)

# Replace 'country' with one indicator column per distinct value.
# dtype=int is requested explicitly: without it, recent pandas releases return
# boolean (True/False) indicators instead of the 0/1 integers shown here.
df_encoded = pd.get_dummies(df, columns=['country'], dtype=int)

# Display the resulting DataFrame with one-hot encoded columns
print(df_encoded)


    country_Australia  country_Canada  country_Dubai  country_USA
0                   0               0              1            0
1                   0               1              0            0
2                   0               1              0            0
3                   0               1              0            0
4                   0               0              0            1
5                   0               0              1            0
6                   0               1              0            0
7                   1               0              0            0
8                   0               0              1            0
9                   0               0              1            0
10                  0               0              1            0
11                  0               0              0            1
12                  0               0              1            0
13                  0               1              0            0
14        

In [8]:
import pandas as pd

# Wrap the loaded dataset and the dummy-variable frame as DataFrames.
df1, df2 = pd.DataFrame(dataset), pd.DataFrame(df_encoded)

# Rows 0-29 of each frame. These slices are not used by the concatenation below.
df11, df22 = df1.iloc[:30], df2.iloc[:30]

# Put the dummy columns to the right of the original columns (axis=1),
# matching rows by index label.
df = pd.concat([df1, df2], axis="columns")

# Text dump of the combined frame.
print(df)



      country    Salary  YearsExperience Purchased  country_Australia  \
0       Dubai   39343.0              1.1        No                  0   
1      Canada   46205.0              1.3       Yes                  0   
2      Canada   37731.0              1.5        No                  0   
3      Canada   43525.0              2.0        No                  0   
4         USA   39891.0              2.2        No                  0   
5       Dubai   56642.0              2.9        No                  0   
6      Canada   60150.0              3.0       Yes                  0   
7   Australia   54445.0              3.2        No                  1   
8       Dubai   64445.0              3.2       Yes                  0   
9       Dubai   57189.0              3.7        No                  0   
10      Dubai   63218.0              3.9       Yes                  0   
11        USA   55794.0              4.0        No                  0   
12      Dubai   56957.0              4.0       Yes 

In [9]:
# Remove the original text column 'country'; its one-hot columns stay in df.
# df is rebound to the result instead of being mutated with inplace=True, and
# errors='ignore' makes a second run of this cell a no-op rather than a KeyError.
df = df.drop(columns='country', errors='ignore')


In [10]:
# Show the combined frame after the 'country' column has been dropped.
df

Unnamed: 0,Salary,YearsExperience,Purchased,country_Australia,country_Canada,country_Dubai,country_USA
0,39343.0,1.1,No,0,0,1,0
1,46205.0,1.3,Yes,0,1,0,0
2,37731.0,1.5,No,0,1,0,0
3,43525.0,2.0,No,0,1,0,0
4,39891.0,2.2,No,0,0,0,1
5,56642.0,2.9,No,0,0,1,0
6,60150.0,3.0,Yes,0,1,0,0
7,54445.0,3.2,No,1,0,0,0
8,64445.0,3.2,Yes,0,0,1,0
9,57189.0,3.7,No,0,0,1,0


# <h3> Let's do it with Scikit-Learn </h3>
Also, restart your kernel if you are using the same notebook.


In [11]:
import sklearn as skl

In [12]:
# Reload the dataset for the scikit-learn walkthrough.
# Label encoding is intended for the 'Purchased' column, as it has only two unique values.
# NOTE(review): this cell only loads the data; no label-encoding step is visible
# in this part of the notebook - confirm whether it is still planned.

# pandas is imported here because this section asks for a kernel restart,
# after which the `pd` name from the first cell is no longer defined.
import pandas as pd

dataset1 = pd.read_csv("/content/Salary_Dataset.csv")

In [13]:
# Preview up to the first 40 rows of the reloaded data.
dataset1.head(n=40)

Unnamed: 0,country,Salary,YearsExperience,Purchased
0,Dubai,39343.0,1.1,No
1,Canada,46205.0,1.3,Yes
2,Canada,37731.0,1.5,No
3,Canada,43525.0,2.0,No
4,USA,39891.0,2.2,No
5,Dubai,56642.0,2.9,No
6,Canada,60150.0,3.0,Yes
7,Australia,54445.0,3.2,No
8,Dubai,64445.0,3.2,Yes
9,Dubai,57189.0,3.7,No


In [15]:
!pip install -U scikit-learn



In [24]:
# One-hot encode the country column with scikit-learn's OneHotEncoder.
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Sample data: a single categorical column holding 30 country labels.
data = {
    'country': [
        'Dubai', 'Canada', 'Canada', 'Canada', 'USA', 'Dubai', 'Canada', 'Australia', 'Dubai', 'Dubai',
        'Dubai', 'USA', 'Dubai', 'Canada', 'Australia', 'USA', 'USA', 'Canada', 'Australia', 'Canada',
        'Canada', 'Australia', 'Australia', 'Australia', 'Dubai', 'Australia', 'Dubai', 'USA', 'USA', 'Canada',
    ]
}

# Create a DataFrame
df = pd.DataFrame(data)

# sparse_output=False asks the encoder for a dense array instead of a SciPy
# sparse matrix. The former `sparse` keyword was deprecated in scikit-learn 1.2
# and removed in 1.4, so it raises a TypeError on an up-to-date install such as
# the one produced by the `pip install -U scikit-learn` cell.
encoder = OneHotEncoder(sparse_output=False)

# Learn the categories and encode the column in one step.
# The double brackets keep the input two-dimensional (a one-column DataFrame).
encoded_data = encoder.fit_transform(df[['country']])

# Wrap the array in a DataFrame, using the encoder's generated column names
# (country_<value>).
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(['country']))

# Display the resulting DataFrame with one-hot encoded columns
print(encoded_df)


    country_Australia  country_Canada  country_Dubai  country_USA
0                 0.0             0.0            1.0          0.0
1                 0.0             1.0            0.0          0.0
2                 0.0             1.0            0.0          0.0
3                 0.0             1.0            0.0          0.0
4                 0.0             0.0            0.0          1.0
5                 0.0             0.0            1.0          0.0
6                 0.0             1.0            0.0          0.0
7                 1.0             0.0            0.0          0.0
8                 0.0             0.0            1.0          0.0
9                 0.0             0.0            1.0          0.0
10                0.0             0.0            1.0          0.0
11                0.0             0.0            0.0          1.0
12                0.0             0.0            1.0          0.0
13                0.0             1.0            0.0          0.0
14        



In [25]:
# Convert all float columns of the encoded frame to integers.
#
# One astype call with a column -> dtype mapping replaces the per-column loop.
# It returns a new frame, so `encoded_df` is never modified through `df` and
# the cell produces the same result every time it is run.
float_columns = [name for name, dtype in encoded_df.dtypes.items() if dtype == float]
df = encoded_df.astype({name: int for name in float_columns})

In [26]:
# Show the encoded frame after its float columns were cast to integers.
df

Unnamed: 0,country_Australia,country_Canada,country_Dubai,country_USA
0,0,0,1,0
1,0,1,0,0
2,0,1,0,0
3,0,1,0,0
4,0,0,0,1
5,0,0,1,0
6,0,1,0,0
7,1,0,0,0
8,0,0,1,0
9,0,0,1,0


In [27]:
# Join the one-hot encoded country columns onto the reloaded dataset.
import pandas as pd

# Left-hand side: the dataset as read from the CSV.
df1 = pd.DataFrame(dataset1)

# Right-hand side: the encoded columns as integers. They are taken from
# `encoded_df` rather than from the current value of `df`, because `df` is
# reassigned at the end of this cell; reading it here would make a second run
# concatenate an already-combined frame and duplicate its columns.
df2 = encoded_df.astype(int)

# Rows 0-29 of each frame. These slices are not used by the concatenation below.
df11 = df1.iloc[0:30]
df22 = df2.iloc[0:30]

# Put the encoded columns to the right of the original columns (axis=1),
# matching rows by index label.
df = pd.concat([df1, df2], axis=1)

# Display the concatenated DataFrame
print(df)



      country    Salary  YearsExperience Purchased  country_Australia  \
0       Dubai   39343.0              1.1        No                  0   
1      Canada   46205.0              1.3       Yes                  0   
2      Canada   37731.0              1.5        No                  0   
3      Canada   43525.0              2.0        No                  0   
4         USA   39891.0              2.2        No                  0   
5       Dubai   56642.0              2.9        No                  0   
6      Canada   60150.0              3.0       Yes                  0   
7   Australia   54445.0              3.2        No                  1   
8       Dubai   64445.0              3.2       Yes                  0   
9       Dubai   57189.0              3.7        No                  0   
10      Dubai   63218.0              3.9       Yes                  0   
11        USA   55794.0              4.0        No                  0   
12      Dubai   56957.0              4.0       Yes 

In [28]:
# Remove the original text column 'country'; its one-hot columns stay in df.
# df is rebound to the result instead of being mutated with inplace=True, and
# errors='ignore' makes a second run of this cell a no-op rather than a KeyError.
df = df.drop(columns='country', errors='ignore')

In [29]:
# Show the combined frame after the 'country' column has been dropped.
df

Unnamed: 0,Salary,YearsExperience,Purchased,country_Australia,country_Canada,country_Dubai,country_USA
0,39343.0,1.1,No,0,0,1,0
1,46205.0,1.3,Yes,0,1,0,0
2,37731.0,1.5,No,0,1,0,0
3,43525.0,2.0,No,0,1,0,0
4,39891.0,2.2,No,0,0,0,1
5,56642.0,2.9,No,0,0,1,0
6,60150.0,3.0,Yes,0,1,0,0
7,54445.0,3.2,No,1,0,0,0
8,64445.0,3.2,Yes,0,0,1,0
9,57189.0,3.7,No,0,0,1,0
