<a href="https://colab.research.google.com/github/Santanukolkata/Data_Science/blob/master/Models/Preprocessing/One_hot_encoding_sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Encode categorical features as a one-hot numeric array.

The input to this transformer should be an array-like of integers or strings, denoting the values taken on by categorical (discrete) features. The features are encoded using a one-hot (aka ‘one-of-K’ or ‘dummy’) encoding scheme. This creates a binary column for each category and returns a sparse matrix or dense array (depending on the sparse parameter)

By default, the encoder derives the categories based on the unique values in each feature. Alternatively, you can also specify the categories manually.

This encoding is needed for feeding categorical data to many scikit-learn estimators, notably linear models and SVMs with the standard kernels.

Note: a one-hot encoding of y labels should use a LabelBinarizer instead.

In [0]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [0]:
data = { 'Company' :['Huawei', 'BT', 'TCS'],
        'PLSQL': [3.2,0,0],
        'OBIEE': [0, 1.08, .6],
        'BICP': [0, 1.08, 1],
        'PYTHON': [0, 0, 2],
        'Y_AXIS':[.1,1.1,2.1],
        'X_AXIS' :[1.6,4.36,7.8],
        'TEXT':['<--Aug10--Oct13-->','<-Dec13--Jan16->','<-Jul16--Current->']}
df_exp =pd.DataFrame.from_dict(data)
df_exp

Unnamed: 0,Company,PLSQL,OBIEE,BICP,PYTHON,Y_AXIS,X_AXIS,TEXT
0,Huawei,3.2,0.0,0.0,0,0.1,1.6,<--Aug10--Oct13-->
1,BT,0.0,1.08,1.08,0,1.1,4.36,<-Dec13--Jan16->
2,TCS,0.0,0.6,1.0,2,2.1,7.8,<-Jul16--Current->


In [0]:
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(np.array(df_exp['Company']).reshape(-1,1))

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='ignore', sparse=True)

In [0]:
Columns =[comp for comp in items for items in enc.categories_]
Columns

['BT', 'Huawei', 'TCS']

In [0]:
enc.transform(np.array(df_exp['Company']).reshape(-1,1)).toarray()

array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.]])

In [0]:
Columns
df_encoded=pd.DataFrame(enc.transform(np.array(df_exp['Company']).reshape(-1,1)).toarray(),columns=Columns)
df_encoded.columns

Index(['BT', 'Huawei', 'TCS'], dtype='object')

In [0]:
df_new = df_exp.merge(df_encoded,how='inner',left_index=True,right_index=True)

In [0]:
df_new

Unnamed: 0,Company,PLSQL,OBIEE,BICP,PYTHON,Y_AXIS,X_AXIS,TEXT,BT,Huawei,TCS
0,Huawei,3.2,0.0,0.0,0,0.1,1.6,<--Aug10--Oct13-->,0.0,1.0,0.0
1,BT,0.0,1.08,1.08,0,1.1,4.36,<-Dec13--Jan16->,1.0,0.0,0.0
2,TCS,0.0,0.6,1.0,2,2.1,7.8,<-Jul16--Current->,0.0,0.0,1.0


In [0]:
df_new.columns

Index([  'Company',     'PLSQL',     'OBIEE',      'BICP',    'PYTHON',
          'Y_AXIS',    'X_AXIS',      'TEXT',     ('BT',), ('Huawei',),
          ('TCS',)],
      dtype='object')

In [0]:
pd.concat([df_exp, df_encoded], axis=1)

Unnamed: 0,Company,PLSQL,OBIEE,BICP,PYTHON,Y_AXIS,X_AXIS,TEXT,"(BT,)","(Huawei,)","(TCS,)"
0,Huawei,3.2,0.0,0.0,0,0.1,1.6,<--Aug10--Oct13-->,0.0,1.0,0.0
1,BT,0.0,1.08,1.08,0,1.1,4.36,<-Dec13--Jan16->,1.0,0.0,0.0
2,TCS,0.0,0.6,1.0,2,2.1,7.8,<-Jul16--Current->,0.0,0.0,1.0
