In [1]:
import pandas as pd

In [2]:
# Load the example dataset from the CSV file in the working directory.
exdata = pd.read_csv(filepath_or_buffer="randomdata.csv")

In [3]:
# Summarise the frame: column names, non-null counts and dtypes.
exdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 10 columns):
Unnamed: 0    50 non-null int64
age           50 non-null int64
job           50 non-null object
marital       50 non-null object
education     50 non-null object
default       50 non-null object
balance       50 non-null int64
housing       50 non-null object
loan          50 non-null object
contact       50 non-null object
dtypes: int64(3), object(7)
memory usage: 4.0+ KB


In [4]:
# Preview the first five rows of the freshly loaded data.
exdata.head(n=5)

Unnamed: 0.1,Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact
0,14789,45,blue-collar,single,secondary,yes,0,no,yes,cellular
1,8968,41,management,married,tertiary,no,5,no,no,unknown
2,34685,40,management,single,secondary,no,906,yes,no,cellular
3,2369,25,admin.,single,secondary,no,768,yes,no,unknown
4,36561,37,services,married,primary,no,0,yes,no,cellular


In [5]:
# Remove the leftover 'Unnamed: 0' column reported by exdata.info().
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it no longer raises KeyError once the
# column has already been dropped.
exdata = exdata.drop('Unnamed: 0', axis=1, errors='ignore')

In [6]:
# Preview the first five rows after dropping the column.
exdata.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact
0,45,blue-collar,single,secondary,yes,0,no,yes,cellular
1,41,management,married,tertiary,no,5,no,no,unknown
2,40,management,single,secondary,no,906,yes,no,cellular
3,25,admin.,single,secondary,no,768,yes,no,unknown
4,37,services,married,primary,no,0,yes,no,cellular


#### Here we convert categorical variables to numerical variables in different ways

**One Hot Encoding**
- One-hot encoding is very useful, but it can cause the number of columns to expand greatly if a column has very many unique values.
- Pandas supports this feature using **get_dummies**. This function is named this way because it creates dummy/indicator variables (aka 1 or 0).

In [7]:
# One-hot encode "marital"; any other categorical column could be encoded the same way.
pd.get_dummies(data=exdata, columns=["marital"]).head(n=5)

Unnamed: 0,age,job,education,default,balance,housing,loan,contact,marital_divorced,marital_married,marital_single
0,45,blue-collar,secondary,yes,0,no,yes,cellular,0,0,1
1,41,management,tertiary,no,5,no,no,unknown,0,1,0
2,40,management,secondary,no,906,yes,no,cellular,0,0,1
3,25,admin.,secondary,no,768,yes,no,unknown,0,0,1
4,37,services,primary,no,0,yes,no,cellular,0,1,0


In [8]:
# Same encoding, this time with a custom column prefix and separator.
pd.get_dummies(data=exdata, columns=["marital"], prefix="M", prefix_sep=".").head(n=5)

Unnamed: 0,age,job,education,default,balance,housing,loan,contact,M.divorced,M.married,M.single
0,45,blue-collar,secondary,yes,0,no,yes,cellular,0,0,1
1,41,management,tertiary,no,5,no,no,unknown,0,1,0
2,40,management,secondary,no,906,yes,no,cellular,0,0,1
3,25,admin.,secondary,no,768,yes,no,unknown,0,0,1
4,37,services,primary,no,0,yes,no,cellular,0,1,0


In [9]:
# A categorical feature with n levels needs only n-1 dummy columns. For example,
# for sex (male/female, n = 2) a single indicator is enough: if someone is male,
# they are by definition not female.
#
# drop_first=True therefore drops M.divorced here; a row whose remaining marital
# dummies are all 0 is a divorced one.

pd.get_dummies(
    data=exdata,
    columns=["marital"],
    prefix="M",
    prefix_sep=".",
    drop_first=True,
).head(n=5)

Unnamed: 0,age,job,education,default,balance,housing,loan,contact,M.married,M.single
0,45,blue-collar,secondary,yes,0,no,yes,cellular,0,1
1,41,management,tertiary,no,5,no,no,unknown,1,0
2,40,management,secondary,no,906,yes,no,cellular,0,1
3,25,admin.,secondary,no,768,yes,no,unknown,0,1
4,37,services,primary,no,0,yes,no,cellular,1,0


In [10]:
# `columns` and `prefix` also accept lists, pairing each column with its own prefix.
pd.get_dummies(data=exdata, columns=["marital", "default"], prefix=["M", "D"]).head(n=5)

Unnamed: 0,age,job,education,balance,housing,loan,contact,M_divorced,M_married,M_single,D_no,D_yes
0,45,blue-collar,secondary,0,no,yes,cellular,0,0,1,0,1
1,41,management,tertiary,5,no,no,unknown,0,1,0,1,0
2,40,management,secondary,906,yes,no,cellular,0,0,1,1,0
3,25,admin.,secondary,768,yes,no,unknown,0,0,1,1,0
4,37,services,primary,0,yes,no,cellular,0,1,0,1,0


**Label Encoding**
- Label encoding is simply converting each value in a column to a number
- It can only be used with a 'category' dtype
- The nice aspect of this approach is that you get the benefits of pandas categories (compact data size, ability to order, plotting support) but can easily be converted to numeric values for further analysis.
- Label encoding has the advantage that it is straightforward but it has the disadvantage that the numeric values can be “misinterpreted” by the algorithms

In [11]:
# Work on an explicit, independent copy: `exdata[::]` only produces a slice of
# the original frame, so assigning a column to it triggers pandas'
# SettingWithCopyWarning and leaves it unclear whether `exdata` is affected.
temp = exdata.copy()
# Convert "job" to the 'category' dtype and replace each label by its integer code.
temp["job"] = temp["job"].astype('category').cat.codes
temp.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact
0,45,1,single,secondary,yes,0,no,yes,cellular
1,41,3,married,tertiary,no,5,no,no,unknown
2,40,3,single,secondary,no,906,yes,no,cellular
3,25,0,single,secondary,no,768,yes,no,unknown
4,37,5,married,primary,no,0,yes,no,cellular


- In the table above we can see that a numerical value has been assigned to each job type

In [12]:
# Delete the temporary DataFrame.
del temp

### Scikit-Learn methods

- LabelEncoder 

In [13]:
from sklearn.preprocessing import LabelEncoder

In [14]:
# Fit a LabelEncoder on "education" and store the resulting integer codes in a
# new "education_code" column of a copy of the data (assign returns a new frame).
lb_make = LabelEncoder()
temp = exdata.assign(education_code=lb_make.fit_transform(exdata["education"]))
# Show the original labels next to their codes.
temp.loc[:, ["education", "education_code"]].head(n=5)

Unnamed: 0,education,education_code
0,secondary,1
1,tertiary,2
2,secondary,1
3,secondary,1
4,primary,0


- LabelBinarizer

In [15]:
from sklearn.preprocessing import LabelBinarizer

In [16]:
# Binarize "education": one 0/1 indicator column per class found by the binarizer.
lb_style = LabelBinarizer()
lb_results = lb_style.fit_transform(exdata["education"])
# Reuse exdata's index so the indicator rows stay aligned with their source rows
# when the frame is later joined back column-wise; a fresh default index would
# only line up while exdata itself still has an untouched 0..n-1 index.
result_frame = pd.DataFrame(lb_results, columns=lb_style.classes_, index=exdata.index)
result_frame.head()

Unnamed: 0,primary,secondary,tertiary,unknown
0,0,1,0,0
1,0,0,1,0
2,0,1,0,0
3,0,1,0,0
4,1,0,0,0


In [17]:
# Concatenate the LabelBinarizer result onto the dataset. The original
# "education" column is dropped first because it is no longer needed once it
# has been converted into indicator columns.
temp = exdata.drop('education', axis=1)
pd.concat([temp, result_frame], axis=1).head(n=5)

Unnamed: 0,age,job,marital,default,balance,housing,loan,contact,primary,secondary,tertiary,unknown
0,45,blue-collar,single,yes,0,no,yes,cellular,0,1,0,0
1,41,management,married,no,5,no,no,unknown,0,0,1,0
2,40,management,single,no,906,yes,no,cellular,0,1,0,0
3,25,admin.,single,no,768,yes,no,unknown,0,1,0,0
4,37,services,married,no,0,yes,no,cellular,1,0,0,0


In [18]:
# exdata