In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('titanic.csv')

In [3]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
encoder1 = LabelEncoder()

# Text To Numbers

### So we need columns having categorical data
Instead of handpicking them we can simply do it by a method written below

In [6]:
data_cat = list(data.select_dtypes(include=['object']).columns.values)
print(data_cat)

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']


## Now we are gonna apply label encoder on each of them one by one

In [7]:
for cat in data_cat:
    print(cat)
    data[cat] = encoder1.fit_transform(data[cat])

Name
Sex
Ticket
Cabin


TypeError: '<' not supported between instances of 'str' and 'float'

# Its common to get this error .
## This means that your categorical column has str as well as float or null values. But Our data has null values only in Age column so we need to change the datatype to string then it will run

In [8]:
for cat in data_cat:
    print(cat)
    data[cat] = encoder1.fit_transform(data[cat].astype(str))

Name
Sex
Ticket
Cabin
Embarked


In [9]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,11,1,22.0,1,0,472,7.25,147,2
1,2,1,1,102,0,38.0,1,0,552,71.2833,81,0
2,3,1,3,283,0,26.0,0,0,633,7.925,147,2
3,4,1,1,193,0,35.0,1,0,434,53.1,55,2
4,5,0,3,57,1,35.0,0,0,415,8.05,147,2


# Done.

## But, there is a little problem our ML algo will assume that two nearby values are more similar than two distict values, i.e. 1 is more similar to 2 than 3 or 4.

# Here comes the concept of ONE HOT ENCODING
## 1 -> HOT
## 0 -> COLD

# Numbers to One-Hot

In [10]:
from sklearn.preprocessing import OneHotEncoder

In [11]:
encoder2 = OneHotEncoder()

In [12]:
data_cat

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

#### Text to number gave us 1D array
#### But fit_transform requires 2D array
#### So we need to reshape it (-1,1)
#### And yeah dont forget to write .values if converting the dataframe

In [13]:
for cat in data_cat:
    print(cat)
    data[cat] = encoder2.fit_transform(data[cat].values.reshape(-1,1))

Name
Sex
Ticket
Cabin
Embarked


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [14]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"(0, 11)\t1.0\n (1, 102)\t1.0\n (2, 283)\t1...","(0, 1)\t1.0\n (1, 0)\t1.0\n (2, 0)\t1.0\n ...",22.0,1,0,"(0, 472)\t1.0\n (1, 552)\t1.0\n (2, 633)\t...",7.25,"(0, 147)\t1.0\n (1, 81)\t1.0\n (2, 147)\t1...","(0, 2)\t1.0\n (1, 0)\t1.0\n (2, 2)\t1.0\n ..."
1,2,1,1,"(0, 11)\t1.0\n (1, 102)\t1.0\n (2, 283)\t1...","(0, 1)\t1.0\n (1, 0)\t1.0\n (2, 0)\t1.0\n ...",38.0,1,0,"(0, 472)\t1.0\n (1, 552)\t1.0\n (2, 633)\t...",71.2833,"(0, 147)\t1.0\n (1, 81)\t1.0\n (2, 147)\t1...","(0, 2)\t1.0\n (1, 0)\t1.0\n (2, 2)\t1.0\n ..."
2,3,1,3,"(0, 11)\t1.0\n (1, 102)\t1.0\n (2, 283)\t1...","(0, 1)\t1.0\n (1, 0)\t1.0\n (2, 0)\t1.0\n ...",26.0,0,0,"(0, 472)\t1.0\n (1, 552)\t1.0\n (2, 633)\t...",7.925,"(0, 147)\t1.0\n (1, 81)\t1.0\n (2, 147)\t1...","(0, 2)\t1.0\n (1, 0)\t1.0\n (2, 2)\t1.0\n ..."
3,4,1,1,"(0, 11)\t1.0\n (1, 102)\t1.0\n (2, 283)\t1...","(0, 1)\t1.0\n (1, 0)\t1.0\n (2, 0)\t1.0\n ...",35.0,1,0,"(0, 472)\t1.0\n (1, 552)\t1.0\n (2, 633)\t...",53.1,"(0, 147)\t1.0\n (1, 81)\t1.0\n (2, 147)\t1...","(0, 2)\t1.0\n (1, 0)\t1.0\n (2, 2)\t1.0\n ..."
4,5,0,3,"(0, 11)\t1.0\n (1, 102)\t1.0\n (2, 283)\t1...","(0, 1)\t1.0\n (1, 0)\t1.0\n (2, 0)\t1.0\n ...",35.0,0,0,"(0, 472)\t1.0\n (1, 552)\t1.0\n (2, 633)\t...",8.05,"(0, 147)\t1.0\n (1, 81)\t1.0\n (2, 147)\t1...","(0, 2)\t1.0\n (1, 0)\t1.0\n (2, 2)\t1.0\n ..."


# for better understanding lets just try it on only one attribute

In [15]:
df = pd.read_csv('titanic.csv')

In [16]:
a = encoder1.fit_transform(df['Sex']) #text to numbers

In [17]:
a = encoder2.fit_transform(df['Sex'].values.reshape(-1,1)) #numbers to one hot

In [18]:
a

<891x2 sparse matrix of type '<class 'numpy.float64'>'
	with 891 stored elements in Compressed Sparse Row format>

# Output is Scipy Sparse Matrix [Each row has only one row and rest all zero]

## It just stores location of 1's to prevent wastage of memory for storing zeroes

In [19]:
print(a[:10])

  (0, 1)	1.0
  (1, 0)	1.0
  (2, 0)	1.0
  (3, 0)	1.0
  (4, 1)	1.0
  (5, 1)	1.0
  (6, 1)	1.0
  (7, 1)	1.0
  (8, 0)	1.0
  (9, 0)	1.0


# What We DID:
## -> Text To Numbers 
## Then
## -> Numbers to One-Hot
# What About
## -> Text To One Hot Directly

In [20]:
data_again = pd.read_csv('titanic.csv')

In [21]:
data_again.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [22]:
data_cat

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

In [23]:
from sklearn.preprocessing import LabelBinarizer 

# if you want sparse matrix as output then just write
#### encoder = LabelBinarizer(sparse_output=True)

In [24]:
encoder3 = LabelBinarizer()

In [None]:
for cat in data_cat:
    print(cat)
    data_again[cat] = encoder3.fit_transform(data_again[cat].astype(str))

In [None]:
data_again.head()

# Its Done.