## Machine Learning with Text in scikit-learn

## Part 1: Model building in scikit-learn (refresher)

In [2]:
# Use scikit-learn's bundled iris dataset as a small worked example
from sklearn.datasets import load_iris

iris = load_iris()

In [3]:
# Pull the feature matrix (X) and the response vector (y) out of the loaded dataset
X, y = iris.data, iris.target

In [6]:
# Check the shape of X and y, and confirm that the features are stored as a NumPy array
# (the recorded output of this cell has three lines: two shapes and a type)
print(X.shape)
print(y.shape)
print(type(X))

(150, 4)
(150,)
<class 'numpy.ndarray'>


In [7]:
# Preview the first five rows of the feature matrix, with the feature names as column labels
import pandas as pd

pd.DataFrame(data=X, columns=iris.feature_names).head(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [8]:
# Examine the response vector: one integer class label per observation
print(y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [None]:
# In order to build a model, the features must be numeric, and every observation must have the same features in the same order.

In [9]:
# Bring in the estimator class
from sklearn.neighbors import KNeighborsClassifier

# Create the estimator, leaving every hyperparameter at its default value
knn = KNeighborsClassifier()

# Train on the feature matrix and response vector; the fitted estimator is displayed as the cell output
knn.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [12]:
# A new observation can only be scored if it carries the same features as the
# training observations: the same number of them, with the same meaning.

# One new observation, written as a single row with four feature values
new_observation = [[3, 5, 4, 2]]

# Predict the response for that observation
knn.predict(new_observation)

array([1])

## Part 2: Representing text as numerical data 

In [13]:
# Example SMS messages used as training text
simple_train = [
    'call you tonight',
    'Call me a cab',
    'please call me... PLEASE!',
]

In [15]:
# CountVectorizer is used to "convert text into a matrix of token counts".

# Import CountVectorizer and instantiate it with its default settings
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()

In [16]:
# Learn the 'vocabulary' of the training data.
# fit() updates vect in place; the vectorizer shown as the cell output is the call's return value.
vect.fit(simple_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [17]:
# Examine the fitted vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() is used to keep displaying a plain Python list of terms.
vect.get_feature_names_out().tolist()

['cab', 'call', 'me', 'please', 'tonight', 'you']

In [18]:
# Encode the training messages as a 'document-term matrix' using the fitted vocabulary
simple_train_dtm = vect.transform(simple_train)

# Display the matrix summary (dimensions, dtype and storage format)
simple_train_dtm

<3x6 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [19]:
# Convert the sparse matrix to a dense array so that every cell, including the zeros, is displayed
simple_train_dtm.toarray()

array([[0, 1, 0, 0, 1, 1],
       [1, 1, 1, 0, 0, 0],
       [0, 1, 1, 2, 0, 0]], dtype=int64)

In [20]:
# Examine the vocabulary and document-term matrix together: one row per message,
# one column per vocabulary term.
# get_feature_names_out() replaces get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names_out())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,0,0,1,1
1,1,1,1,0,0,0
2,0,1,1,2,0,0


In [21]:
# Check which class the document-term matrix is stored as
type(simple_train_dtm)

scipy.sparse.csr.csr_matrix

In [22]:
# Examine the sparse matrix contents: printing lists only the stored entries,
# each as a (row, column) position followed by its count
print(simple_train_dtm)

  (0, 1)	1
  (0, 4)	1
  (0, 5)	1
  (1, 0)	1
  (1, 1)	1
  (1, 2)	1
  (2, 1)	1
  (2, 2)	1
  (2, 3)	2
