# Text processing

## Installation

In [None]:
# Run once if scikit-learn is missing from the kernel's environment:
# %pip install scikit-learn

## Imports

In [24]:
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

## Implementation

In [2]:
# Toy corpus: three short English sentences, each treated as one document.
corpus = [
    "Scikit-learn is a free software machine learning library for the Python programming language.",
    "This library is focused on classical machine learning algorithms.",
    "It also provides tools for data mining and data analysis.",
]

In [4]:
# Bag-of-words encoder with default settings (lower-cased word tokens, raw counts).
count_vectorizer = CountVectorizer()
# Learn the vocabulary (token -> column index) from the corpus.
count_vectorizer.fit(corpus)

In [5]:
# Encode every document as one row of token counts (a sparse document-term matrix).
transformed_corpus = count_vectorizer.transform(corpus)
transformed_corpus

<3x26 sparse matrix of type '<class 'numpy.int64'>'
	with 31 stored elements in Compressed Sparse Row format>

In [6]:
# Expand to a dense matrix for inspection; only sensible for a corpus this small.
transformed_corpus.todense()

matrix([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1,
         1, 1, 1, 0, 0],
        [1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
         0, 0, 0, 1, 0],
        [0, 1, 1, 1, 0, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
         0, 0, 0, 0, 1]], dtype=int64)

In [7]:
# Learned mapping from each token to its column index in the count matrix.
count_vectorizer.vocabulary_

{'scikit': 21,
 'learn': 12,
 'is': 9,
 'free': 8,
 'software': 22,
 'machine': 15,
 'learning': 13,
 'library': 14,
 'for': 7,
 'the': 23,
 'python': 20,
 'programming': 18,
 'language': 11,
 'this': 24,
 'focused': 6,
 'on': 17,
 'classical': 4,
 'algorithms': 0,
 'it': 10,
 'also': 1,
 'provides': 19,
 'tools': 25,
 'data': 5,
 'mining': 16,
 'and': 3,
 'analysis': 2}

In [8]:
# Map each row back to the tokens that have a non-zero count in that document.
count_vectorizer.inverse_transform(transformed_corpus)

[array(['for', 'free', 'is', 'language', 'learn', 'learning', 'library',
        'machine', 'programming', 'python', 'scikit', 'software', 'the'],
       dtype='<U11'),
 array(['algorithms', 'classical', 'focused', 'is', 'learning', 'library',
        'machine', 'on', 'this'], dtype='<U11'),
 array(['also', 'analysis', 'and', 'data', 'for', 'it', 'mining',
        'provides', 'tools'], dtype='<U11')]

In [10]:
# Presence/absence features over a vocabulary capped at ten terms.
modified_count_vectorizer = CountVectorizer(
    binary=True,      # record 0/1 instead of occurrence counts
    max_features=10,  # keep only the ten most frequent terms of the corpus
)
modified_count_vectorizer.fit(corpus)

In [12]:
# Dense view of the binary, ten-column encoding of the same corpus.
modified_count_vectorizer.transform(corpus).todense()

matrix([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 0, 1, 1, 1, 1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [17]:
# Three tiny "documents" made only of emoji. The characters are written as
# \U escapes so the literals survive any re-encoding of the notebook file:
#   \U0001F40D = SNAKE, \U0001F525 = FIRE
emoji_corpus = [
    "\U0001F40D\U0001F525",
    "\U0001F525\U0001F525\U0001F525",
    "\U0001F40D\U0001F40D\U0001F40D\U0001F40D",
]

In [14]:
def emoji_tokenizer(text):
    """Split ``text`` into single-character emoji tokens.

    Every code point in the range U+1F000..U+1F6FF becomes its own token, in
    order of appearance; all other characters are dropped.

    Returns a list of one-character strings (empty if nothing matches).
    """
    emoji_pattern = re.compile(r'[\U0001F000-\U0001F6FF]')
    return [match.group(0) for match in emoji_pattern.finditer(text)]

# Quick check on SNAKE, THUMBS UP SIGN and TROPHY; the input is written with
# \U escapes so it cannot be corrupted by a re-encoding of the notebook file.
print(emoji_tokenizer("\U0001F40D\U0001F44D\U0001F3C6"))

['üêç', 'üëç', 'üèÜ']


In [20]:
# Count emoji instead of words by plugging in the custom tokenizer.
# token_pattern=None states explicitly that the default word regex is unused,
# which also avoids scikit-learn's "token_pattern will not be used" warning.
emoji_vectorizer = CountVectorizer(tokenizer=emoji_tokenizer, token_pattern=None)
emoji_vectorizer.fit_transform(emoji_corpus)



<3x2 sparse matrix of type '<class 'numpy.int64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [22]:
# Emoji -> column index mapping learned by the emoji vectorizer.
emoji_vectorizer.vocabulary_

{'🐍': 0, '🔥': 1}