Imports for this demonstration

In [1]:
import pandas as pd

Corpus for this demonstration

In [2]:
# Five short text fragments about gradient descent; every later cell
# treats each list element as one document.
corpus = [
    "Gradient descent is an iterative algorithm to find the minima of a cost function",
    "If this cost function is strictly convex",
    "a sum of squared differences error as in linear regression",
    "the minima found is global. For more complex cost functions",
    "there is a possibility of getting stuck in local minima",
]

### Preprocessing techniques

1. Tokenization

In [3]:
from nltk.tokenize import word_tokenize

# Break every document into word/punctuation tokens and show one token
# list per document.
for sentence in corpus:
    tokens = word_tokenize(sentence)
    print(tokens)

['Gradient', 'descent', 'is', 'an', 'iterative', 'algorithm', 'to', 'find', 'the', 'minima', 'of', 'a', 'cost', 'function']
['If', 'this', 'cost', 'function', 'is', 'strictly', 'convex']
['a', 'sum', 'of', 'squared', 'differences', 'error', 'as', 'in', 'linear', 'regression']
['the', 'minima', 'found', 'is', 'global', '.', 'For', 'more', 'complex', 'cost', 'functions']
['there', 'is', 'a', 'possibility', 'of', 'getting', 'stuck', 'in', 'local', 'minima']


2. Stemming

In [4]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()

# Stemming strips suffixes by rule, so the result need not be a real word
# (compare with the lemmatizer output for the same two inputs).
for sample in ("differences", "possibilities"):
    print(f"{sample} :", ps.stem(sample))

differences : differ
possibilities : possibl


3. Lemmatization

In [5]:
from nltk.stem import WordNetLemmatizer

lm = WordNetLemmatizer()

# Lemmatization maps each word to its dictionary form; the same two sample
# words as in the stemming cell are used so the results can be compared.
for sample in ("differences", "possibilities"):
    print(f"{sample} :", lm.lemmatize(sample))

differences : difference
possibilities : possibility


4. Stop word removal

In [6]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# A set gives constant-time membership checks while filtering tokens.
stop_words = set(stopwords.words('english'))

# Only the first document is used for this demonstration.
words = word_tokenize(corpus[0])

# NLTK's English stop-word list is lowercase, so compare on the lowercased
# token; otherwise capitalised stop words such as "If" or "For" would slip
# through. Surviving tokens keep their original casing.
cleaned_sentence = [word for word in words if word.lower() not in stop_words]
print("Normal\n\n",words)
print("\nStop word removed\n\n",cleaned_sentence)

Normal

 ['Gradient', 'descent', 'is', 'an', 'iterative', 'algorithm', 'to', 'find', 'the', 'minima', 'of', 'a', 'cost', 'function']

Stop word removed

 ['Gradient', 'descent', 'iterative', 'algorithm', 'find', 'minima', 'cost', 'function']


### Feature extraction techniques

1. Count vectorizer

In [7]:
from sklearn.feature_extraction.text import CountVectorizer


# Learn the vocabulary from the corpus and build the document-term matrix
# of raw token counts (one row per document, one column per term).
count_vectorizer = CountVectorizer()
count_vectorized_corpus = count_vectorizer.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() returns the same vocabulary terms in column order.
pd.DataFrame(
    count_vectorized_corpus.toarray(),
    columns=count_vectorizer.get_feature_names_out(),
)

Unnamed: 0,algorithm,an,as,complex,convex,cost,descent,differences,error,find,...,possibility,regression,squared,strictly,stuck,sum,the,there,this,to
0,1,1,0,0,0,1,1,0,0,1,...,0,0,0,0,0,0,1,0,0,1
1,0,0,0,0,1,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,0,0,1,0,0,0,0,1,1,0,...,0,1,1,0,0,1,0,0,0,0
3,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,1,0,0


2. Hashing vectorizer

In [8]:
from sklearn.feature_extraction.text import HashingVectorizer

# Tokens are hashed into a fixed number of columns (2**4 = 16 here), so no
# vocabulary is stored and the columns cannot be labelled with terms.
hashing_vectorizer = HashingVectorizer(n_features=2 ** 4)
hashing_vectorized_corpus = hashing_vectorizer.fit_transform(corpus)

# Densify the sparse result for display; columns are just bucket indices.
hashed_matrix = hashing_vectorized_corpus.toarray()
pd.DataFrame(hashed_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.0,-0.301511,0.301511,0.0,0.0,0.0,0.301511,0.0,-0.301511,-0.301511,0.301511,0.0,0.603023,0.0,-0.301511,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.603023,0.0,0.0,0.0,0.301511,0.0,-0.301511,0.301511,0.603023,0.0
2,0.0,0.447214,0.447214,0.0,0.0,0.447214,0.0,0.0,0.447214,0.0,0.0,0.0,0.447214,0.0,0.0,0.0
3,0.353553,0.0,0.0,0.353553,0.0,0.0,0.353553,0.353553,0.0,0.0,-0.353553,0.0,0.353553,0.353553,-0.353553,0.0
4,0.0,0.666667,0.333333,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,-0.333333,0.333333,0.333333,0.0,0.0


3. TF-IDF vectorizer

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the corpus, then build the
# document-term matrix of tf-idf scores (one row per document).
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorized_corpus = tfidf_vectorizer.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() returns the same vocabulary terms in column order.
pd.DataFrame(
    tfidf_vectorized_corpus.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

Unnamed: 0,algorithm,an,as,complex,convex,cost,descent,differences,error,find,...,possibility,regression,squared,strictly,stuck,sum,the,there,this,to
0,0.316786,0.316786,0.0,0.0,0.0,0.212155,0.316786,0.0,0.0,0.316786,...,0.0,0.0,0.0,0.0,0.0,0.0,0.255581,0.0,0.0,0.316786
1,0.0,0.0,0.0,0.0,0.429662,0.28775,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.429662,0.0,0.0,0.0,0.0,0.429662,0.0
2,0.0,0.0,0.351377,0.0,0.0,0.0,0.0,0.351377,0.351377,0.0,...,0.0,0.351377,0.351377,0.0,0.0,0.351377,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.356567,0.0,0.238797,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.287676,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.381653,0.0,0.0,0.0,0.381653,0.0,0.0,0.381653,0.0,0.0
