In [71]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import warnings
# Installs a blanket "ignore" filter: every warning category is silenced for the
# rest of the session, including deprecation notices emitted by libraries.
# NOTE(review): consider narrowing this to specific categories so that API
# deprecations stay visible while developing.
warnings.filterwarnings('ignore')

## Bag of words

In [2]:
# Toy corpus: four short documents used by the bag-of-words example below.
text = [
    'it is a text1',
    'it is a text2',
    'it is a text3',
    'that is a text4 and this is a text5',
]
print("Suppose. This is our text data : ", text)

Suppose. This is our text data :  ['it is a text1', 'it is a text2', 'it is a text3', 'that is a text4 and this is a text5']


In [3]:
# Learn the unigram vocabulary of the corpus with the default tokenizer settings.
bow_converter = CountVectorizer()
bow_converter.fit(text)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() converts it back to a plain list of str and the printed output
# keeps the same format as before.
words = bow_converter.get_feature_names_out().tolist()
print("Words of Bag-of-words : ", words)

Words of Bag-of-words :  ['and', 'is', 'it', 'text1', 'text2', 'text3', 'text4', 'text5', 'that', 'this']


In [4]:
# Encode each document as a row of raw term counts; transform() yields a sparse
# matrix, so densify it for printing and for the DataFrame below.
features = bow_converter.transform(text).toarray()
print(features)

# Label rows with the source sentences and columns with the learned vocabulary.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is the supported replacement.
frequency_matrix = pd.DataFrame(features, index=text,
                                columns=bow_converter.get_feature_names_out())
frequency_matrix

[[0 1 1 1 0 0 0 0 0 0]
 [0 1 1 0 1 0 0 0 0 0]
 [0 1 1 0 0 1 0 0 0 0]
 [1 2 0 0 0 0 1 1 1 1]]


Unnamed: 0,and,is,it,text1,text2,text3,text4,text5,that,this
it is a text1,0,1,1,1,0,0,0,0,0,0
it is a text2,0,1,1,0,1,0,0,0,0,0
it is a text3,0,1,1,0,0,1,0,0,0,0
that is a text4 and this is a text5,1,2,0,0,0,0,1,1,1,1


In [5]:
# NOTE(review): `t` is not referenced by any other visible cell — confirm whether
# this leftover assignment can be dropped.
t = 1 / 3


## Bag of N-grams

In [6]:
# Same four-document toy corpus, restated so the n-gram section reads on its own.
text = [
    'it is a text1',
    'it is a text2',
    'it is a text3',
    'that is a text4 and this is a text5',
]
print("Suppose. This is our text data : ", text)

Suppose. This is our text data :  ['it is a text1', 'it is a text2', 'it is a text3', 'that is a text4 and this is a text5']


In [7]:
# ngram_range=(2, 2) restricts the vocabulary to bigrams only (no unigrams).
bigram_converter = CountVectorizer(ngram_range=(2,2))
bigram_converter.fit(text)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. .tolist() turns the returned
# NumPy array into a plain list of str so the printed format is unchanged.
bigrams = bigram_converter.get_feature_names_out().tolist()
# The label text is kept as-is so the recorded cell output remains valid, even
# though the printed items are bigrams rather than single words.
print("Words of Bag-of-words : ",bigrams)

Words of Bag-of-words :  ['and this', 'is text1', 'is text2', 'is text3', 'is text4', 'is text5', 'it is', 'text4 and', 'that is', 'this is']


In [8]:
# Count every bigram per document, then tabulate: one row per sentence,
# one column per bigram.
features = bigram_converter.transform(text).toarray()
frequency_matrix = pd.DataFrame(
    data=features,
    index=text,
    columns=bigrams,
)
frequency_matrix

Unnamed: 0,and this,is text1,is text2,is text3,is text4,is text5,it is,text4 and,that is,this is
it is a text1,0,1,0,0,0,0,1,0,0,0
it is a text2,0,0,1,0,0,0,1,0,0,0
it is a text3,0,0,0,1,0,0,1,0,0,0
that is a text4 and this is a text5,1,0,0,0,1,1,0,1,1,1


## Tf-Idf (Term frequency–Inverse document frequency)

<img src="img/o.png" width="600">

In [9]:
# NOTE(review): TfidfTransformer is not used in any visible cell; the import is
# kept unchanged in case later cells rely on it.
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
# Same four-document toy corpus as in the previous sections.
text = ['it is a text1', 
        'it is a text2', 
        'it is a text3', 
        'that is a text4 and this is a text5']

# Learn the vocabulary and per-term IDF weights with the default settings.
tfidf = TfidfVectorizer()
tfidf.fit(text)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. .tolist() keeps `tf` a plain
# list of str so the displayed repr matches the previous output.
tf = tfidf.get_feature_names_out().tolist()
tf

['and',
 'is',
 'it',
 'text1',
 'text2',
 'text3',
 'text4',
 'text5',
 'that',
 'this']

In [10]:
# Weight every term per document with the fitted TF-IDF model, then tabulate:
# one row per sentence, one column per vocabulary term.
features = tfidf.transform(text).toarray()
frequency_matrix = pd.DataFrame(
    data=features,
    index=text,
    columns=tf,
)
frequency_matrix

Unnamed: 0,and,is,it,text1,text2,text3,text4,text5,that,this
it is a text1,0.0,0.402642,0.492489,0.771579,0.0,0.0,0.0,0.0,0.0,0.0
it is a text2,0.0,0.402642,0.492489,0.0,0.771579,0.0,0.0,0.0,0.0,0.0
it is a text3,0.0,0.402642,0.492489,0.0,0.0,0.771579,0.0,0.0,0.0,0.0
that is a text4 and this is a text5,0.405245,0.422947,0.0,0.0,0.0,0.0,0.405245,0.405245,0.405245,0.405245


## Word2Vec

<img src="img/w.png" width="550">

## Sentiment Analysis


Link : https://drive.google.com/file/d/1dmVsO2BduSwF23yDl8eNlxmB2VBCLAbZ/view?usp=sharing