### Example 01

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

corpus = ["This is a good cat", "This is a bad day"]

# ngram_range=(1, 2) extracts both unigrams and bigrams; with
# stop_words='english' the stop words are dropped before n-grams are built,
# so only terms such as "good", "cat" and "good cat" remain.
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
bow_tfidf = vectorizer.fit_transform(corpus)

# Mapping of each term to its column index in the TF-IDF matrix.
print(vectorizer.vocabulary_)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. toarray() yields a plain ndarray rather than the
# np.matrix returned by todense().
pd.DataFrame(bow_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

{'good': 4, 'cat': 2, 'good cat': 5, 'bad': 0, 'day': 3, 'bad day': 1}


Unnamed: 0,bad,bad day,cat,day,good,good cat
0,0.0,0.0,0.57735,0.0,0.57735,0.57735
1,0.57735,0.57735,0.0,0.57735,0.0,0.0


### Example 02

In [3]:
import pandas as pd

# Tiny hand-labelled sentiment corpus: (review text, label) pairs.
train = [
    ("Thanks for an excellent report", "pos"),
    ("Your service is very quick and fast", "pos"),
    ("I am pleased with your service", "pos"),
    ("I did not know i was diabetic until you gave me this report", "neg"),
    ("Service - Little slow, probably because too many people.", "neg"),
    ("The place is not easy to locate", "neg"),
    ("The place is very easy to locate", "pos"),
    ("Not satisfied will take a second opinion", "neg"),
    ("No human contact everything is so robotic here", "neg"),
]

# One row per review; the tuple positions become the two named columns.
df = pd.DataFrame(data=train, columns=["review", "sentiment"])

df.head()

Unnamed: 0,review,sentiment
0,Thanks for an excellent report,pos
1,Your service is very quick and fast,pos
2,I am pleased with your service,pos
3,I did not know i was diabetic until you gave m...,neg
4,"Service - Little slow, probably because too ma...",neg


In [4]:
# Distinct sentiment labels present in the data.
set(df["sentiment"].unique())

{'neg', 'pos'}

In [5]:
# Dimensions of the review frame as (rows, columns).
df.shape

(9, 2)

In [6]:
# Class balance: number of reviews per sentiment label.
df["sentiment"].value_counts()

neg    5
pos    4
Name: sentiment, dtype: int64

In [7]:
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def preprocessData(review):
    """Normalise a review: tokenize with NLTK, lowercase, re-join with spaces.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The tokens produced by ``word_tokenize``, each lowercased, joined
        by single spaces.
    """
    lowered_tokens = (token.lower() for token in word_tokenize(review))
    return ' '.join(lowered_tokens)


In [8]:
# Add a normalised copy of every review alongside the raw text.
df["cleaned_review"] = df["review"].map(preprocessData)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF model with default settings on the cleaned reviews.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(df['cleaned_review'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() and convert the result to a plain list of labels.
featurenames = vectorizer.get_feature_names_out().tolist()

# Densify so the weights can be loaded into a DataFrame. This is fine for a
# corpus this small; avoid it for large vocabularies.
dense = vectors.todense()
denselist = dense.tolist()

In [10]:
# One row per review, one column per vocabulary term; values are TF-IDF weights.
dff = pd.DataFrame(data=denselist, columns=featurenames)

# head() shows the first five rows by default.
dff.head()

Unnamed: 0,am,an,and,because,contact,diabetic,did,easy,everything,excellent,...,this,to,too,until,very,was,will,with,you,your
0,0.0,0.460611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.460611,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.430848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.363901,0.0,0.0,0.0,0.0,0.363901
2,0.484919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.484919,0.0,0.40957
3,0.0,0.0,0.0,0.0,0.0,0.312307,0.312307,0.0,0.0,0.0,...,0.312307,0.0,0.0,0.312307,0.0,0.312307,0.0,0.0,0.312307,0.0
4,0.0,0.0,0.0,0.364195,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.364195,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# TF-IDF weight of the term "thanks" in the first review (row label 0).
dff.loc[0, "thanks"]

0.4606106279676078

In [12]:
# Print the TF-IDF weight of every token in the first cleaned review.
for token in df.loc[0, "cleaned_review"].split():
    print("{0}:{1}".format(token, dff.loc[0, token]))

thanks:0.4606106279676078
for:0.4606106279676078
an:0.4606106279676078
excellent:0.4606106279676078
report:0.3890390695202013
