### 1. TF-IDF from Scratch

In [1]:
import numpy as np
import pandas as pd

# Toy corpus: every string is one sentence (document)
text_data = [
    "good movie",
    "bad movie",
    "good bad movie",
]
# tf  : occurrences of a word in a sentence / number of words in that sentence
# idf : log(number of sentences / number of sentences that contain the word)

In [2]:
# Tokenization: break each sentence into a list of words on single spaces
word_data = []
for sentence in text_data:
    word_data.append(sentence.split(' '))

In [3]:
# Vocabulary: the unique words across all sentences, sorted so that the
# column order is identical on every run (plain set iteration order is arbitrary)
vocab = sorted({word for sentence in word_data for word in sentence})

In [4]:
# Term frequency: for every sentence, one value per vocab word equal to
# (occurrences of the word in the sentence) / (total number of words in the sentence)
tf_data = [
    [tokens.count(term) / len(tokens) for term in vocab]
    for tokens in word_data
]

In [5]:
# Inverse document frequency, one value per vocab word:
# log(number of documents / number of documents that contain the word)
n_documents = len(text_data)

idf_data = []
for term in vocab:
    # Document frequency: how many tokenized sentences contain this term
    doc_freq = sum(1 for tokens in word_data if term in tokens)
    idf_data.append(np.log(n_documents / doc_freq))

In [None]:
# TF-IDF (Term Frequency - Inverse Document Frequency) for each sentence:
# multiply every term frequency by the IDF of the same vocab word.
# The rows are collected in a new list (tfidf_data); tf_data is only read.
tfidf_data = [
    [tf * idf for tf, idf in zip(tf_sentence, idf_data)]
    for tf_sentence in tf_data
]

# One row per sentence, one column per vocab word
df = pd.DataFrame(tfidf_data, columns=vocab)
df

### 2. TF-IDF with Sklearn

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
text_data = ["good movie","bad movie","good bad movie"]

vec = TfidfVectorizer()
# Fit the vectorizer on the corpus and vectorize it in a single call
tfidf_matrix = vec.fit_transform(text_data)

# Label the columns with the vectorizer's own feature names, which follow the
# matrix column order; a separately built word list need not be in that order.
# NOTE: get_feature_names_out() requires scikit-learn >= 1.0.
df = pd.DataFrame(tfidf_matrix.toarray(), columns=vec.get_feature_names_out())
df['full_sent'] = text_data

df

In [None]:
# Feature names in matrix-column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
vec.get_feature_names_out()