<a href="https://colab.research.google.com/github/SohaHussain/Machine-Learning/blob/main/Lexical%20Processing/TF_IDF_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
import nltk
# Download NLTK's 'punkt' tokenizer package; word_tokenize (used in preprocess) needs it.
# The call's return value is the last expression, so the cell displays it.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
# Download the NLTK stop-word corpus that preprocess reads via stopwords.words("english").
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
# Toy corpus: three short reviews used to walk through TF-IDF by hand.
docs = [
    "Vapour, Bangalore has a really great terrace seating and an awesome view of the Bangalore skyline",
    "The beer at Vapour, Bangalore was amazing. My favorites are the wheat beer and the ale beer.",
    "Vapour, Bangalore has the best view in Bangalore.",
]
print(docs)

['Vapour, Bangalore has a really great terrace seating and an awesome view of the Bangalore skyline', 'The beer at Vapour, Bangalore was amazing. My favorites are the wheat beer and the ale beer.', 'Vapour, Bangalore has the best view in Bangalore.']


In [3]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def preprocess(doc):
    """Lowercase a document, tokenize it and drop English stop words.

    Parameters
    ----------
    doc : str
        Raw text of a single document.

    Returns
    -------
    str
        The surviving tokens joined by single spaces. Punctuation tokens
        emitted by the tokenizer are not removed.
    """
    # Fetch the stop-word list once per call and hold it in a set, so every
    # token is a constant-time membership test rather than a fresh
    # stopwords.words(...) call followed by a linear list scan.
    stop_words = set(stopwords.words("english"))

    words = word_tokenize(doc.lower())
    words = [word for word in words if word not in stop_words]

    return " ".join(words)

In [9]:
# Run every toy document through preprocess, rebinding `docs` to the cleaned text.
docs = list(map(preprocess, docs))
print(docs)

['vapour , bangalore really great terrace seating awesome view bangalore skyline', 'beer vapour , bangalore amazing . favorites wheat beer ale beer .', 'vapour , bangalore best view bangalore .']


In [10]:
# Fit a TF-IDF model with default settings on the preprocessed toy documents.
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(docs)
# Printing the result lists each stored entry as (row, column) followed by its weight.
print(tfidf)

  (0, 10)	0.34663478992044555
  (0, 13)	0.2636246924033099
  (0, 2)	0.34663478992044555
  (0, 9)	0.34663478992044555
  (0, 11)	0.34663478992044555
  (0, 7)	0.34663478992044555
  (0, 8)	0.34663478992044555
  (0, 3)	0.40945618183743365
  (0, 12)	0.20472809091871683
  (1, 0)	0.2701947410011521
  (1, 14)	0.2701947410011521
  (1, 6)	0.2701947410011521
  (1, 1)	0.2701947410011521
  (1, 4)	0.8105842230034562
  (1, 3)	0.15958136664279549
  (1, 12)	0.15958136664279549
  (2, 5)	0.5486117771118656
  (2, 13)	0.4172333972107692
  (2, 3)	0.6480379064629606
  (2, 12)	0.3240189532314803


In [12]:
# Dense view of the TF-IDF matrix: one row per document, one column per vocabulary term.
pd.DataFrame(
    data=tfidf.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

Unnamed: 0,ale,amazing,awesome,bangalore,beer,best,favorites,great,really,seating,skyline,terrace,vapour,view,wheat
0,0.0,0.0,0.346635,0.409456,0.0,0.0,0.0,0.346635,0.346635,0.346635,0.346635,0.346635,0.204728,0.263625,0.0
1,0.270195,0.270195,0.0,0.159581,0.810584,0.0,0.270195,0.0,0.0,0.0,0.0,0.0,0.159581,0.0,0.270195
2,0.0,0.0,0.0,0.648038,0.0,0.548612,0.0,0.0,0.0,0.0,0.0,0.0,0.324019,0.417233,0.0


### TF-IDF model on the SMS spam dataset

In [13]:
# Show up to 100 characters of each text cell so the SMS bodies are readable.
# The fully qualified option key is used: pandas resolves abbreviated keys by
# pattern matching, which can become ambiguous if similarly named options exist.
pd.set_option('display.max_colwidth', 100)

# Tab-separated file; the two column names are supplied here rather than read from the file.
spam = pd.read_csv("SMSSpamCollection.txt", sep="\t", names=["label", "message"])
spam.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [14]:
# Keep only the first 50 rows (all columns) for the rest of the walkthrough.
spam = spam.head(50)

In [15]:
# Pull the message column out of the frame as a plain Python list.
msgs = list(spam["message"])
print(msgs)

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'Ok lar... Joking wif u oni...', "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", 'U dun say so early hor... U c already then say...', "Nah I don't think he goes to usf, he lives around here though", "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv", 'Even my brother is not like to speak with me. They treat me like aids patent.', "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune", 'WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.', 'Had your mobile 

In [16]:
# Clean every SMS with preprocess, rebinding `msgs` to the cleaned text.
msgs = list(map(preprocess, msgs))
print(msgs)

['go jurong point , crazy .. available bugis n great world la e buffet ... cine got amore wat ...', 'ok lar ... joking wif u oni ...', "free entry 2 wkly comp win fa cup final tkts 21st may 2005. text fa 87121 receive entry question ( std txt rate ) & c 's apply 08452810075over18 's", 'u dun say early hor ... u c already say ...', "nah n't think goes usf , lives around though", "freemsg hey darling 's 3 week 's word back ! 'd like fun still ? tb ok ! xxx std chgs send , £1.50 rcv", 'even brother like speak . treat like aids patent .', "per request 'melle melle ( oru minnaminunginte nurungu vettam ) ' set callertune callers . press * 9 copy friends callertune", 'winner ! ! valued network customer selected receivea £900 prize reward ! claim call 09061701461. claim code kl341 . valid 12 hours .', 'mobile 11 months ? u r entitled update latest colour mobiles camera free ! call mobile update co free 08002986030', "'m gon na home soon n't want talk stuff anymore tonight , k ? 've cried enoug

In [18]:
# Rebinds `vectorizer` and `tfidf`: the model fitted on the toy corpus is replaced
# by a fresh default TF-IDF model fitted on the preprocessed SMS messages.
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(msgs)

In [19]:
# Dense view of the SMS TF-IDF matrix: one row per message, one column per vocabulary term.
pd.DataFrame(
    data=tfidf.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

Unnamed: 0,000,07732584351,08000930705,08002986030,08452810075over18,09061701461,100,11,12,150p,...,worried,www,xuhui,xxx,xxxmobilemovieclub,yeah,yes,yummy,yup,ú1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.198284,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.256871,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.230701,0.0,0.0,0.230701,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.230794,0.0,0.0,0.0,0.230794,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
