<a href="https://colab.research.google.com/github/mtsilimos/Source-code/blob/main/Section_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# SECTION 5

# Import libraries

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [5]:
# Word count

# Small example corpus: one string per document.
# Swap in your own documents here.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# Learn the vocabulary and build the document-term count matrix in one step.
cv = CountVectorizer()
word_count_vector = cv.fit_transform(corpus)

# Vocabulary terms; they are used as the column labels of the count matrix below.
tokens = cv.get_feature_names_out()

# Report, in order: matrix dimensions, the vocabulary, its size, the dense counts.
for summary_item in (
    word_count_vector.shape,
    tokens,
    len(tokens),
    word_count_vector.toarray(),
):
    print(summary_item)

(4, 9)
['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
9
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [6]:
# Word count in a data frame

# One label per document ('Doc0', 'Doc1', ...). The number of documents is read
# from the matrix shape, which avoids iterating over the sparse rows just to
# count them.
doc_names = ['Doc{:d}'.format(index) for index in range(word_count_vector.shape[0])]

# Rows are documents, columns are vocabulary terms, cells are raw counts.
df = pd.DataFrame(data=word_count_vector.toarray(), index=doc_names, columns=tokens)

# Display the table: a later cell reassigns `df`, so this is the only place
# where the raw counts are shown as a data frame.
df


In [7]:
# Calculate Term Frequency

# TfidfVectorizer is imported in the notebook's import cell.
# With use_idf=False no IDF weighting is applied, and norm='l1' scales every
# row so that its entries sum to 1, i.e. each cell is count / total terms in
# that document.
tfidf_vectorizer = TfidfVectorizer(use_idf=False, norm='l1')
X = tfidf_vectorizer.fit_transform(corpus)

# NOTE: this reuses the name `df`, replacing the raw-count table built earlier.
# `doc_names` comes from the word-count cell and must match the corpus length.
df = pd.DataFrame(
    X.toarray(),
    index=doc_names,
    columns=tfidf_vectorizer.get_feature_names_out(),
)
df


Unnamed: 0,and,document,first,is,one,second,the,third,this
Doc0,0.0,0.2,0.2,0.2,0.0,0.0,0.2,0.0,0.2
Doc1,0.0,0.333333,0.0,0.166667,0.0,0.166667,0.166667,0.0,0.166667
Doc2,0.166667,0.0,0.0,0.166667,0.166667,0.0,0.166667,0.166667,0.166667
Doc3,0.0,0.2,0.2,0.2,0.0,0.0,0.2,0.0,0.2


In [8]:
# IDF weights

# Fit a smoothed-IDF transformer on the raw count matrix; fit() returns the
# fitted transformer itself, so construction and fitting can be chained.
tfidf_transformer = TfidfTransformer(use_idf=True, smooth_idf=True).fit(word_count_vector)

# One row per vocabulary term, holding the IDF weight learned for that term.
df_idf = pd.DataFrame({"idf_weights": tfidf_transformer.idf_}, index=tokens)
df_idf

Unnamed: 0,idf_weights
and,1.916291
document,1.223144
first,1.510826
is,1.0
one,1.916291
second,1.916291
the,1.0
third,1.916291
this,1.0


In [9]:
# TF-IDF Term Frequency-Inverse Document Frequency

# pandas and TfidfVectorizer are imported in the notebook's import cell, so
# they are not re-imported here.
# NOTE: this reassigns `corpus`; the four-document example used by the earlier
# cells is replaced by these three documents from here on.
corpus = [
    'The cats eat fish',
    'Leo catches fish every day',
    'Once a day I eat fish',
]


In [10]:
# Fit a TF-IDF model with default settings on the current corpus and print the
# weights as a table: one row per document, one column per vocabulary term.
vec = TfidfVectorizer()
tf_idf = vec.fit_transform(corpus)
print(
    pd.DataFrame(
        data=tf_idf.toarray(),
        columns=vec.get_feature_names_out(),
    )
)


    catches      cats       day       eat     every      fish       leo  \
0  0.000000  0.584483  0.000000  0.444514  0.000000  0.345205  0.000000   
1  0.504611  0.000000  0.383770  0.000000  0.504611  0.298032  0.504611   
2  0.000000  0.000000  0.480458  0.480458  0.000000  0.373119  0.000000   

       once       the  
0  0.000000  0.584483  
1  0.000000  0.000000  
2  0.631745  0.000000  


In [11]:
# Cosine Similarity between two TF-IDF vectors

# pandas and TfidfVectorizer are imported in the notebook's import cell, so
# they are not re-imported here.
# NOTE: this reassigns `corpus` again; the two sentences below are the texts
# whose similarity is measured in the following cells.
corpus = [
    'The President communicates nicely with all his colleagues',
    'The President has a very close relationship with the staff',
]


In [12]:
# Vectorise the two sentences with a default TF-IDF model; `tf_idf` holds one
# row per sentence and is the input to the cosine-similarity cell.
vec = TfidfVectorizer()
tf_idf = vec.fit_transform(corpus)

# Print the weights with the vocabulary terms as column headers.
print(
    pd.DataFrame(
        data=tf_idf.toarray(),
        columns=vec.get_feature_names_out(),
    )
)


        all     close  colleagues  communicates       has       his    nicely  \
0  0.391668  0.000000    0.391668      0.391668  0.000000  0.391668  0.391668   
1  0.000000  0.352728    0.000000      0.000000  0.352728  0.000000  0.000000   

   president  relationship     staff       the      very      with  
0   0.278675      0.000000  0.000000  0.278675  0.000000  0.278675  
1   0.250969      0.352728  0.352728  0.501938  0.352728  0.250969  


In [13]:
# cosine_similarity is imported in the notebook's import cell.
# Pairwise cosine similarity between the TF-IDF rows: entry [i, j] compares
# document i with document j. When only one matrix is passed it is compared
# against itself, so repeating `tf_idf` as a second argument is unnecessary.
cosine_sim = cosine_similarity(tf_idf)
print(cosine_sim)


[[1.         0.27975559]
 [0.27975559 1.        ]]


In [14]:
# Heatmap of the pairwise cosine similarities.
# plotly.express is imported as `px` in the notebook's import cell.

# Plot the matrix held in `cosine_sim` rather than a literal copied by hand
# from its printed output, so the figure cannot drift from the data if the
# corpus changes. `z` is kept as an alias for the plotted matrix.
z = cosine_sim

# One axis label per document: 'text1', 'text2', ...
text_labels = ['text{:d}'.format(position + 1) for position in range(len(z))]

fig = px.imshow(
    z,
    # Annotate every cell with its value, formatted to four decimals.
    text_auto='.4f',
    # Both axes list the texts; the colour scale encodes the similarity.
    labels=dict(x="Text", y="Text", color="Cosine similarity"),
    x=text_labels,
    y=text_labels,
)
fig.show()


In [15]:
# Homework
#
# This cell only defines the input texts for the exercises; it performs no
# computation.

# Task 1a)
# Apply text preprocessing techniques (noise removal and normalisation) to the
# three texts defined directly below.
# Task 1b)
# Calculate the TF-IDF values of the preprocessed texts.

# Each Task 1 text is a triple-quoted string written across two source lines,
# so it contains a literal line break.
text1 = '''So many people will be trying to leave at the same time, there will not be flights, the roads will be blocked, are you going to be able to get fuel for your car, your
passport, is there going to be cash in the banking machines?'''
text2 = '''Now some people have arrived, but they need to provide proof of his British citizenship to get an emergency travel document, a more urgent alternative to a
passport.'''
text3 = '''If we take the advice at face value, it means the UK and also the US government are worried that Russian action is imminent - that makes everybody want to take
their passport and get out.'''


# Task 2
# Calculate and plot the cosine similarity between text1, text2, text3 and text4.
#
# NOTE: the assignments below reuse the names text1, text2 and text3. Once this
# cell has run, those names hold the Task 2 sentences and the Task 1 texts are
# no longer reachable through them.

text1 = 'The President communicates nicely with all his colleagues'
text2 = 'The President has a very close relationship with the staff'
text3 = 'The President does not want to have a meeting with the colleagues'
text4 = 'She would like to see the President'

