# Vectorisation of Text Data

The process of converting or transforming a data set into a set of Vectors is called Vectorization.

In [1]:
# import libraries
import pandas as pd
import sklearn as sk
import math 

In [2]:
# enter two sentences for the test
sent1 = "Hej med dig."
sent2 = "Hej med dig med mig"

In [3]:
# parse the sentences in words
sent1 = sent1.split(" ")
sent2 = sent2.split(" ")

In [4]:
# join the sets of words to remove duplications
all= set(sent1).union(set(sent2))
print(all)

{'dig', 'Hej', 'mig', 'dig.', 'med'}


In [5]:
# corpus size
len(all)

5

Check and remember where each word appears.
Start from two empty dictionaries, one for each sentence.
Add the words that appear in the sentences, every time they repeat.

In [6]:
# create dict with the words are the keys and the numbers of their appearance as values
dict1 = dict.fromkeys(all, 0) 
for word in sent1:
    dict1[word]+=1
dict1       

{'dig': 0, 'Hej': 1, 'mig': 0, 'dig.': 1, 'med': 1}

In [7]:
dict2 = dict.fromkeys(all, 0)  
for word in sent2:
    dict2[word]+=1
dict2 

{'dig': 1, 'Hej': 1, 'mig': 1, 'dig.': 0, 'med': 2}

In [8]:
# collect the dictionaires in a data frame
df = pd.DataFrame([dict1, dict2])
df

Unnamed: 0,dig,Hej,mig,dig.,med
0,0,1,0,1,1
1,1,1,1,0,2


## Similarity of Two Vectors

In [9]:
# Calculate dot product of two vectors, divide it by the magnitudes to find the cos(angle between them)
# Use the result as a correlation coefficient 
from collections import Counter

def cosine(vector1, vector2):
     # calculate nominator as a dot product
     intersect = set(vector1.keys()) & set(vector2.keys())
     numerator = sum([vector1[x] * vector2[x] for x in intersect])
    
     # calculate the denominator 
     sum1 = sum([vector1[x] ** 2 for x in list(vector1.keys())])
     sum2 = sum([vector2[x] ** 2 for x in list(vector2.keys())])
    
     denominator = math.sqrt(sum1) * math.sqrt(sum2)
     if not denominator:
         return 0.0
     else:
         return float(numerator)/denominator


In [10]:
# calculate the correlation
corr = cosine(dict1, dict2)
print("Similarity: ", corr)

Similarity:  0.6546536707079772


## Importance of Each Word

In [11]:
# a function to compute the TD frequency of appearance of a word in documents
def computeTF(dicto, doc):
    tfDict = {}
    corpus = len(doc) # number of all words
    for word, wcount in dicto.items():
        tfDict[word] = wcount/float(corpus) # calculete the proportion
    return(tfDict)

In [12]:
# call the function for both sets
tf1 = computeTF(dict1, sent1)
tf2 = computeTF(dict2, sent2)

In [13]:
# store into dataframe
tf = pd.DataFrame([tf1, tf2])
tf

Unnamed: 0,dig,Hej,mig,dig.,med
0,0.0,0.333333,0.0,0.333333,0.333333
1,0.2,0.2,0.2,0.0,0.4


In [14]:
# IDF -  inverse of the document frequency which measures the informativeness of term t
#  lower occurance - higher importance of the word
# idf(t) = N/df
# to avoid div by zero: idf(t) = log(N/(df + 1))

def computeIDF(docList):
    idf = {}
    N = len(docList)
    
    idf = dict.fromkeys(docList[0].keys(), 0)
    for word, wcount in idf.items():
        idf[word] = math.log10(N/(float(wcount) + 1))
        
    return(idf)

In [15]:
#inputing our sentences in the log file
idfs = computeIDF([dict1, dict2])

In [16]:
idfs

{'dig': 0.3010299956639812,
 'Hej': 0.3010299956639812,
 'mig': 0.3010299956639812,
 'dig.': 0.3010299956639812,
 'med': 0.3010299956639812}

In [17]:
# calculate tf-idf as a measure for the importance of a word
# tf-idf(t, d) = tf(t, d) * log(N/(df + 1))
def computeTFIDF(tf, idfs):
    tfidf = {}
    for word, wcount in tf.items():
        tfidf[word] = wcount*idfs[word]
    return(tfidf)


In [18]:
#running our two sentences through the IDF:
idf1 = computeTFIDF(tf1, idfs)
idf2 = computeTFIDF(tf2, idfs)


In [19]:
# store in a dataframe
idf= pd.DataFrame([idf1, idf2])
idf

Unnamed: 0,dig,Hej,mig,dig.,med
0,0.0,0.100343,0.0,0.100343,0.100343
1,0.060206,0.060206,0.060206,0.0,0.120412


# Assignment

In [39]:
A = "Mr. Trump became president after winning the political election. Though he lost the support of some republican friends, Trump is friends with President Putin"
B = "Mr. Trump says President Putin had no political interference is the election outcome. He says it was a witchhunt by political parties. He claimed President Putin is a friend who had nothing to do with the election"
C = "post elections vladimir putin became president of russia president putin had served as the prime minister earlier in his political career"

A1 = A.split(" ") 
B1 = B.split(" ") 
C1 = C.split(" ") 

# join the sets of words to remove duplications
all= set(A1).union(set(B1).union(C1))
print(all)

{'lost', 'as', 'prime', 'no', 'a', 'friends', 'is', 'elections', 'served', 'career', 'witchhunt', 'some', 'his', 'winning', 'of', 'the', 'do', 'claimed', 'putin', 'in', 'Trump', 'vladimir', 'President', 'to', 'who', 'post', 'friend', 'had', 'he', 'Though', 'He', 'it', 'outcome.', 'became', 'by', 'election', 'earlier', 'minister', 'political', 'Mr.', 'parties.', 'says', 'friends,', 'russia', 'after', 'nothing', 'was', 'with', 'election.', 'president', 'support', 'interference', 'republican', 'Putin'}


In [40]:
def convertTextToVector(text):
    x = dict.fromkeys(all, 0) 
    for word in text:
        x[word]+=1
    return x

In [45]:
def compareVectors(A1, B1, C1):
    corrAB = cosine(convertTextToVector(A1), convertTextToVector(B1))
    print("Similarity: ", corrAB)
    
    corrAC = cosine(convertTextToVector(A1), convertTextToVector(C1))
    print("Similarity: ", corrAC)
    
    corrBC = cosine(convertTextToVector(B1), convertTextToVector(C1))
    print("Similarity: ", corrBC)
    
    suggestion = ""
    
    return suggestion

In [46]:
suggestion = compareVectors(A1, B1, C1)
print("Text A er skrevet af ")
print("Text B er skrevet af ")
print("Text C er skrevet af ")

Similarity:  0.40050093945740706
Similarity:  0.2645751311064591
Similarity:  0.15894388284780525
Text A er skrevet af 
Text B er skrevet af 
Text C er skrevet af 
