In [1]:
import pandas as pd
import sklearn as sk
import math 

In [3]:
sent_1 = "Our aim is to develop a good work culture among students, a culture where students from various technical backgrounds come together to teach, guide and collaborate with each other on various projects and grow together."

sent_2 = "Keeping in mind the interest of the IT professionals and computer enthusiasts, CSI works towards making the profession an area of choice amongst all sections of the society. The promotion of Information Technology as a profession is the top priority of CSI today. To fulfill this objective, the CSI regularly organizes conferences, conventions, lectures, projects, and awards. And at the same time, it also ensures that regular training and skill updating are organized for the future IT professionals."

# split so each word have their own string
first_sent = sent_1.split(" ")
second_sent = sent_2.split(" ")

# join them to remove common duplicate words
total= set(first_sent).union(set(second_sent))

print(total)

{'skill', 'other', 'organizes', 'top', 'are', 'also', 'that', 'collaborate', 'society.', 'among', 'at', 'professionals.', 'the', 'towards', 'for', 'priority', 'students,', 'The', 'IT', 'students', 'various', 'teach,', 'ensures', 'guide', 'backgrounds', 'training', 'this', 'computer', 'in', 'of', 'come', 'works', 'Information', 'area', 'projects,', 'aim', 'awards.', 'as', 'with', 'Keeping', 'CSI', 'To', 'an', 'fulfill', 'same', 'work', 'together', 'choice', 'updating', 'and', 'organized', 'develop', 'to', 'together.', 'enthusiasts,', 'conventions,', 'culture', 'where', 'all', 'objective,', 'promotion', 'time,', 'from', 'regularly', 'projects', 'sections', 'it', 'is', 'a', 'Technology', 'Our', 'making', 'interest', 'today.', 'on', 'lectures,', 'technical', 'each', 'profession', 'professionals', 'amongst', 'conferences,', 'future', 'good', 'mind', 'grow', 'regular', 'And'}


In [4]:
# counting the words
wordDictA = dict.fromkeys(total, 0) 
wordDictB = dict.fromkeys(total, 0)

for word in first_sent:
    wordDictA[word]+=1
    
for word in second_sent:
    wordDictB[word]+=1

In [5]:
# view result in the form of dataframe
pd.DataFrame([wordDictA, wordDictB])

Unnamed: 0,skill,other,organizes,top,are,also,that,collaborate,society.,among,...,profession,professionals,amongst,"conferences,",future,good,mind,grow,regular,And
0,0,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,1,0,1,0,0
1,1,0,1,1,1,1,1,0,1,0,...,2,1,1,1,1,0,1,0,1,1


In [9]:
# TF function
def computeTF(wordDict, doc):
    tfDict = {}
    corpusCount = len(doc)
    for word, count in wordDict.items():tfDict[word] = count/float(corpusCount)
    return(tfDict)

# running our sentences through the tf function:
tfFirst = computeTF(wordDictA, first_sent)

tfSecond = computeTF(wordDictB, second_sent)

# Converting to dataframe for visualization
tf = pd.DataFrame([tfFirst, tfSecond])
tf

Unnamed: 0,skill,other,organizes,top,are,also,that,collaborate,society.,among,...,profession,professionals,amongst,"conferences,",future,good,mind,grow,regular,And
0,0.0,0.028571,0.0,0.0,0.0,0.0,0.0,0.028571,0.0,0.028571,...,0.0,0.0,0.0,0.0,0.0,0.028571,0.0,0.028571,0.0,0.0
1,0.012821,0.0,0.012821,0.012821,0.012821,0.012821,0.012821,0.0,0.012821,0.0,...,0.025641,0.012821,0.012821,0.012821,0.012821,0.0,0.012821,0.0,0.012821,0.012821


In [12]:
# removing stopwords from the list

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
filtered_sentence = [w for w in wordDictA if not w in stop_words]

print('\nAfter stop word removal:\n', filtered_sentence)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

After stop word removal:
 ['skill', 'organizes', 'top', 'also', 'collaborate', 'society.', 'among', 'professionals.', 'towards', 'priority', 'students,', 'The', 'IT', 'students', 'various', 'teach,', 'ensures', 'guide', 'backgrounds', 'training', 'computer', 'come', 'works', 'Information', 'area', 'projects,', 'aim', 'awards.', 'Keeping', 'CSI', 'To', 'fulfill', 'work', 'together', 'choice', 'updating', 'organized', 'develop', 'together.', 'enthusiasts,', 'conventions,', 'culture', 'objective,', 'promotion', 'time,', 'regularly', 'projects', 'sections', 'Technology', 'Our', 'making', 'interest', 'today.', 'lectures,', 'technical', 'profession', 'professionals', 'amongst', 'conferences,', 'future', 'good', 'mind', 'grow', 'regular', 'And']


In [15]:
# IDF function
def computeIDF(docList):
    idfDict = {}
    N = len(docList)   
    idfDict = dict.fromkeys(docList[0].keys(), 0)
    for word, val in idfDict.items(): idfDict[word] = math.log10(N / (float(val) + 1))     
    return(idfDict)

#inputing our sentences in the log file
idfs = computeIDF([wordDictA, wordDictB])
print(idfs)

{'skill': 0.3010299956639812, 'other': 0.3010299956639812, 'organizes': 0.3010299956639812, 'top': 0.3010299956639812, 'are': 0.3010299956639812, 'also': 0.3010299956639812, 'that': 0.3010299956639812, 'collaborate': 0.3010299956639812, 'society.': 0.3010299956639812, 'among': 0.3010299956639812, 'at': 0.3010299956639812, 'professionals.': 0.3010299956639812, 'the': 0.3010299956639812, 'towards': 0.3010299956639812, 'for': 0.3010299956639812, 'priority': 0.3010299956639812, 'students,': 0.3010299956639812, 'The': 0.3010299956639812, 'IT': 0.3010299956639812, 'students': 0.3010299956639812, 'various': 0.3010299956639812, 'teach,': 0.3010299956639812, 'ensures': 0.3010299956639812, 'guide': 0.3010299956639812, 'backgrounds': 0.3010299956639812, 'training': 0.3010299956639812, 'this': 0.3010299956639812, 'computer': 0.3010299956639812, 'in': 0.3010299956639812, 'of': 0.3010299956639812, 'come': 0.3010299956639812, 'works': 0.3010299956639812, 'Information': 0.3010299956639812, 'area': 0.3

In [16]:
# calculating TF-IDF
def computeTFIDF(tfBow, idfs):
    tfidf = {}
    for word, val in tfBow.items(): tfidf[word] = val*idfs[word]
    return(tfidf)

#running our two sentences through the IDF:
idfFirst = computeTFIDF(tfFirst, idfs)
idfSecond = computeTFIDF(tfSecond, idfs)

#putting it in a dataframe
idf= pd.DataFrame([idfFirst, idfSecond])
print(idf)

      skill     other  organizes       top       are      also      that  \
0  0.000000  0.008601   0.000000  0.000000  0.000000  0.000000  0.000000   
1  0.003859  0.000000   0.003859  0.003859  0.003859  0.003859  0.003859   

   collaborate  society.     among  ...  profession  professionals   amongst  \
0     0.008601  0.000000  0.008601  ...    0.000000       0.000000  0.000000   
1     0.000000  0.003859  0.000000  ...    0.007719       0.003859  0.003859   

   conferences,    future      good      mind      grow   regular       And  
0      0.000000  0.000000  0.008601  0.000000  0.008601  0.000000  0.000000  
1      0.003859  0.003859  0.000000  0.003859  0.000000  0.003859  0.003859  

[2 rows x 88 columns]


In [17]:
# using sklearn library for TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# make sure all words are in lowercase

sentence_1 = "Developing a competitive culture where the students polish technical and professional attributes, gain experience and learn new skills while upgrading the already present skillset. For those fledglings who have a zeal to build a strong profile and are hunting for their Ikigai, CSI provides ample opportunities for those individuals too."
sentence_2 = "Personalized career guidance, Regular Logic and aptitude building activities, Industrial level project collaboration, Building a network with active collaborations across the globe, Periodic member exclusive conferences and seminars, Created a community for sharing skills and knowledge"

# calling the TfidfVectorizer
vectorize= TfidfVectorizer()

# fitting the model and passing our sentences 
tfidf = vectorize.fit_transform([sentence_1.lower(), sentence_2.lower()])

In [18]:
print(tfidf)

  (0, 61)	0.13915271943780658
  (0, 31)	0.13915271943780658
  (0, 40)	0.13915271943780658
  (0, 4)	0.13915271943780658
  (0, 48)	0.13915271943780658
  (0, 18)	0.13915271943780658
  (0, 30)	0.13915271943780658
  (0, 58)	0.13915271943780658
  (0, 29)	0.13915271943780658
  (0, 7)	0.13915271943780658
  (0, 46)	0.13915271943780658
  (0, 54)	0.13915271943780658
  (0, 9)	0.13915271943780658
  (0, 60)	0.13915271943780658
  (0, 67)	0.13915271943780658
  (0, 28)	0.13915271943780658
  (0, 65)	0.13915271943780658
  (0, 23)	0.13915271943780658
  (0, 59)	0.27830543887561315
  (0, 24)	0.2970249178760062
  (0, 53)	0.13915271943780658
  (0, 44)	0.13915271943780658
  (0, 3)	0.13915271943780658
  (0, 62)	0.13915271943780658
  (0, 64)	0.13915271943780658
  :	:
  (1, 21)	0.16649349332910351
  (1, 37)	0.16649349332910351
  (1, 41)	0.16649349332910351
  (1, 26)	0.16649349332910351
  (1, 0)	0.16649349332910351
  (1, 13)	0.16649349332910351
  (1, 1)	0.16649349332910351
  (1, 66)	0.16649349332910351
  (1, 38)	0