In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("corpus.csv")
df.head()

Unnamed: 0,Content,Domain
0,"Tourism is travel for pleasure, and the commer...",Tourism
1,A household consists of one or more persons wh...,Household
2,Algorithms and data structures are central to ...,Computer Science
3,"Health care, or healthcare, is the improvement...",Medical
4,Artificial intelligence (AI) is intelligence e...,Computer Science


In [4]:
df.Domain.value_counts()

Domain
Tourism             4
Household           4
Computer Science    4
Medical             4
Name: count, dtype: int64

In [5]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# without preprocessing
w = CountVectorizer()
transformed_output_wdf = w.fit_transform(df.Content)

In [8]:
wt = TfidfVectorizer()
transformed_output_widf = wt.fit_transform(df.Content)
print("Vocabulary without preprocessing: \n")
print(wt.vocabulary_)

Vocabulary without preprocessing: 

{'tourism': 547, 'is': 302, 'travel': 552, 'for': 218, 'pleasure': 409, 'and': 46, 'the': 531, 'commercial': 115, 'activity': 25, 'of': 372, 'providing': 444, 'supporting': 516, 'such': 514, 'un': 562, 'defines': 157, 'more': 351, 'generally': 231, 'in': 282, 'terms': 524, 'which': 596, 'go': 237, 'beyond': 78, 'common': 116, 'perception': 397, 'as': 58, 'being': 77, 'limited': 320, 'to': 543, 'holiday': 263, 'only': 377, 'people': 396, 'travelling': 555, 'staying': 505, 'places': 406, 'outside': 388, 'their': 532, 'usual': 572, 'environment': 199, 'not': 367, 'than': 528, 'one': 376, 'consecutive': 130, 'year': 605, 'leisure': 317, 'less': 318, '24': 9, 'hours': 266, 'business': 92, 'other': 384, 'purposes': 448, 'can': 97, 'be': 70, 'domestic': 178, 'within': 600, 'traveller': 554, 'own': 390, 'country': 142, 'or': 381, 'international': 298, 'has': 247, 'both': 84, 'incoming': 284, 'outgoing': 387, 'implications': 279, 'on': 375, 'balance': 68, 'pa

In [9]:
print("Tf matrix for without preprocessing our document:\n")
print(transformed_output_wdf.toarray())

Tf matrix for without preprocessing our document:

[[0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 1 1 ... 2 0 0]]


In [10]:
print("Tfidf matrix without preprocessing our document: \n")
print(transformed_output_widf.toarray())

Tfidf matrix without preprocessing our document: 

[[0.         0.         0.         ... 0.         0.         0.10092953]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.06894275 0.06894275 0.06894275 ... 0.10744827 0.         0.        ]]


In [11]:
dataframe = pd.DataFrame(transformed_output_widf.toarray(), columns= wt.get_feature_names_out())
dataframe_sum = dataframe.sum(axis = 0)
terms_wpp = dataframe_sum.sort_values(ascending=False).head(5)

In [12]:
# with preprocessing
#preprocessing function
def preprocess(text):
    type(text)
    doc = nlp(text)
    output = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(output)
rows,columns = df.shape
for i in range(rows):
    df.at[i,'Content'] = preprocess(df.Content[i])

In [13]:
pdf = CountVectorizer()
transformed_output_df = pdf.fit_transform(df.Content)

In [14]:
pt = TfidfVectorizer()
transformed_output_idf = pt.fit_transform(df.Content)
print("Vocabulary withpreprocessing: \n")
print(pt.vocabulary_)

Vocabulary withpreprocessing: 

{'tourism': 428, 'travel': 433, 'pleasure': 320, 'commercial': 89, 'activity': 24, 'provide': 350, 'support': 410, 'un': 439, 'define': 127, 'generally': 187, 'term': 416, 'common': 90, 'perception': 310, 'limit': 257, 'holiday': 213, 'people': 309, 'stay': 404, 'place': 317, 'outside': 303, 'usual': 447, 'environment': 161, 'consecutive': 102, 'year': 469, 'leisure': 255, '24': 9, 'hour': 216, 'business': 70, 'purposes': 354, 'domestic': 145, 'traveller': 434, 'country': 113, 'international': 242, 'incoming': 228, 'outgoing': 302, 'implication': 224, 'balance': 55, 'payment': 308, 'household': 218, 'consist': 104, 'person': 312, 'live': 260, 'dwelling': 149, 'single': 390, 'family': 171, 'type': 437, 'group': 198, 'basic': 56, 'unit': 443, 'analysis': 37, 'social': 396, 'microeconomic': 276, 'government': 194, 'model': 279, 'important': 225, 'economic': 152, 'inheritance': 235, 'algorithm': 33, 'datum': 124, 'structure': 408, 'central': 79, 'computer': 

In [15]:
print("Tf matrix after preprocessing our document: \n")
print(transformed_output_df.toarray())

Tf matrix after preprocessing our document: 

[[0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 1 1 ... 2 0 0]]


In [16]:
print("Tfidf matrix after preprocessing our document: \n")
print(transformed_output_idf.toarray())

Tfidf matrix after preprocessing our document: 

[[0.         0.         0.         ... 0.         0.         0.12052996]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.07759623 0.07759623 0.07759623 ... 0.12093484 0.         0.        ]]


In [17]:
idf = pd.DataFrame(transformed_output_idf.toarray(), columns= pt.get_feature_names_out())
idf_sum = idf.sum(axis = 0)
terms_p = idf_sum.sort_values(ascending=False).head(5)

In [18]:
print("Top 5 words without processing \n",terms_wpp)
print("Top 5 words with preprocessing \n",terms_p)

Top 5 words without processing 
 the    2.290336
and    2.135853
of     1.911805
in     1.174611
is     1.104042
dtype: float64
Top 5 words with preprocessing 
 computer    1.241585
house       1.163772
tourism     1.038174
india       1.029987
health      0.821475
dtype: float64


In [19]:
query1 = ["Health is a fundamental right. The World Health Organization (WHO) defines health as a state of complete physical, mental, and social well-being. It's a fundamental right for everyone, regardless of race, religion, political belief, economic status, or social condition."]

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
arr = cosine_similarity(pt.transform(query1),transformed_output_idf)

In [22]:
print(arr)

[[0.         0.14758044 0.         0.51225715 0.         0.01557631
  0.0831142  0.1648227  0.05692049 0.08557492 0.         0.
  0.         0.         0.         0.02542188]]


In [23]:
import numpy as np

In [24]:
print(query1 ," = ",df.Domain[np.argmax(arr)])

["Health is a fundamental right. The World Health Organization (WHO) defines health as a state of complete physical, mental, and social well-being. It's a fundamental right for everyone, regardless of race, religion, political belief, economic status, or social condition."]  =  Medical


In [25]:
query2 = ["Himachal Pradesh is a very beautiful place for tourists to visit"]
arr = cosine_similarity(pt.transform(query2),transformed_output_idf)
print(arr)

[[0.04421035 0.         0.         0.         0.         0.22340756
  0.         0.         0.         0.50804356 0.         0.
  0.         0.         0.         0.05692455]]


In [26]:
print(query2," = ",df.Domain[np.argmax(arr)])

['Himachal Pradesh is a very beautiful place for tourists to visit']  =  Tourism
