In [1]:
# Dataset: a toy corpus of three short sentences, one document per list entry.

corpus = [
    "Windows has been around since the mid-1990s.",
    "Windows distribution include the windows kernel.",
    "Windows is one of the most prominent open-source software.",
]

# Echo the corpus so the cell displays it.
corpus

['Windows has been around since the mid-1990s.',
 'Windows distribution include the windows kernel.',
 'Windows is one of the most prominent open-source software.']

In [2]:
# Bag-of-Words model built with CountVectorizer

from sklearn.feature_extraction.text import CountVectorizer

# fit_transform() learns the vocabulary from the corpus and returns the
# document-term counts as a sparse matrix; todense() then converts that
# result into a dense two-dimensional numpy.matrix (one row per document,
# one column per vocabulary token).
vectorizer = CountVectorizer()
vectorized_X = (
    vectorizer
    .fit_transform(corpus)
    .todense()
)

# Display the dense count matrix.
vectorized_X

matrix([[1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1],
        [0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2],
        [0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1]],
       dtype=int64)

In [3]:
# Vocabulary learned by the vectorizer, in column order: the k-th name labels
# the k-th column of the count matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() is used to keep displaying a plain Python list of strings.
vectorizer.get_feature_names_out().tolist()

# The values in the count matrix are not limited to zero and one: each value
# is the number of times the corresponding token/word occurs in the sentence.

['1990s',
 'around',
 'been',
 'distribution',
 'has',
 'include',
 'is',
 'kernel',
 'mid',
 'most',
 'of',
 'one',
 'open',
 'prominent',
 'since',
 'software',
 'source',
 'the',
 'windows']

In [6]:
# Euclidean distance as a measure of how close two documents (count vectors) are

from itertools import combinations

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

# vectorized_X comes from .todense() and is therefore a numpy.matrix, which
# scikit-learn >= 1.2 rejects with a TypeError. Convert it to a plain 2-D
# ndarray first; np.asarray is a no-op if it already is an ndarray.
doc_vectors = np.asarray(vectorized_X)

# combinations() yields every unordered pair (i, j) with i < j exactly once,
# so no document is compared with itself and no pair is reported twice.
for i, j in combinations(range(len(doc_vectors)), 2):
    # Index with a one-element list so each operand stays 2-D (shape (1, n)),
    # as euclidean_distances requires; the result is a 1x1 array.
    jarak = euclidean_distances(doc_vectors[[i]], doc_vectors[[j]])
    print(f'Jarak dokumen {i+1} dan {j+1}: {jarak}')

Jarak dokumen 1 dan 2: [[3.16227766]]
Jarak dokumen 1 dan 3: [[3.74165739]]
Jarak dokumen 2 dan 3: [[3.46410162]]


In [7]:
# Stop-word filtering on text

# Stop-word filtering simplifies the text representation by ignoring certain words such as determiners.
corpus

['Windows has been around since the mid-1990s.',
 'Windows distribution include the windows kernel.',
 'Windows is one of the most prominent open-source software.']

In [9]:
# Stop-word filtering with CountVectorizer

from sklearn.feature_extraction.text import CountVectorizer

# Same Bag-of-Words pipeline as before, but stop_words='english' makes the
# vectorizer drop common English stop words while building the vocabulary.
# fit_transform() returns sparse counts; todense() converts them into a
# dense two-dimensional numpy.matrix.
vectorizer = CountVectorizer(stop_words='english')
vectorized_X = (
    vectorizer
    .fit_transform(corpus)
    .todense()
)

# Display the dense count matrix.
vectorized_X

matrix([[1, 0, 0, 0, 1, 0, 0, 0, 0, 1],
        [0, 1, 1, 1, 0, 0, 0, 0, 0, 2],
        [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]], dtype=int64)

In [10]:
# Vocabulary that remains after stop-word filtering, in column order: the
# k-th name labels the k-th column of the count matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() is used to keep displaying a plain Python list of strings.
vectorizer.get_feature_names_out().tolist()

# The values in the count matrix are not limited to zero and one: each value
# is the number of times the corresponding token/word occurs in the sentence.

['1990s',
 'distribution',
 'include',
 'kernel',
 'mid',
 'open',
 'prominent',
 'software',
 'source',
 'windows']