In [1]:
import os
import sys
import sklearn as sl
import scipy as sp
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# The min_df parameter (minimum document frequency) determines how CountVectorizer
# treats rarely occurring words.
vectorizer = CountVectorizer(min_df=1)
# Print the vectorizer's repr to inspect its full parameter set.
print(vectorizer)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [3]:
# Two toy documents used to illustrate the bag-of-words representation.
content = ["How to format my hard disk", "Hard disk format problems"]
# Learn the vocabulary and build the document-term count matrix in one step.
X = vectorizer.fit_transform(content)

# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# when the installed version provides it and fall back to the legacy method otherwise.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
# list(...) keeps the displayed value a plain list of strings on every version.
list(feature_names)

['disk', 'format', 'hard', 'how', 'my', 'problems', 'to']

In [36]:
# Show the dense count matrix transposed, so each row is a vocabulary term
# and each column is one of the documents.
print(X.toarray().T)
# Echo the source documents next to the matrix for reference.
print(content[:2])

[[1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 0]
 [0 1]
 [1 0]]
['How to format my hard disk', 'Hard disk format problems']


In [12]:
# Directory holding the toy posts, one post per file.
# NOTE(review): absolute, machine-specific path; consider making it configurable.
ddepot = "/Users/SergeySedykh/Documents/GitHub/ST/DATA"

# Read every regular file in the directory.
# - sorted(): os.listdir() returns entries in arbitrary order, so sorting keeps the
#   post indices reproducible between runs and machines.
# - with-block: closes each file handle instead of leaking it.
# - isfile(): skips sub-directories, which open() cannot read.
posts = []
for f in sorted(os.listdir(ddepot)):
    post_path = os.path.join(ddepot, f)
    if not os.path.isfile(post_path):
        continue
    with open(post_path) as post_file:
        posts.append(post_file.read())

# CountVectorizer is already imported in the first cell; drop English stop words here.
vectorizer = CountVectorizer(min_df=1, stop_words='english')

X_train = vectorizer.fit_transform(posts)
num_samples, num_features = X_train.shape

# Report the number of documents (samples) and distinct vocabulary terms (features).
print("#samples: %d, features: %d" % (num_samples, num_features))

#samples: 5, features: 18


In [13]:
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# when the installed version provides it and fall back to the legacy method otherwise.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
# list(...) keeps the printed value a plain list of strings on every version.
print(list(feature_names))

['actually', 'capabilities', 'contains', 'data', 'databases', 'images', 'imaging', 'interesting', 'learning', 'machine', 'permanently', 'post', 'provide', 'save', 'storage', 'store', 'stuff', 'toy']


In [37]:
# Vectorize an unseen query with the already-fitted vectorizer (transform only, no refit).
# NOTE(review): this cell's execution count is out of sequence and its stored output lists
# columns 4 and 5, while the dense print in the next cell shows columns 4 and 6. The stored
# output presumably came from a different fitted vectorizer; re-run top to bottom to confirm.
new_post = "imaging databases"
new_post_vec = vectorizer.transform([new_post])

# Print the sparse vector for the query.
print(new_post_vec)

  (0, 4)	1
  (0, 5)	1


In [15]:
# Densify the query vector so every vocabulary position is visible, zeros included.
print(new_post_vec.toarray())

[[0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0]]


In [16]:
def dist_raw(v1, v2):
    """Return the norm of the difference between two sparse vectors.

    Both arguments must support subtraction and the result must expose
    ``toarray()`` (as SciPy sparse matrices do). No normalization is applied,
    so vectors with larger counts lie further apart.
    """
    # Densify the sparse difference, then take its norm in a single expression.
    return sp.linalg.norm((v1 - v2).toarray())

print(dist_raw)

<function dist_raw at 0x115782d90>


In [17]:
# Find the training post closest to new_post using the raw (un-normalized) distance.
best_dist = float("inf")  # float sentinel: any real distance compares smaller
best_i = None

for i in range(num_samples):
    post = posts[i]
    if post == new_post:
        # Skip a post whose text is exactly the query itself.
        continue
    post_vec = X_train.getrow(i)
    d = dist_raw(post_vec, new_post_vec)
    print("=== Post %i with dist = %.2f: %s" % (i, d, post))
    if d < best_dist:
        best_dist = d
        best_i = i

if best_i is None:
    # No candidate was compared (no posts, or all were skipped);
    # formatting None with "%i" would raise TypeError.
    print("No candidate post found")
else:
    print("Best post is %i, with dist = %.2f" % (best_i, best_dist))

=== Post 0 with dist = 3.16: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist = 1.73: Imaging databases provide storage capabilities.
=== Post 2 with dist = 1.73: Most imaging databases save images permanently.

=== Post 3 with dist = 1.41: Imaging databases store data.
=== Post 4 with dist = 5.10: Imaging databases store data. Imaging databases store data. Imaging databases store data.
Best post is 3, with dist = 1.41


In [18]:
# Print the dense count vectors of posts 3 and 4 side by side for comparison.
print(X_train.getrow(3).toarray())
print(X_train.getrow(4).toarray())

[[0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0]]
[[0 0 0 3 3 0 3 0 0 0 0 0 0 0 0 3 0 0]]


In [19]:
def dist_norm(v1, v2):
    """Return the distance between two sparse vectors after length normalization.

    Each vector is divided by its own norm before the difference is taken, so
    vectors that differ only by a positive scale factor end up at distance 0.
    Both arguments must expose ``toarray()`` and support division by a scalar
    (as SciPy sparse matrices do).
    """
    # Scale each input to unit length, first v1 and then v2.
    unit_vectors = [vec / sp.linalg.norm(vec.toarray()) for vec in (v1, v2)]
    return sp.linalg.norm((unit_vectors[0] - unit_vectors[1]).toarray())

In [20]:
# As we can see, raw word counts alone are not enough: the vectors must be
# normalized to unit length. Starting from dist_raw, we now compute the distance
# between the normalized vectors rather than the original ones (dist_norm; see p. 76).

best_dist = float("inf")  # float sentinel: any real distance compares smaller
best_i = None

for i in range(num_samples):
    post = posts[i]
    if post == new_post:
        # Skip a post whose text is exactly the query itself.
        continue
    post_vec = X_train.getrow(i)
    d = dist_norm(post_vec, new_post_vec)
    print("=== Post %i with dist = %.2f: %s" % (i, d, post))
    if d < best_dist:
        best_dist = d
        best_i = i

if best_i is None:
    # No candidate was compared (no posts, or all were skipped);
    # formatting None with "%i" would raise TypeError.
    print("No candidate post found")
else:
    print("Best post is %i, with dist = %.2f" % (best_i, best_dist))

=== Post 0 with dist = 1.41: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist = 0.86: Imaging databases provide storage capabilities.
=== Post 2 with dist = 0.86: Most imaging databases save images permanently.

=== Post 3 with dist = 0.77: Imaging databases store data.
=== Post 4 with dist = 0.77: Imaging databases store data. Imaging databases store data. Imaging databases store data.
Best post is 3, with dist = 0.77


In [21]:
# Resolve the toy-data directory itself. The previous expression took
# os.path.dirname() of the data path, so DATA_DIR pointed at the parent folder
# and the existence check below tested the wrong location.
DATA_DIR = os.path.realpath(ddepot)
# isdir() rather than exists(): a regular file at this path is not a usable data directory.
if not os.path.isdir(DATA_DIR):
    print("Uh, we were expecting a data directory, which contains the toy data")
    sys.exit(1)

#print (DATA_DIR)

In [22]:
import nltk

In [23]:
# Build an English Snowball stemmer and try it on a sample word;
# the last expression is displayed as the cell's output.
s=nltk.stem.SnowballStemmer('english')
s.stem('graphics')

'graphic'

In [39]:
# Using the vectorizer together with the stemmer from the NLTK library.

# Directory holding the toy posts, one post per file.
ddepot = "/Users/SergeySedykh/Documents/GitHub/ST/DATA"

# Read every regular file in the directory.
# - sorted(): os.listdir() returns entries in arbitrary order, so sorting keeps the
#   post indices reproducible between runs and machines.
# - with-block: closes each file handle instead of leaking it.
# - isfile(): skips sub-directories, which open() cannot read.
posts = []
for f in sorted(os.listdir(ddepot)):
    post_path = os.path.join(ddepot, f)
    if not os.path.isfile(post_path):
        continue
    with open(post_path) as post_file:
        posts.append(post_file.read())

# Shared stemmer instance, looked up by the analyzer each time it runs.
english_stemmer = nltk.stem.SnowballStemmer('english')


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant that stems every token with the English Snowball stemmer."""

    def build_analyzer(self):
        """Return a callable that runs the parent analyzer on a document and stems each token.

        The returned callable yields the stemmed tokens lazily, as a generator.
        """
        base_analyzer = super().build_analyzer()

        def stemmed_tokens(doc):
            # The parent analyzer produces the tokens; stem them one by one.
            return (english_stemmer.stem(token) for token in base_analyzer(doc))

        return stemmed_tokens
# Fit the stemming vectorizer on the posts, dropping English stop words.
vectorizer = StemmedCountVectorizer(min_df=1, stop_words='english')

X_train = vectorizer.fit_transform(posts)
num_samples, num_features = X_train.shape

# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# when the installed version provides it and fall back to the legacy method otherwise.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
# list(...) keeps the printed value a plain list of strings on every version.
print(list(feature_names))
# Report the number of documents (samples) and distinct stemmed terms (features).
print("#samples: %d, features: %d" % (num_samples, num_features))


['actual', 'capabl', 'contain', 'data', 'databas', 'imag', 'interest', 'learn', 'machin', 'perman', 'post', 'provid', 'save', 'storag', 'store', 'stuff', 'toy']
#samples: 5, features: 17


In [40]:
def dist_norm(v1, v2):
    """Return the distance between two sparse vectors after length normalization.

    Each vector is divided by its own norm before the difference is taken, so
    vectors that differ only by a positive scale factor end up at distance 0.
    Both arguments must expose ``toarray()`` and support division by a scalar
    (as SciPy sparse matrices do).
    """
    # Scale each input to unit length, first v1 and then v2.
    unit_vectors = [vec / sp.linalg.norm(vec.toarray()) for vec in (v1, v2)]
    return sp.linalg.norm((unit_vectors[0] - unit_vectors[1]).toarray())

In [41]:
# Vectorize the query with the currently fitted vectorizer (transform only, no refit).
new_post = "imaging databases"
new_post_vec = vectorizer.transform([new_post])

# Print the sparse vector for the query.
print(new_post_vec)

  (0, 4)	1
  (0, 5)	1


In [43]:
# Find the training post closest to new_post using the length-normalized distance.
best_dist = float("inf")  # float sentinel: any real distance compares smaller
best_i = None

for i in range(num_samples):
    post = posts[i]
    if post == new_post:
        # Skip a post whose text is exactly the query itself.
        continue
    post_vec = X_train.getrow(i)
    d = dist_norm(post_vec, new_post_vec)
    print("=== Post %i with dist = %.2f: %s" % (i, d, post))
    if d < best_dist:
        best_dist = d
        best_i = i

if best_i is None:
    # No candidate was compared (no posts, or all were skipped);
    # formatting None with "%i" would raise TypeError.
    print("No candidate post found")
else:
    print("Best post is %i, with dist = %.2f" % (best_i, best_dist))

=== Post 0 with dist = 1.41: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist = 0.86: Imaging databases provide storage capabilities.
=== Post 2 with dist = 0.63: Most imaging databases save images permanently.

=== Post 3 with dist = 0.77: Imaging databases store data.
=== Post 4 with dist = 0.77: Imaging databases store data. Imaging databases store data. Imaging databases store data.
Best post is 2, with dist = 0.63
