In [1]:
import numpy
import scipy
import scipy.sparse
import sklearn
import sklearn.cluster
import sklearn.metrics.pairwise
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# Data cleaning

The first step of nearly any machine learning analysis project is data cleaning. This is done in order to allow a larger variety of models to work with a predictable input, such that exceptions (in this case special characters such as quotation marks, '[comma]' and others) will not cause any disturbance in the model. The following code loads the data, 'cleans' it, and afterwards sets the entire cleaned data in an array. Comments are added in the code to improve interpretability.

### Cleaning

In [2]:
## Set an empty list variable

descriptions = []

with open('descriptions.txt', encoding = "utf8") as f:
    for line in f:
        text = line.lower()                                       ## Lowercase all characters
        text = text.replace("[comma]"," ")                        ## Replace [commas] with empty space
        for ch in text:
            if ch < "0" or (ch < "a" and ch > "9") or ch > "z":   ## The cleaning operation happens here, remove all special characters
                text = text.replace(ch," ")
        text = ' '.join(text.split())                             ## Remove double spacing from sentences
        descriptions.append(text)
dataSet = numpy.array(descriptions)
f.close()

### Train set

In [3]:
descriptions2 = []

with open('coco_val.txt', encoding = "utf8") as f:
    for line in f:
        text = line.lower()                                       ## Lowercase all characters
        descriptions2.append(text.split('\n'))

trainSet = numpy.array(descriptions2)
f.close()

print(trainSet[1])
print(trainSet[2])
print(trainSet.shape)

['a young man holding an umbrella next to a herd of cattle' '']
['a young boy barefoot holding an umbrella touching the horn of a cow' '']
(25000, 2)


The cleaned descriptions are now stored in a NumPy array, with one string per description. The following code prints some basic information about this array.

In [4]:
print('The size of our data set: ', dataSet.size)
print('The dimension of our dataset are: ', dataSet.shape)
print('\n')
print('-- 0th element of our dataSet --', '\n', dataSet[0])
print('\n')
print('-- 1st element of our dataSet --', '\n', dataSet[1])

The size of our data set:  1480
The dimension of our dataset are:  (1480,)


-- 0th element of our dataSet -- 
 round face short and overweight likes to wear jeans and sweaters drinks wine at dinner short liberal overweight short hair eats at whole foods does not work our very much


-- 1st element of our dataSet -- 
 jug ears mustache and beard and long sideburns stylish hair no laugh lines eyes are clear no drugs or alcohol confident a little overweight from double chin


### Representation

Since the input is now 'clean', different representations can be built from it, which can in turn be used to train models and obtain classification accuracy measures. Conceptually there are two steps: first, word counting as done by scikit-learn's CountVectorizer (which counts every occurrence of each word), and then TF-IDF weighting (term frequency–inverse document frequency), which down-weights non-informative words such as 'the', 'and' and 'a'. The code below uses TfidfVectorizer, which performs both steps at once.

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
TfIdf_dataSet = vectorizer.fit_transform(dataSet)
print("What our Tf-Idf looks like: ")
print()
print(TfIdf_dataSet[1:2])

What our Tf-Idf looks like: 

  (0, 228)	0.0999230235196548
  (0, 2737)	0.16083047896043273
  (0, 1776)	0.06254106376813398
  (0, 2119)	0.3161753448886334
  (0, 1247)	0.14464356086569088
  (0, 2538)	0.19064057405466286
  (0, 419)	0.18076247495377387
  (0, 2292)	0.1401975901762004
  (0, 3445)	0.21120111824703844
  (0, 3749)	0.2492839547283478
  (0, 2604)	0.3123113475808705
  (0, 2188)	0.2417063139284673
  (0, 2262)	0.264108063599575
  (0, 1420)	0.07045478691841404
  (0, 286)	0.11292286212515924
  (0, 772)	0.21289776024949109
  (0, 1217)	0.2324320311261342
  (0, 2694)	0.1235469355267728
  (0, 180)	0.27051486760744353
  (0, 878)	0.20649095624162253
  (0, 2276)	0.14173141580413545
  (0, 1605)	0.1515561961580996
  (0, 1191)	0.3161753448886334
  (0, 740)	0.1644468421153148


### Cosine similarity

Now we can compute the cosine similarity between each pair of documents (1 means identical direction in TF-IDF space, 0 means no shared terms).

In [6]:
cosineSimilarity = sklearn.metrics.pairwise.cosine_similarity(TfIdf_dataSet)
print(cosineSimilarity)
numpy.argmax(cosineSimilarity, axis = 0)

[[1.         0.0607874  0.01502397 ... 0.04558553 0.06715282 0.09317791]
 [0.0607874  1.         0.07117968 ... 0.06126724 0.0548427  0.0650882 ]
 [0.01502397 0.07117968 1.         ... 0.09200355 0.05217812 0.03591779]
 ...
 [0.04558553 0.06126724 0.09200355 ... 1.         0.06750285 0.05996048]
 [0.06715282 0.0548427  0.05217812 ... 0.06750285 1.         0.21531977]
 [0.09317791 0.0650882  0.03591779 ... 0.05996048 0.21531977 1.        ]]


array([   0,    1,    2, ..., 1477, 1478, 1479])

### KMeans clustering

In [7]:
KMeans = sklearn.cluster.KMeans(n_clusters=296)

In [8]:
KmeansFit = KMeans.fit(TfIdf_dataSet)
labels = KMeans.predict(TfIdf_dataSet)
C = KMeans.cluster_centers_
print(C)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.19625855 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
