In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

def _clean_description(line):
    """Return one raw description line normalized for TF-IDF vectorization.

    The line is lowercased, the literal "[comma]" placeholder is replaced by a
    single space, every character other than '0'-'9' and 'a'-'z' is replaced
    by a space, and runs of whitespace are collapsed (which also strips
    leading/trailing whitespace, including the newline).
    """
    text = line.lower().replace("[comma]", " ")
    # One pass over the characters instead of one str.replace() call per
    # character, which rescanned the whole line every time.
    text = "".join(
        ch if ("0" <= ch <= "9" or "a" <= ch <= "z") else " " for ch in text
    )
    return " ".join(text.split())


# One cleaned document per line of the input file; blank lines are kept as
# empty documents so that list positions still match line numbers.
with open('descriptions.txt', encoding="utf8") as f:
    documents = [_clean_description(line) for line in f]


# TF-IDF weighted bag of words with scikit-learn's built-in English stop-word
# list removed.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

# Number of clusters requested from k-means.
true_k = 296
# NOTE(review): with n_init=1 and no random_state, the clustering is seeded
# randomly, so cluster numbering and contents will presumably differ between
# runs -- pass random_state=<int> if reproducible output is needed.
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)



KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=296, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [2]:
# Report the ten highest-weighted vocabulary terms of every cluster centroid,
# then show which cluster two sample texts are assigned to.
print("Top terms per cluster:")
# argsort() is ascending; reversing the columns puts the largest centroid
# weights first in each row.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name on installations that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()
for i in range(true_k):
    # The trailing commas and bare `print` used here previously are Python 2
    # idioms that have no effect in Python 3; end="" / print() give the
    # intended one-line-per-cluster layout.
    print("Cluster %d:" % i, end="")
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end="")
    print()

print("\n")
print("Prediction")

# Cluster assigned to the first (already cleaned) training document.
Y = vectorizer.transform([documents[0]])
prediction = model.predict(Y)
print(prediction)

# Cluster assigned to a new sentence, passed to the fitted vectorizer as-is.
Y = vectorizer.transform(["My cat is hungry."])
prediction = model.predict(Y)
print(prediction)

Top terms per cluster:
Cluster 0:
 practicality
 carefree
 faced
 standing
 tied
 baby
 important
 simple
 professional
 20s
Cluster 1:
 nerd
 ugly
 probably
 bit
 meek
 looks
 shy
 head
 good
 paying
Cluster 2:
 appears
 weight
 dark
 healthy
 hair
 highlights
 short
 boxing
 cars
 brown
Cluster 3:
 kind
 gamestop
 clerk
 gone
 blocky
 cheerful
 dimples
 stay
 teeth
 bob
Cluster 4:
 working
 chances
 aside
 hard
 job
 decided
 longish
 started
 afford
 income
Cluster 5:
 nice
 people
 slender
 likes
 friendly
 sideburns
 slight
 new
 face
 smile
Cluster 6:
 long
 fingers
 child
 works
 arms
 tall
 legs
 circles
 person
 hunting
Cluster 7:
 black
 pimples
 color
 sharpe
 dot
 eye
 body
 brow
 skin
 lean
Cluster 8:
 health
 education
 collar
 average
 fair
 worker
 mid
 skin
 healthy
 hair
Cluster 9:
 good
 sun
 like
 confident
 physical
 environmentally
 aware
 hygiene
 shaven
 smooth
Cluster 10:
 weekend
 reddish
 probably
 think
 really
 person
 video
 games
 kind
 sacrifice
Cluster 

 financially
 brown
 cares
 caucasian
 eyes
 cut
 family
Cluster 134:
 attitude
 covered
 green
 looks
 20
 ears
 white
 direct
 potential
 female
Cluster 135:
 likes
 wears
 time
 tries
 clothing
 average
 lot
 play
 friends
 regular
Cluster 136:
 probably
 person
 fit
 skinny
 taller
 average
 likes
 work
 studies
 shady
Cluster 137:
 like
 games
 large
 black
 play
 board
 weight
 video
 television
 drive
Cluster 138:
 freckles
 skin
 fair
 red
 blue
 student
 eyes
 hair
 school
 friends
Cluster 139:
 worries
 wear
 highlighted
 early
 20
 attraction
 lot
 dull
 hope
 future
Cluster 140:
 heterosexual
 highschool
 height
 average
 quiet
 dark
 smart
 effort
 teenager
 young
Cluster 141:
 public
 transportation
 live
 work
 job
 average
 time
 height
 brown
 ear
Cluster 142:
 person
 feature
 belong
 based
 list
 type
 warm
 overweight
 guessing
 soft
Cluster 143:
 beautiful
 person
 nice
 excellent
 face
 peace
 comes
 great
 upset
 eyes
Cluster 144:
 criminal
 difficult
 inch
 mind

 haircut
 weekends
 parties
Cluster 270:
 want
 efforts
 rewarded
 judged
 execute
 performance
 individual
 based
 wears
 jacket
Cluster 271:
 sleeve
 watching
 inches
 mma
 players
 knee
 stopped
 feet
 took
 tattoo
Cluster 272:
 positioned
 unmistakable
 dumb
 folk
 follows
 hairs
 cause
 bold
 order
 center
Cluster 273:
 individual
 based
 middle
 skin
 slightly
 condition
 forming
 forehead
 probably
 class
Cluster 274:
 help
 round
 face
 friendly
 socially
 red
 complexion
 white
 likes
 link
Cluster 275:
 possibly
 year
 old
 appearance
 female
 dirty
 built
 mother
 cares
 employed
Cluster 276:
 african
 american
 curly
 sports
 wide
 braided
 jewish
 student
 steadfast
 theater
Cluster 277:
 sissy
 puberty
 picked
 reached
 considered
 gets
 slight
 school
 build
 person
Cluster 278:
 hooters
 asks
 eyesbrows
 chiseled
 curve
 questions
 hates
 football
 chin
 people
Cluster 279:
 unfortunate
 american
 probably
 big
 looking
 vaguely
 develop
 entertainments
 improved
 preju