In [1]:
import numpy as np
import pandas as pd
#import os
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot, download_plotlyjs
init_notebook_mode(connected = True)
sns.set(style='white')
sns.set(style='whitegrid', color_codes=True)
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

In [2]:
df = pd.read_csv("../data/sop.csv")
df.head

<bound method NDFrame.head of                                                   sop
0   My goal is to combine my background in physics...
1   The doctoral program will provide me with an o...
2   Growing up, I always loved math and sciences. ...
3   A desire to extend my knowledge and an enthusi...
4   As a child born and raised in Delhi, India, I ...
5   I am applying to Harvards doctoral program in ...
6   When I came to college I wanted to be a doctor...
7   I want to pursue a Ph.D. in Computer Science, ...
8   Having worked as a teacher at Liaocheng Teache...
9   In order to best contribute to the leading que...
10  Certainly, my academic journey has not been wi...
11  During my early studies I had no clue about ar...
12  I am interested in the STRATFOR summer interns...
13  Currently, I am working with USC Professor Bar...
14  I did my undergraduate research on how familia...
15  In the current time, new breakthroughs in tech...
16  It is amazing to me what can be done through t..

In [3]:
import nltk
def lemmatize_string(s):
    from nltk.stem import WordNetLemmatizer
    wnl = WordNetLemmatizer()
    
    list2 = nltk.word_tokenize(s)
    lemmatized_string = ' '.join([wnl.lemmatize(words) for words in list2])
    return lemmatized_string 

In [4]:
l=[]
vectorizer = TfidfVectorizer(stop_words='english')
for i in range(24):
    l.append(lemmatize_string(df.iloc[i]['sop'].lower()))
X = vectorizer.fit_transform(l)


In [5]:
true_k = 5
with open('sop_model_picle','rb') as f:
    model=pickle.load(f)

In [6]:
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
order_centroids

array([[  56, 2505, 2446, ..., 1550, 1556,    0],
       [1968, 1501, 1453, ..., 1521, 1522,    0],
       [ 489, 2059, 1833, ..., 1508, 1511, 1287],
       [ 398, 2562, 1968, ..., 1617, 1618,    0],
       [2010, 1968,  978, ..., 1684, 1683,    0]], dtype=int64)

In [7]:
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :50]:
        print(' %s' % terms[ind])
    print("\n")

Cluster 0:
 academic
 wa
 university
 architecture
 linguistics
 college
 believe
 landscape
 student
 opportunity
 experience
 work
 education
 field
 skill
 master
 research
 grade
 higher
 chinese
 stratfor
 study
 test
 networking
 activity
 urban
 program
 department
 english
 company
 professional
 east
 lack
 number
 landscaping
 design
 pursuit
 bluetooth
 year
 network
 school
 internship
 confidence
 help
 intelligence
 given
 affair
 deal
 ha
 editor


Cluster 1:
 research
 model
 mechanism
 physic
 theory
 dr
 brain
 game
 mathematical
 simulation
 economics
 cell
 project
 neuron
 understand
 physical
 neuroscience
 mathematics
 response
 neural
 wa
 optimal
 usc
 rieke
 process
 pi
 information
 capacity
 computation
 role
 zero
 player
 empirical
 input
 memory
 gene
 proposal
 work
 group
 biological
 result
 shea
 morphology
 fairhall
 brown
 especially
 biophysical
 technique
 dynamic
 insight


Cluster 2:
 computer
 science
 programming
 technology
 wa
 field
 progra

In [8]:
def predict_cluster(s):    
    X = vectorizer.transform([lemmatize_string(s)])
    predicted = model.predict(X)
    return predicted[0]

In [9]:
def get_distance(s,order_centroids,true_k):
    X = vectorizer.transform([lemmatize_string(s)])
    distance=np.zeros((X.shape[0], true_k))
    for k in range(true_k):
        row_norm = np.linalg.norm(X - order_centroids[k, :], axis=1)
        distance[:, k] = np.square(row_norm)
    return distance.tolist()[0]

In [10]:
def get_score(s,order_centroids,true_k):
    d=get_distance(s,order_centroids,true_k)
    s0=21/d[0]
    s1=26/d[1]
    s2=31/d[2]
    s3=14/d[3]
    s4=24/d[4]
    final_score=(s0+s1+s2+s3+s4)/5
    return final_score

In [12]:
def get_final_score(s,order_centroids,true_k):
    ll=[]
    scaler = MinMaxScaler()
    for i in range(24):
        x=get_score(l[i],order_centroids,true_k)
        ll.append(x)
    ll.append(get_score(s,order_centroids,true_k))
    scaled = scaler.fit_transform(np.array(ll).reshape(-1, 1))

    return scaled[len(ll)-1]*5


In [18]:
get_final_score(l[14],order_centroids,true_k)

array([1.32513151])