In [1]:
import numpy as np
import pandas as pd
#import os
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot, download_plotlyjs
init_notebook_mode(connected = True)
sns.set(style='white')
sns.set(style='whitegrid', color_codes=True)
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

In [2]:
df = pd.read_csv("../data/sop.csv")
df.head

<bound method NDFrame.head of                                                   sop
0   My goal is to combine my background in physics...
1   The doctoral program will provide me with an o...
2   Growing up, I always loved math and sciences. ...
3   A desire to extend my knowledge and an enthusi...
4   As a child born and raised in Delhi, India, I ...
5   I am applying to Harvards doctoral program in ...
6   When I came to college I wanted to be a doctor...
7   I want to pursue a Ph.D. in Computer Science, ...
8   Having worked as a teacher at Liaocheng Teache...
9   In order to best contribute to the leading que...
10  Certainly, my academic journey has not been wi...
11  During my early studies I had no clue about ar...
12  I am interested in the STRATFOR summer interns...
13  Currently, I am working with USC Professor Bar...
14  I did my undergraduate research on how familia...
15  In the current time, new breakthroughs in tech...
16  It is amazing to me what can be done through t..

In [3]:
import nltk
def lemmatize_string(s):
    from nltk.stem import WordNetLemmatizer
    wnl = WordNetLemmatizer()
    
    list2 = nltk.word_tokenize(s)
    lemmatized_string = ' '.join([wnl.lemmatize(words) for words in list2])
    return lemmatized_string 

In [4]:
l=[]
vectorizer = TfidfVectorizer(stop_words='english')
for i in range(24):
    l.append(lemmatize_string(df.iloc[i]['sop'].lower()))
X = vectorizer.fit_transform(l)


In [5]:
true_k = 5
with open('sop_model_picle','rb') as f:
    model=pickle.load(f)

In [6]:
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
order_centroids

array([[  56, 2505, 2446, ..., 1550, 1556,    0],
       [1968, 1501, 1453, ..., 1521, 1522,    0],
       [ 489, 2059, 1833, ..., 1508, 1511, 1287],
       [ 398, 2562, 1968, ..., 1617, 1618,    0],
       [2010, 1968,  978, ..., 1684, 1683,    0]], dtype=int64)

In [7]:
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :50]:
        print(' %s' % terms[ind])
    print("\n")

Cluster 0:
 academic
 wa
 university
 architecture
 linguistics
 college
 believe
 landscape
 student
 opportunity
 experience
 work
 education
 field
 skill
 master
 research
 grade
 higher
 chinese
 stratfor
 study
 test
 networking
 activity
 urban
 program
 department
 english
 company
 professional
 east
 lack
 number
 landscaping
 design
 pursuit
 bluetooth
 year
 network
 school
 internship
 confidence
 help
 intelligence
 given
 affair
 deal
 ha
 editor


Cluster 1:
 research
 model
 mechanism
 physic
 theory
 dr
 brain
 game
 mathematical
 simulation
 economics
 cell
 project
 neuron
 understand
 physical
 neuroscience
 mathematics
 response
 neural
 wa
 optimal
 usc
 rieke
 process
 pi
 information
 capacity
 computation
 role
 zero
 player
 empirical
 input
 memory
 gene
 proposal
 work
 group
 biological
 result
 shea
 morphology
 fairhall
 brown
 especially
 biophysical
 technique
 dynamic
 insight


Cluster 2:
 computer
 science
 programming
 technology
 wa
 field
 progra

In [8]:
def predict_cluster(s):    
    X = vectorizer.transform([lemmatize_string(s)])
    predicted = model.predict(X)
    return predicted[0]

In [9]:
def get_distance(s,order_centroids,true_k):
    X = vectorizer.transform([lemmatize_string(s)])
    distance=np.zeros((X.shape[0], true_k))
    for k in range(true_k):
        row_norm = np.linalg.norm(X - order_centroids[k, :], axis=1)
        distance[:, k] = np.square(row_norm)
    return distance.tolist()[0]

In [10]:
def get_score(s,order_centroids,true_k):
    d=get_distance(s,order_centroids,true_k)
    s0=21/d[0]
    s1=26/d[1]
    s2=31/d[2]
    s3=14/d[3]
    s4=24/d[4]
    final_score=(s0+s1+s2+s3+s4)/5
    return final_score

In [11]:
def get_final_score(s,order_centroids,true_k):
    ll=[]
    scaler = MinMaxScaler()
    for i in range(24):
        x=get_score(l[i],order_centroids,true_k)
        ll.append(x)
    ll.append(get_score(s,order_centroids,true_k))
    scaled = scaler.fit_transform(np.array(ll).reshape(-1, 1))

    return scaled[len(ll)-1]*5


In [15]:
sop_str='After a long day at work, you go to the shopping mart, pick a few items and reach the billingcounter. You now stand facing a queue of probably a “million people” who are all waiting toget their checkout. To the untrained eye and a traditionalist, this is a norm that’s beenfollowed for some decades. But, is there no way to avoid this? On digging deeper for thetechnical article I was writing, I find Amazon Go, a chain of partially automated grocerystores by Amazon that follow a no billing counter policy. This experience opened my eyes toComputer Vision and Networks – the technologies that are the crux of Amazon Go.I applying to Rochester Institute of Technology to pursue a Master’s degree in ComputerScience. My research interests lie in the field of computer vision, artificial intelligence andcomputer networks. My career goal is to contribute to research through academia and believethat graduate school will train me for this. While my undergraduate curriculum has helped mehave a strong computer engineering foundation, there is still a gap between the knowledge Ihave and the knowledge and experience I need to do good work. While my final goal is topursue a Ph.D. degree, a Masters program will help me bridge that gap and offer me moreclarity about where my interests lie.For my sophomore and junior year, I interned at the Indian Institute of Technology Bombay(IIT-B), Codebreak, a financial startup and Indian Institute of Management,Ahmedabad(IIM-A). I also attended two summer schools at the International Institute ofInformation Technology, Hyderabad(IIIT-H) where I got an opportunity to learn and interactwith people in the research community. These experiences helped me realize that I am themost content and satisfied when I get an opportunity to do research. This also motivated meto get exposure in an international setting for which I applied to many programs. My effortswere finally paid off when I was selected for the summer research scholarship programmeorganized by the University of Auckland, New Zealand and will join the university thisDecember for a duration of 10 weeks. My job is to use Artificial Intelligence to identifyearthquake damage in unreinforced masonry buildings with limited emergency services anddeploy help accordingly. This programme will help me boost my research pursuit further.My interest in computer vision first piqued when I wrote that article on Amazon Go. Thisinterest further intensified when I got the opportunity to attend a lecture taken by Prof. SPArun from Indian Institute of Science where he spoke about Moravecs paradox. According tothis paradox, it is the seemingly easy day to day activities humans perform, that are thehardest to compute. This is particularly true for vision-based functions like pose estimationand object detection that are performed easily by the biological system but arecomputationally quite complex.My bachelor thesis is titled “3D reconstruction of rooms using 2D images.” The aim of thethesis is to reconstruct the 3D model of the room from a single 2D panorama image andimplement object detection to increase the contextual information provided to the modelbased on a deep learning approach. Computer vision being an interdisciplinary field gave methe opportunity to work on such a project that has multi-domain applications in indoornavigation, virtual reality, and entertainment. This is a new avenue that displays indoorscenarios more intuitively. I am currently working on a systemization of knowledge (SoK)paper for the same.One of the most interesting subject I studied as a junior was computer networks. To augmentthis exposure with hands-on experience, I applied to several institutes for an internship inetworking. I finally landed one at the Information Systems Department of the IndianInstitute of Management, Ahmedabad, one of the best educational institutes of the country. Iwas one of the first students of my university to get a technical research internship at IIM-A. Iworked on developing a wireless file sharing system based on scalable broadcastalgorithm(SBA). The project was built on OMNet++ using INET as the key framework. Fothe first time, I worked on a large codebase that required me to master multiple files at a time.I got to learn a lot in these two months from reviewing papers to reading books to understandthe working of Ad-Hoc On-Demand Distance Vector (AODV). It was the first time I planned,implemented, tested and debugged all by myself. This experience has been invaluable to myfuture endeavors. It also helped me gained the skills necessary to work further in computernetworks. I am currently assisting Prof. Pratik Kanani on his project that uses differentprobabilistic broadcast methods to reduce network overheadLast summer, I was one of the 16 selected of 250 applicants for the ERPNext Summer ofCode. Using node.js, I worked on the accounting, inventory and purchase modules. Theprogram helped me understand how complex open source software is developed. Along withthis, I was also a part of the Stanford Crowd Research Initiative. The initiative aims atmaking research available to all, by crowdsourcing students and researchers from across theglobe to make technical presentations and talks on various topics. I worked towards providinga comprehensive understanding of the two research papers. I was a contributor to thetechnical presentation ‘ FaceNet: A unified embedding for face recognition and clustering’and ‘Why Does Deep and Cheap Learning Work So Well?’Beyond academics, I was a part of “Learn IT, Girl,” a mentorship program by the Anita BorgInstitute for Women and Technology. I was a JavaScript mentor and my mentee and I workedon a project titled ‘What is privilege?’ It helped people to be more understanding andcompassionate towards each other, fighting stereotypes and biases. It was a great programthat enabled me to mentor fellow females interested in coding. It was a moment of utter joywhen my mentee told me she is applying for the role of a UX developer. I am also the head ofthe newly formed college committee called “Unicode” where I mentored 5 sophomorestudents to develop an application for students to find people with common interests. I alsogave a lecture on front-end development for freshers and sophomores.Along with this, I am also the writer of ‘Computers, Papers, and Everything,’ a blog where Ireview technical papers on a regular basis. The motivation behind this blog was for me toexplain research papers as simply as possible. This also facilitated me with an opportunity tobe acquainted with the interdisciplinary work happening across various domains. Forexample, I was unaware of how machine learning algorithms can be used to make databasequerying more efficient. However, the paper “A Machine Learning Approach to DatabasesIndexes” explains this approach very well. I also realized that while thoroughlyunderstanding a paper is critical, it is time-consuming. It is sometimes better to read themsuperficially and get exposure to a wider range of techniques.My undergraduate program has provided me with a lot of exposure and inculcated an interestin research. I am certain about pursuing a research career in Computer Science. RochesterInstitute of Technology(RIT) will help me launch my future efforts in the right direction anis an ideal choice for me owing to its flexibility. The courses offered at RIT like CSCI-631,CSCI-651 and CSCI-630 will provide me with a stronger background knowledge in the fieldof computer vision, computer networks, and intelligent systems while keeping open thepossibility to learn new things and contribute to your research. I am fully aware of the factthat pursuing a Master’s degree shall require utmost dedication, diligence, and intelligence. Iam confident about my ability to work hard, manage time and accept challengesconstructively. Hence, I would like to be a master’s student at your university.'

In [19]:
sop_rating=get_final_score(sop_str,order_centroids,true_k)
print("SOP rating: "+str(sop_rating.tolist()[0]))

SOP rating: 2.720829156751279
