In [116]:
import pandas as pd
import sklearn
import numpy as np
import nltk
import re
import time
import codecs

In [117]:
# Read test-data.csv (path is relative to the current working directory)
# into the `idea` DataFrame used by the rest of the notebook.
idea = pd.read_csv('./test-data.csv')

In [118]:
# Show full cell contents instead of truncating long text columns.
# `None` is the supported "no limit" value; the legacy `-1` sentinel is
# deprecated (it triggers a warning) and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

  """Entry point for launching an IPython kernel.


In [119]:
# Preview the first rows of the loaded frame to check its columns.
idea.head()

Unnamed: 0,_id,idea_description
0,5e8f0b90bab69f6010a9d925,This action thriller movie is going to be a hit
1,5e8f0c03bab69f6010a9d926,Distributed scalable idea sharing platform
2,5e90ef1a7dbb6e6d3721cbef,Anti-malarial drug could be effective in COVID-19. It is not a potential drug but could be tried and tested
3,5e90f7b90907d96d3e8cd1a6,Binary search algorithm could be effective in expediating the testing of COVID-19.
4,5eb392cdc6b564c77d167a20,Israeli lab confirms breakthrough with virus antibody\t but says months till cure


In [120]:
# Display the `idea_description` column as loaded, before the cleaning step
# further down is applied.
idea['idea_description']

0      This action thriller movie is going to be a hit                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
1      Distributed scalable idea sharing platform                                                                                                                                                                         

In [121]:
def clean_text(text):
    """Normalize a description to lowercase ASCII letters separated by spaces.

    Every run of characters outside ``a-z``/``A-Z`` (digits, punctuation,
    whitespace, form feeds, newlines, non-ASCII characters, ...) is collapsed
    into a single space, and the result is lowercased.

    Parameters
    ----------
    text : str
        Raw text. Non-string values (e.g. ``None`` or the float ``NaN`` that
        pandas uses for a missing cell) are treated as empty text.

    Returns
    -------
    str
        The cleaned text. A leading or trailing run of non-letters is kept as
        a single space, exactly as the substitution produces it.
    """
    if not isinstance(text, str):
        # Missing cells would otherwise crash inside re.sub(); there is
        # nothing to clean, so return an empty document.
        return ''
    # The previous version also looped over '\x0c' and '\n' calling
    # text.replace(), but discarded the result (str is immutable). Those
    # characters are non-letters, so the substitution below already turns
    # them into spaces; the dead loop is dropped.
    letters_only = re.sub('[^a-zA-Z]+', ' ', text)
    return letters_only.lower()

In [122]:
# Replace the column with its cleaned form. NOTE: this overwrites the raw
# text in place, so the original descriptions can only be recovered by
# re-reading the CSV.
idea['idea_description'] = idea['idea_description'].apply(clean_text)

In [124]:
# Spot-check the cleaned text for the first ten rows.
idea.head(10)['idea_description']

0    this action thriller movie is going to be a hit                                                                                                                         
1    distributed scalable idea sharing platform                                                                                                                              
2    anti malarial drug could be effective in covid it is not a potential drug but could be tried and tested                                                                 
3    binary search algorithm could be effective in expediating the testing of covid                                                                                          
4    israeli lab confirms breakthrough with virus antibody but says months till cure                                                                                         
5    ai as a service will allow us to simply feed in our own data and pay for the algorithms or compute resources as we use them  

In [125]:
# Print one long cleaned description in full (the entry at index 30) to
# eyeball the result of the cleaning step.
print ("idea description: " , idea['idea_description'][30])

idea description:  there are two steering mechanism generally prevalent with regards to ground vehicles differential or skid steering and ackermann steering mechanism in any case one of the greatest issues with a differential steering is that it squanders energy by sliding the wheels over the ground the ackermann steering regularly found in autos enables the wheels to turn about a similar turning focus the wheels don t slip along the side amid a turn hence no energy is squandered while turning in this project robot is designed utilizing ackermann steering mechanism in this mini robotic vehicle project 


In [126]:
def tokenize_and_stem(text):
    """Tokenize ``text`` with NLTK, Porter-stem every token, and keep only
    the stems that are longer than two characters.

    Returns the surviving stems as a list, in their original order.
    """
    words = nltk.word_tokenize(text)
    porter = nltk.stem.porter.PorterStemmer()
    kept_stems = []
    for word in words:
        stem = porter.stem(word)
        # Very short stems (one or two characters) are discarded.
        if len(stem) > 2:
            kept_stems.append(stem)
    return kept_stems

In [127]:
# Fetch the Punkt tokenizer data used by nltk.word_tokenize().
# NOTE(review): recent NLTK releases may look for the 'punkt_tab' resource
# instead — confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/surabhisinha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [128]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the TF-IDF vectorizer for the idea descriptions.
# max_df=0.5 drops terms that appear in more than 50% of the documents, so
# only the rarer, more discriminative terms are kept.
# min_df=1 keeps every remaining term: it selects the same vocabulary as the
# previous integer 0 (a vocabulary term always occurs in at least one
# document), but newer scikit-learn releases reject 0 as a document count.
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=1, max_features=200000,
                                   stop_words='english', use_idf=True,
                                   tokenizer=tokenize_and_stem)

In [129]:
# Learn the vocabulary / IDF weights and build the document-term matrix.
# Uses `tfidf_vectorizer`, the name the vectorizer is actually bound to; the
# previously referenced `abs_tfidf_vectorizer` is not defined in the visible
# notebook and raises NameError on a fresh kernel.
tfidf_weights = tfidf_vectorizer.fit_transform(idea['idea_description'])

In [130]:
# Column index -> term lookup for the TF-IDF matrix.
# Reads from `tfidf_vectorizer` (the previously referenced
# `abs_tfidf_vectorizer` is not defined in the visible notebook).
# Newer scikit-learn exposes get_feature_names_out() and has removed
# get_feature_names(); older releases only have the latter, so use whichever
# this installation provides. Either way the result is a plain list of str.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_features = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    tfidf_features = tfidf_vectorizer.get_feature_names()

In [131]:
def get_top_features(rownum, weights, features, top_k=10):
    """Return the ``top_k`` highest-weighted feature names for one row.

    Parameters
    ----------
    rownum : int
        Row of ``weights`` to inspect.
    weights : sparse matrix
        Weight matrix that supports row indexing and ``.toarray()``
        (e.g. the matrix returned by the vectorizer's ``fit_transform``).
    features : sequence
        Feature names, indexed by column position of ``weights``.
    top_k : int, default 10
        Maximum number of names to return.

    Returns
    -------
    list
        Feature names ordered from highest to lowest weight. When the row has
        fewer than ``top_k`` non-zero weights, the remaining slots are filled
        with zero-weight features in the order the sort happens to leave them.
    """
    # Densify only the requested row: converting the entire matrix just to
    # read one row costs memory proportional to rows x columns on every call.
    weight_vec = weights[rownum].toarray().ravel()
    # argsort is ascending, so reverse it before taking the first top_k.
    top_idx = np.argsort(weight_vec)[::-1][:top_k]
    return [features[i] for i in top_idx]

In [132]:
get_top_features(1, tfidf_weights, tfidf_features)

['scalabl',
 'distribut',
 'share',
 'platform',
 'idea',
 'farm',
 'far',
 'fantasi',
 'fan',
 'famous']

In [108]:

# Build model to return the 10 closest neighbors
from sklearn.neighbors import NearestNeighbors

# Create the k-NN model using k=10 (n_neighbors below).
# NOTE(review): when the query row is itself part of the fitted matrix it
# appears to come back as its own first neighbour (see the results further
# down), so a query yields the row itself plus 9 others — confirm.
nn_description = NearestNeighbors(n_neighbors=10, algorithm='auto')

# Fit the model to the TF-IDF weights matrix
nn_fitted_description = nn_description.fit(tfidf_weights)

In [109]:
def find_nearest_papers(row, kNNmodel, tfidf_weights, tfidf_features, papers):
    """Look up the nearest neighbours of one document and its top keywords.

    Parameters
    ----------
    row : int
        Row position in ``tfidf_weights`` of the document to query with.
    kNNmodel
        Fitted model exposing ``kneighbors()``.
    tfidf_weights
        Weight matrix; row ``row`` is passed to ``kNNmodel.kneighbors``.
    tfidf_features
        Feature names, forwarded to ``get_top_features``.
    papers : pandas.DataFrame
        Frame to pick the neighbour rows from. Its rows are assumed to be in
        the same order as the rows of the matrix the model was fitted on.

    Returns
    -------
    dict
        ``{'data': <neighbour rows of papers>, 'keywords': <top feature names
        of the query row>}``.
    """
    keywords = get_top_features(row, tfidf_weights, tfidf_features)
    # kneighbors() returns (distances, indices); only the indices are used.
    _, neighbor_idx = kNNmodel.kneighbors(tfidf_weights[row, :])
    positions = list(neighbor_idx[0])
    # Select from the `papers` argument rather than a global named `data`
    # (which the previous version read, ignoring its own parameter), and by
    # position: the neighbour indices are row positions in the fitted matrix,
    # not index labels.
    return {'data': papers.iloc[positions], 'keywords': keywords}

In [110]:
# Neighbours of row 1. Pass `idea`, the frame the TF-IDF matrix was built
# from; the name `data` is not defined in the visible notebook.
find_nearest_papers(1, nn_fitted_description, tfidf_weights, tfidf_features, idea)['data']

Unnamed: 0,_id,idea_description
1,5e8f0c03bab69f6010a9d926,distributed scalable idea sharing platform
145,5ed44335c5928e61279a00a3,a door step bike servicing platform and application which will use technology for the convenience of two wheeler owners by providing them a transparent connection with high quality vehicle maintenance providers the platform can provide assisted door step pick up and drop an in built inventory management system that enables reduction of waiting time smarter stock allocation an order management system etc
150,5ed44381c5928e61279a00a8,a real time car sharing app allows the user to enlist their car and put in the destination as to where they are going another user on the app who is going the same way can put in their destination and find the users with cars who are going in the same direction in the end they can split costs
7,5eca4791cafd7f1fdf4b7218,the idea to create a system that can take any kind of waste and automatically sort it into biodegradable and non biodegradable waste
89,5ed31be85f719d5b7ba4ce5b,idea is to control the dc motor speed and direction using raspberry pi you can control the speed of a fan according to the weather outside this will also improve your knowledge of robotics
134,5ed42a4ec5928e61279a0098,one of the hardest things about dating or being in a long term relationship is deciding where to go on dates build a tool that scours restaurant reviews event calendars and other data for date idea suggestions
110,5ed42759c5928e61279a0080,drones are useful in a lot of ways they can carry small packages and be controlled from a long distance drones are used a lot in making cinematic videos and photography use your ideas to build a drone using raspberry pi and python
107,5ed4272bc5928e61279a007d,a good idea project is to make a door locking system that will only open when the authorized person tries to open the door you need to implement a facial recognition system in python and then if the person exists in the database then we give him entry inside the door
147,5ed44353c5928e61279a00a5,seeing a missing person or a wanted criminals face once on the television is difficult to remember plus there is also a chance of meeting a stranger who turns out to be a criminal that you don t know about an application idea is such that the app will alert you of criminals in your area so that you can save a life as well as help in catching a lawbreaker
71,5ed315595f719d5b7ba4ce49,the idea was simple burn plant derived biofuels such as maize sugar and corn in engines then the co released would be offset by the amount of gas absorbed by the plants when they grew it appeared you could drive oil free on green gold sadly this was a cruel mirage and when we got close it became clear that the displacement impact of a biofuel boom would lead to the ploughing up of virgin habitats giant monocultures land rights disputes and the truly terrible conundrum of whether to feed the world or power the rich world s private and expanding car fleet in a dramatic fall from grace biofuels crashed and burned however proponents of second generation biofuels say it s not the idea that is at fault just the choice of biomass this time around they advocate using the whole plant and converting waste materials such as cornstalks and leftover sugar cane fibres into cellulosic ethanol that will then power our lives this is reliant on a fledgling process cracking or splitting cellulose into simple hydrocarbons in effect breaking down complex chains and liberating sugars


In [111]:

# NOTE: `id` shadows the Python builtin of the same name; the name is kept
# because a later cell reads it.
id = "5ed44335c5928e61279a00a3"
# Look the record up in `idea` (the loaded frame); the name `data` is not
# defined in the visible notebook.
idea[idea['_id']==id]

Unnamed: 0,_id,idea_description
145,5ed44335c5928e61279a00a3,a door step bike servicing platform and application which will use technology for the convenience of two wheeler owners by providing them a transparent connection with high quality vehicle maintenance providers the platform can provide assisted door step pick up and drop an in built inventory management system that enables reduction of waiting time smarter stock allocation an order management system etc


  """Entry point for launching an IPython kernel.


In [112]:
# Print the description of the record selected by `id`. Reads from `idea`
# (the loaded frame); the name `data` is not defined in the visible notebook.
print(idea[idea['_id']==id]['idea_description'])

145    a door step bike servicing platform and application which will use technology for the convenience of two wheeler owners by providing them a transparent connection with high quality vehicle maintenance providers the platform can provide assisted door step pick up and drop an in built inventory management system that enables reduction of waiting time smarter stock allocation an order management system etc 
Name: idea_description, dtype: object


In [114]:
# Neighbours and keywords for row 145. Pass `idea`, the frame the TF-IDF
# matrix was built from; the name `data` is not defined in the visible
# notebook.
nearest_data = find_nearest_papers(145, nn_fitted_description, tfidf_weights, tfidf_features, idea)

In [115]:
# Print every neighbouring description, each followed by a blank line.
for description in nearest_data['data']['idea_description']:
    print("idea_description: " + description + "\n")

idea_description: a door step bike servicing platform and application which will use technology for the convenience of two wheeler owners by providing them a transparent connection with high quality vehicle maintenance providers the platform can provide assisted door step pick up and drop an in built inventory management system that enables reduction of waiting time smarter stock allocation an order management system etc 

idea_description: here we propose a fully automatic garage door opener project system to achieve the design and fabrication of an automated garage door opener system we use a large screw with fabricated door belt pulley connecting rods fixtures mounts motor and supporting frame proposed system will use a fabricate mini door mounted on the threaded screw to efficiently transfer motor power for achieving radial motion of the door we use a pulley and belt arrangement in order to drive the screw by transferring motor power to the screw the screw rotation moves the door i