<a href="https://colab.research.google.com/github/Srijan2001/NLP/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import MiniBatchKMeans 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.metrics import pairwise_distances

import nltk
import string

import matplotlib.pyplot as plt
%matplotlib inline


In [None]:
# Load the tab-separated BIO files as one text column: with the default comma
# delimiter each "label<TAB>word" line lands whole in 'Label-words' and is
# split into separate columns later by modify().
# NOTE(review): this assumes no line contains a comma or a quote character - confirm.
rtest = pd.read_csv('/content/sample_data/restauranttest.bio', names=['Label-words'])          
rtrain = pd.read_csv('/content/sample_data/restauranttrain.bio', names=['Label-words'])

In [None]:
#Original restaurant dataset

print(rtrain)
print(rtest)



             Label-words
0            B-Rating\t2
1        I-Rating\tstart
2         O\trestaurants
3                O\twith
4      B-Amenity\tinside
...                  ...
70518             O\tyou
70519            O\tlook
70520             O\tfor
70521               O\ta
70522      O\trestaurant

[70523 rows x 1 columns]
            Label-words
0                  O\ta
1        B-Rating\tfour
2        I-Rating\tstar
3         O\trestaurant
4      B-Location\twith
...                 ...
14251             O\tme
14252           O\twith
14253           O\tsome
14254     B-Dish\tonion
14255     I-Dish\trings

[14256 rows x 1 columns]


In [None]:
# Adding two new columns to the existing dataframe and dropping the original column to make data easier to read
def modify(df):
  """Split the combined 'Label-words' column into 'Label' and 'Words'.

  Every 'Label-words' value is expected to be a tab-separated "label word"
  pair. Only the first tab is treated as the separator, so a word that itself
  contains a tab stays intact; a value without any tab yields a missing
  'Words' entry instead of raising.

  The frame is modified in place (the two new columns are added and
  'Label-words' is dropped) and is also returned, so ``df = modify(df)``
  keeps working. Calling it a second time on the same frame raises KeyError
  because 'Label-words' no longer exists.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with a 'Label-words' column.

  Returns
  -------
  pandas.DataFrame
      The same frame object, now with 'Label' and 'Words' columns.
  """
  # Vectorised split: far cheaper than building one pd.Series per row.
  parts = df['Label-words'].astype(str).str.split("\t", n=1, expand=True)
  # Guarantee both result columns exist even when no value contained a tab.
  parts = parts.reindex(columns=[0, 1])

  df['Label'] = parts[0]
  df['Words'] = parts[1]
  df.drop('Label-words', axis=1, inplace=True)
  return df


In [None]:
rtrain = modify(rtrain)
rtest = modify(rtest)

In [None]:
rtrain

Unnamed: 0,Label,Words
0,B-Rating,2
1,I-Rating,start
2,O,restaurants
3,O,with
4,B-Amenity,inside
...,...,...
70518,O,you
70519,O,look
70520,O,for
70521,O,a


In [None]:
rtest

Unnamed: 0,Label,Words
0,O,a
1,B-Rating,four
2,I-Rating,star
3,O,restaurant
4,B-Location,with
...,...,...
14251,O,me
14252,O,with
14253,O,some
14254,B-Dish,onion


In [None]:
#Function for displaying the number of words associated with each label and the number of labels

def calculate(df):
  """Print how many rows carry each label and how many distinct labels exist.

  Labels are reported in order of first appearance in ``df['Label']``.
  Nothing is returned; the per-label counts (as a dict) and the number of
  distinct labels are printed.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with a 'Label' column. An empty frame prints an empty dict.
  """
  counts = {}

  for label in df['Label']:
    # A label seen for the first time starts at 1 (this row counts too);
    # starting it at 0 would under-count every label after the first.
    counts[label] = counts.get(label, 0) + 1

  print(counts)
  print('The number of distinct labels are ' + str(len(counts)))         #Number of distinct labels can be found by finding the length of the dict



In [None]:
calculate(rtrain)                 #Stats for restaurant training dataset

{'B-Rating': 1070, 'I-Rating': 526, 'O': 43667, 'B-Amenity': 2540, 'I-Amenity': 2675, 'B-Location': 3816, 'I-Location': 3657, 'B-Restaurant_Name': 1900, 'I-Restaurant_Name': 1667, 'B-Price': 729, 'B-Hours': 989, 'I-Hours': 1282, 'B-Dish': 1474, 'I-Dish': 766, 'B-Cuisine': 2838, 'I-Price': 282, 'I-Cuisine': 629}
The number of distinct labels are 17


In [None]:
calculate(rtest)                 #Stats for restaurant testing dataset

{'O': 8659, 'B-Rating': 200, 'I-Rating': 124, 'B-Location': 811, 'I-Location': 787, 'B-Amenity': 532, 'B-Cuisine': 531, 'B-Hours': 211, 'I-Hours': 294, 'B-Price': 170, 'I-Cuisine': 134, 'B-Dish': 287, 'I-Amenity': 523, 'I-Price': 65, 'B-Restaurant_Name': 401, 'I-Restaurant_Name': 391, 'I-Dish': 120}
The number of distinct labels are 17


In [None]:
def make_file(biopath, csvpath):                         #Function that reads the files and saves sentences in csv format
  """Rebuild sentences from a BIO file and save them to a CSV file.

  The BIO file holds one whitespace-separated "label word" pair per line,
  with a blank line between sentences. The second field of every non-blank
  line is appended to the current sentence; a blank line closes it. The
  sentence in progress at end of file is always written as the last row.

  The text format is kept exactly as before so existing CSVs and row numbers
  stay comparable: every sentence starts with two spaces, and a sentence with
  no words is written as a single space.

  Parameters
  ----------
  biopath : str
      Path of the BIO file to read.
  csvpath : str
      Path of the CSV to write; it gets the default integer index plus a
      'Sentences' column.

  Raises
  ------
  IndexError
      If a non-blank line has fewer than two whitespace-separated fields.
  """
  sentences = []
  word = ' '

  # The context manager closes the handle even if a malformed line raises.
  with open(biopath) as bio_file:
    for line in bio_file:
      fld = line.split()

      if fld:
        word = word + " " + fld[1]
      else:
        sentences.append(word)
        word = ' '

  sentences.append(word)

  # Build the frame once from the collected rows; DataFrame.append was
  # quadratic in a loop and no longer exists in pandas 2.x.
  pd.DataFrame({'Sentences': sentences}).to_csv(csvpath)
   

In [None]:
# Each split gets its own CSV; writing both to restauranttrain.csv would
# overwrite the train sentences with the test ones.
make_file('/content/sample_data/restauranttrain.bio', '/content/sample_data/restauranttrain.csv')
make_file('/content/sample_data/restauranttest.bio', '/content/sample_data/restauranttest.csv')

In [None]:
def display(path):                                                          #Function to display the sentence csv files
  """Print the sentence CSV stored at ``path``.

  The 'Unnamed: 0' column (the index saved alongside the sentences) is
  removed before the frame is printed. Nothing is returned.

  NOTE(review): inside a notebook this name shadows IPython's own
  ``display`` helper - confirm that is acceptable.
  """
  sentences = pd.read_csv(path).drop(columns='Unnamed: 0')
  print(sentences)


In [None]:
display('/content/sample_data/restauranttrain.csv')

                                              Sentences
0                2 start restaurants with inside dining
1                          5 star resturants in my town
2                hong kong restaurant reasonable prices
3       a great lunch spot but open till 2 a m passi...
4              a place that serves soft serve ice cream
...                                                 ...
7654    yes please locate the nearest seafood restau...
7655    yes we are looking for a formal restaurant t...
7656    yes we need a to stop at five guys for a nic...
7657    yes we need to find a cheap deli with good h...
7658              yes you can you look for a restaurant

[7659 rows x 1 columns]


In [None]:
display('/content/sample_data/restauranttest.csv')

                                              Sentences
0                     a four star restaurant with a bar
1                              any asian cuisine around
2                   any bbq places open before 5 nearby
3       any dancing establishments with reasonable p...
4              any good cheap german restaurants nearby
...                                                 ...
1516    will waffle house accept a prepaid visa gift...
1517    yes please get me mcdonalds phone number in ...
1518           yes the new diner on south street please
1519    yes we need some chicken for our new diet so...
1520              you can help me with some onion rings

[1521 rows x 1 columns]


In [None]:
def k_means(path, num, labelval):
  """Cluster the sentences of a CSV with TF-IDF + MiniBatchKMeans and print one cluster.

  Parameters
  ----------
  path : str
      CSV with an 'Unnamed: 0' column and a 'Sentences' column.
  num : int
      Number of clusters to fit.
  labelval : int
      Cluster label whose member sentences are printed.

  For every sentence assigned to ``labelval`` the sentence text is printed,
  followed by its row number. Nothing is returned.
  """
  sentences = pd.read_csv(path).drop(columns='Unnamed: 0')
  corpus = sentences['Sentences'].values

  # TF-IDF features with English stop words removed.
  tfidf = TfidfVectorizer(stop_words="english")
  features = tfidf.fit(corpus).transform(corpus)

  # Fixed random_state keeps the cluster numbering reproducible between runs.
  clusterer = MiniBatchKMeans(n_clusters=num, random_state=0)
  clusterer.fit(features)

  # Row numbers of all sentences that landed in the requested cluster.
  members = [row for row, cluster_id in enumerate(clusterer.labels_) if cluster_id == labelval]

  for row in members:
    print(sentences.at[row, 'Sentences'])
    print(row)
                                                              


In [None]:
k_means('/content/sample_data/restauranttrain.csv', 210, 63)       #Sentences having label = 63 are all talking about nearest fast food places

  can you locate the nearest fast food restaurant
870
  can you tell me where the nearest fast food restaurant is
953
  find the nearest fast food joint
2155
  how far is the nearest fast food restaurant
2487
  please give me the nearest fast food joint to me
5056
  where is the nearest fast food hamburger restaurant
7211
  where is the nearest fast food restaurant
7213
  wheres the nearest fast food
7434


In [None]:
k_means('/content/sample_data/restauranttest.csv', 120, 43 )  #Sentences having label = 43 are all talking about places with a smoking section

  are there any restaurants around with a smoking area
72
  can you find a burger joint with a smoking section
133
  can you find me chinese restaurant with a smoking section
167
  can you locate a diner that has a smoking section in this area
191
  does caribe have a smoking area
279
  does mikes cafe have a smoking section
291
  does the chinese buffet on 6 th avenue have a smoking section
311
  i need a place for smoking
582
  i want to find a restaurant with an outdoor dining section that permits smoking
662
  is there a smoking section at olive garden
872
  please find a mexican restaurant with a smoking section and more then 1 star review
1018
  please help me locate a restaurant that allows smoking
1031
  red lobster with no smoking
1043
  show me all of the local restaurants with a smoking area
1055
  what kosher restaurants still have smoking sections
1161
  where is the closest non smoking restaurant
1415
  which restaurant has a smoking section
1491


In [None]:
pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 3.4 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 18.2 MB/s 
[?25hCollecting tokenizers>=0.10.3
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 40.2 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 50.8 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2

In [None]:
def convertlist(path):                                            #Function that helps in preparing the list
  """Return the 'Sentences' column of the CSV at ``path`` as a list, in row order."""
  return pd.read_csv(path)['Sentences'].tolist()


In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
def transformer_clustering(path, num, labelval):                  #Function that performs sentence transformer based clustering
  """Cluster sentence-transformer embeddings with MiniBatchKMeans and print one cluster.

  Parameters
  ----------
  path : str
      CSV with a 'Sentences' column.
  num : int
      Number of clusters to fit.
  labelval : int
      Cluster label whose member sentences are printed.

  For every sentence assigned to ``labelval`` the sentence text is printed,
  followed by its row number. Nothing is returned.
  """
  model = SentenceTransformer('all-MiniLM-L6-v2')

  # Read the CSV once and take the sentences straight from the frame instead
  # of parsing the same file a second time.
  df = pd.read_csv(path)
  sentences = df['Sentences'].tolist()

  sentence_embeddings = model.encode(sentences)

  # Fixed random_state keeps the cluster numbering reproducible between runs.
  cls = MiniBatchKMeans(n_clusters=num, random_state=0)
  cls.fit(sentence_embeddings)

  # labels_ is aligned with the rows of df, so the position is the row number.
  for i, label in enumerate(cls.labels_):
    if label == labelval:
      print(df.at[i, 'Sentences'])
      print(i)


In [None]:
transformer_clustering('/content/sample_data/restauranttrain.csv', 210, 55)       #Sentences having label = 55 are all talking about pizza restaurants

  about how much will it cost to eat at johns pizza cafe for dinner per person
11
  call the pizza hut closest to my house that offers delivery
508
  can i get to the ashland pizza palace in 5 minutes or less
566
  can i have the phone number for dominos pizza
581
  can you dine in the bar at cjs gourmet pizza
608
  can you direct me to the nearest needham house of pizza
615
  can you get me the number of the burrito place on 52nd street
798
  can you get the phone numbers of all pizza delivery places within 10 miles
802
  can you locate the phone number for the nearest bj pizza place
872
  can you show me the locations of pizza huts in new york city
921
  could you get me the phone number for a wonderful pizza place in the area that has thin and thick crusts
1022
  directions to pizza hut
1067
  do you know if any pizza store in the financial district delivers
1167
  does bridgeside house of pizza sell subs
1299
  does garden pizza have carry out before 9 am
1337
  does the pizzeria o

In [None]:
transformer_clustering('/content/sample_data/restauranttest.csv', 120, 15)      #Sentences having label = 15 are all talking about Thai restaurants

  are there any restaurant nearby that serve thai food
71
  can you find a thai japanese fusion restaurant in town
146
  can you find me a thai restaurant that is caual
166
  can you tell me where an affordable burmese place is
207
  find a thai cuisine within 2 miles
349
  find me a cheaply priced thai restaurant
360
  find me a thai restaurant with a great rating
413
  find thai food
448
  hi i would like some thai food is there any nearby
490
  how far am i from true thai right now
508
  i need a thai place nearby can you find one
588
  i want a pad thai place around here thats open all day
624
  id really like a thai restaurant that has carryout do you know of one
714
  im hungry for thai
723
  is there a thai restaurant with a great wine list
879
  is there an restaurant in this part of town that serves thai food
898
  navigate me to a thai restaurant thats 4 stars or higher
1009
  please find me the nearest thai place
1025
  thai
1081
  what is the highest rated thai restaurant i

In [None]:
etrain = pd.read_csv('/content/sample_data/engtrain.bio', names=['Label-words'])
etest = pd.read_csv('/content/sample_data/engtest.bio', names=['Label-words'])

In [None]:
#Original movie dataset

print(etrain)
print(etest)

                   Label-words
0                      O\twhat
1                    O\tmovies
2                      O\tstar
3               B-ACTOR\tbruce
4              I-ACTOR\twillis
...                        ...
52406                    O\tim
52407               O\tlooking
52408                   O\tfor
52409                     O\ta
52410  B-RATINGS_AVERAGE\tvery

[52411 rows x 1 columns]
                Label-words
0                    O\tare
1                  O\tthere
2                    O\tany
3                   O\tgood
4         B-GENRE\tromantic
...                     ...
24681              O\tmovie
24682              O\tabout
24683  B-CHARACTER\tcaptain
24684     I-CHARACTER\tjack
24685  I-CHARACTER\tsparrow

[24686 rows x 1 columns]


In [None]:
etrain = modify(etrain)                         #Using modify function 
etest = modify(etest)

In [None]:
etrain

Unnamed: 0,Label,Words
0,O,what
1,O,movies
2,O,star
3,B-ACTOR,bruce
4,I-ACTOR,willis
...,...,...
52406,O,im
52407,O,looking
52408,O,for
52409,O,a


In [None]:
etest

Unnamed: 0,Label,Words
0,O,are
1,O,there
2,O,any
3,O,good
4,B-GENRE,romantic
...,...,...
24681,O,movie
24682,O,about
24683,B-CHARACTER,captain
24684,I-CHARACTER,jack


In [None]:
calculate(etrain)           #Stats for movie training dataset

{'O': 32700, 'B-ACTOR': 1980, 'I-ACTOR': 2126, 'B-YEAR': 1217, 'B-TITLE': 1438, 'B-GENRE': 1856, 'I-GENRE': 435, 'B-DIRECTOR': 801, 'I-DIRECTOR': 830, 'B-SONG': 224, 'I-SONG': 402, 'B-PLOT': 1088, 'I-PLOT': 1316, 'B-REVIEW': 189, 'B-CHARACTER': 340, 'I-CHARACTER': 294, 'B-RATING': 725, 'B-RATINGS_AVERAGE': 604, 'I-RATINGS_AVERAGE': 575, 'I-TITLE': 1946, 'I-RATING': 364, 'B-TRAILER': 84, 'I-TRAILER': 5, 'I-REVIEW': 122, 'I-YEAR': 726}
The number of distinct labels are 25


In [None]:
calculate(etest)            #Stats for movie testing dataset

{'O': 14929, 'B-GENRE': 1116, 'I-GENRE': 221, 'B-YEAR': 719, 'I-YEAR': 609, 'B-PLOT': 490, 'I-PLOT': 495, 'B-RATINGS_AVERAGE': 450, 'I-RATINGS_AVERAGE': 402, 'B-ACTOR': 811, 'I-ACTOR': 861, 'B-TITLE': 561, 'I-TITLE': 855, 'B-SONG': 53, 'B-CHARACTER': 89, 'B-RATING': 499, 'I-RATING': 225, 'B-REVIEW': 55, 'B-DIRECTOR': 455, 'I-DIRECTOR': 495, 'I-REVIEW': 44, 'I-SONG': 118, 'I-CHARACTER': 74, 'B-TRAILER': 29, 'I-TRAILER': 7}
The number of distinct labels are 25


In [None]:
make_file('/content/sample_data/engtrain.bio', '/content/sample_data/engtrain.csv')         #Making the csv file and saving the words       
make_file('/content/sample_data/engtest.bio', '/content/sample_data/engtest.csv')

In [None]:
display('/content/sample_data/engtrain.csv')

                                              Sentences
0                         what movies star bruce willis
1       show me films with drew barrymore from the 1...
2       what movies starred both al pacino and rober...
3       find me all of the movies that starred harol...
4       find me a movie with a quote about baseball ...
...                                                 ...
5536    im looking for a short film directed by juli...
5537    im looking for a specific unrated biography ...
5538    im looking for a thriller movie from 1960 th...
5539    im looking for a thriller that was directed ...
5540                              im looking for a very

[5541 rows x 1 columns]


In [None]:
display('/content/sample_data/engtest.csv')

                                              Sentences
0       are there any good romantic comedies out rig...
1                  show me a movie about cars that talk
2       list the five star rated movies starring mel...
3       what science fiction films have come out rec...
4       did the same director make all of the harry ...
...                                                 ...
2438                  play the trailer dark of the moon
2439                      show me the good thief review
2440                              any james bond movies
2441    find me some adventure movies with indiana j...
2442    what s the title of the movie about captain ...

[2443 rows x 1 columns]


In [None]:
k_means('/content/sample_data/engtrain.csv', 170, 34)       #Sentences having label = 34 are all talking about movies directed by Francis Ford Coppola

  did francis ford coppola direct any comedies
1150
  what movies has francis ford coppola directed
2098
  are there any musicals directed by francis ford coppola
3849
  did francis ford coppola direct a romantic comedy
4515
  did francis ford coppola direct any highly rated western films
4516
  did francis ford coppola ever direct a pg rated avant garde in the 1960 s
4517
  has francis ford coppola directed any award winning documentary films
5046
  has francis ford coppola directed any teen movies lately
5047
  has francis ford coppola directed any thriller films
5048


In [None]:
k_means('/content/sample_data/engtest.csv', 130, 61)    #Sentences having label = 61 are all talking about movie trailers

  was there a trailer for bowling for columbine
153
  show me the latest trailer for the avengers
211
  show me the trailer for rage
230
  find a trailer for space 2010
291
  where can i find a full length trailer for dark shadows
338
  play a trailer for erin brockovich
349
  show me a trailer for an alec guiness flick
385
  show me the trailer for the ryan gosling movie
399
  is the trailer for the dark knight rises out on the internet
467
  play a trailer for teen wolf
500
  what movie had the earth explode in the trailer
508
  where can i see the trailer for the new 2012 avengers movie
514
  run a trailer for an audry murphy show
524
  run a trailer for excalibur
526
  run a trailer for journey back to oz
626
  find a trailer for chariots of fire
957
  is there a trailer out for advengers yet
963
  play a trailer from the others
1008
  can u please show me the trailer of kiss me kate
2396
  can i see the trailer for kiss me kate
2397
  play the trailer
2433
  play the trailer
2434


In [None]:
transformer_clustering('/content/sample_data/engtrain.csv', 170, 33)    #Sentences having label = 33 are all talking about cartoons and animation in movies 

  find me 3d animated movies
221
  what animated fish says keep swimming all the time
368
  id like that r rated animated movie from last year
594
  what animated pg movies were released in 2010
867
  which cartoon film first won an oscars
1293
  name a pixars film that features a robot
1310
  are any r rated movies animated
1715
  what is the pixar movie about a flying house
1845
  when was the first pixar movie released
1947
  name the first cartoon film to won an oscars for best animated feature
1985
  find me a rate r animation movie
2133
  whats a comedy that uses both live action and animation
2150
  find me an animated movie from 1995
2209
  are there any r rated movies that are animated
2459
  was toy soldiers an animated movie
2576
  are there any animated movies with rabbits from the 2010s
2923
  find me animated movies that are rated r
2926
  what g rated movies are not animated
3008
  do any cartoons have good reviews
3010
  show me an animated movie about a lizard
3161
  f

In [None]:
transformer_clustering('/content/sample_data/engtest.csv', 130, 103)   #Sentences having label = 103 are talking about wedding movies

  who directed princess bride
93
  who directed the film the lorax
216
  is princess bride a good film for children
264
  how many oscars was bridesmaids nominated for
352
  the director april maiya directed many romantic wedding movies in the 2000 s
1776
  who starred in bridesmaids
2321
  who starred in dance of the dragon
2322
