In [2]:
import pandas as pd
import io
import random
import time
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
import re
import string
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from rake_nltk import Rake
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import warnings
from sklearn.feature_extraction.text import HashingVectorizer
# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None
# Suppress every Python warning raised from here on.
# NOTE(review): this also hides deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')

#nltk.download('popular', quiet=True) # for downloading packages
#!pip3 install rake-nltk


#print("\nROBO: So you want to see a new movie now right?")

# Movie metadata and per-title rating statistics, read in that order.
df_movies, df_ratings = (
    pd.read_csv(csv_name) for csv_name in ("IMDb movies.csv", "IMDb ratings.csv")
)

# Chatbot corpus: read in full, undecodable bytes skipped, everything lower-cased.
with open('chatbot.txt', mode='r', encoding='utf8', errors='ignore') as fin:
    raw = fin.read().lower()

# Sentence-level and word-level tokenisations of the corpus.
sent_tokens, word_tokens = nltk.sent_tokenize(raw), nltk.word_tokenize(raw)

lemmer = WordNetLemmatizer()
def LemTokens(tokens):
    """Return a list with the WordNet lemma of every token, in input order."""
    return list(map(lemmer.lemmatize, tokens))
# Translation table mapping every ASCII punctuation code point to None (i.e. delete it).
remove_punct_dict = str.maketrans('', '', string.punctuation)
def LemNormalize(text):
    """Lower-case ``text``, strip punctuation, tokenise it and lemmatise each token."""
    lowered = text.lower()
    without_punct = lowered.translate(remove_punct_dict)
    word_list = nltk.word_tokenize(without_punct)
    return LemTokens(word_list)

GREETING_INPUTS = ("hello", "hi", "greetings", "sup", "how are you","hey","Hi Bot", "Hi")
GREETING_RESPONSES = ["Hi", "Hey", "*nods*", "Hi there", "Hello","Hi dude, Im good", "I am glad! You are talking to me"]

def greeting(sentence):
    """If user's input is a greeting, return a greeting response.

    Matching is case-insensitive, ignores punctuation attached to a word
    ("hi!" counts as "hi") and works on whole words only, so "this" does not
    match "hi". Multi-word entries of GREETING_INPUTS such as "how are you"
    are matched as phrases; comparing them against single split words, as
    was done before, could never succeed.

    Parameters
    ----------
    sentence : str
        Raw text typed by the user.

    Returns
    -------
    str or None
        A random element of GREETING_RESPONSES when a greeting is found,
        otherwise None.
    """
    # Normalise the input to single-space separated, punctuation-trimmed,
    # lower-case words, padded so that phrase checks respect word boundaries.
    words = [word.strip(string.punctuation) for word in sentence.lower().split()]
    padded = ' ' + ' '.join(word for word in words if word) + ' '

    for phrase in GREETING_INPUTS:
        # Normalise the configured phrase the same way (case and spacing).
        key = ' '.join(phrase.lower().split())
        if key and (' ' + key + ' ') in padded:
            return random.choice(GREETING_RESPONSES)
    return None
# Keep only the columns used later, then join metadata and rating statistics on
# the IMDb title id and discard rows with any missing value.
df_movies = df_movies[['imdb_title_id','title', 'duration', 'year', 'genre', 'language', 'actors', 'director','description']]
df_ratings = df_ratings[['imdb_title_id', 'mean_vote', 'weighted_average_vote','median_vote', 'total_votes']]
df = pd.merge(df_movies, df_ratings, on='imdb_title_id').dropna()

# English-language titles: mean vote of at least 6 with at least 2000 votes,
# released in 1970 or later. Both halves of the mask are built from df2 itself
# so that the mask and the frame share one index; the previous version took the
# vote count from the unfiltered frame and relied on implicit index alignment.
# The year filter stays a separate, later step exactly as before.
df2 = df[df['language'].str.contains(r'English')]
df2 = df2.loc[(df2['mean_vote'] >= 6) & (df2['total_votes'] >= 2000)]
df2 = df2[df2['year'] >= 1970]

# Titles in the listed Indian languages: mean vote of at least 5 with at least 500 votes.
df3 = df[df['language'].str.contains(r'Tamil|Kannada|Telugu|Hindi|Malayalam')]
df3 = df3[(df3['mean_vote'] >= 5) & (df3['total_votes'] >= 500)]

# Combine both selections, lower-case every object (string) column, and keep
# the last row for each (title, year) pair. Rows can appear in both selections
# when the language field lists English and one of the other languages.
df = pd.concat([df2,df3])
df = df.apply(lambda x: x.str.lower() if(x.dtype == 'O') else x)
df = df.drop_duplicates(subset=['title','year'], keep = 'last').reset_index(drop=True)
df.shape


(11527, 13)

In [3]:
r = Rake()

def _extract_key_words(description):
    """Return the words RAKE scores for one description, as a list."""
    r.extract_keywords_from_text(description)
    return list(r.get_word_degrees().keys())

# One list of keywords per title. Assigning the whole column at once avoids the
# chained `df[col][index] = value` writes, which do not write through to the
# frame when pandas copy-on-write is active.
df['Key_words'] = df['description'].map(_extract_key_words)

# Split the comma-separated genre string and normalise each entry (lower-case,
# no spaces). This is done inside the column assignment because rows yielded by
# `iterrows()` are copies: the previous per-row loop never changed the frame.
df['genre'] = df['genre'].map(lambda x: [g.lower().replace(' ', '') for g in x.split(',')])

# Bag of words: the tokens of every listed column joined by single spaces, each
# column's tokens followed by one trailing space (same layout as before).
columns = ['Key_words', 'genre']
df['Bag_of_words'] = [
    ''.join(' '.join(tokens) + ' ' for tokens in row_parts)
    for row_parts in zip(*(df[col] for col in columns))
]
dfn = df[['title','Bag_of_words']]

In [4]:
def cosine_similarity_n_space(m1, m2, batch_size=10000):
    """Cosine similarity of every row of ``m1`` against every row of ``m2``.

    The rows of ``m1`` are processed ``batch_size`` at a time, so each call to
    ``cosine_similarity`` only sees one slice of ``m1``; the full result is
    still assembled in a single dense array.

    Parameters
    ----------
    m1, m2 : 2-D array-like or sparse matrix
        Must have the same number of columns.
    batch_size : int, default 10000
        Number of ``m1`` rows handled per batch. Must be at least 1.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(m1.shape[0], m2.shape[0])``.

    Raises
    ------
    AssertionError
        If the column counts of ``m1`` and ``m2`` differ.
    ValueError
        If ``batch_size`` is smaller than 1. Previously a negative value
        skipped the loop and returned uninitialised memory.
    """
    assert m1.shape[1] == m2.shape[1], "m1 and m2 must have the same number of columns"
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    n_rows = m1.shape[0]
    # Every row is written by the loop below, so no zero-fill is needed.
    ret = np.empty((n_rows, m2.shape[0]))
    for start in range(0, n_rows, batch_size):
        end = min(start + batch_size, n_rows)
        ret[start:end] = cosine_similarity(m1[start:end], m2)
    return ret

In [5]:
# Turn each title's bag of words into a row of token counts
# (CountVectorizer with its default settings).
count = CountVectorizer()
count_matrix = count.fit_transform(dfn['Bag_of_words'])
# Similarity of every title against every other title. The helper returns a
# dense (n_titles x n_titles) array, so memory grows quadratically with the
# number of titles in dfn.
csmain = cosine_similarity_n_space(count_matrix, count_matrix)

In [6]:
indices = pd.Series(dfn['title'])
def recommend(title, num=10, cosine_sim = csmain):
    """Print the ``num`` titles most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up; compared for exact equality against ``indices``.
    num : int, default 10
        How many similar titles to list. Values below 1 list nothing.
    cosine_sim : 2-D array
        Pairwise similarity matrix whose row/column ``i`` corresponds to row
        ``i`` of ``dfn``.

    Returns
    -------
    bool
        ``False`` when the title was found and the list was printed,
        ``True`` when it was not found (callers use this as a retry flag).
    """
    matches = indices[indices == title].index
    if len(matches) == 0:
        # Only the "unknown title" case is handled here; any other error is
        # allowed to surface instead of being reported as a spelling problem.
        print("ROBO: I'm sorry but I could not find such a movie in our Database.")
        print("ROBO: I'd recommend you to check the spelling of the movie you entered.")
        print("ROBO: Also make sure it belongs to the same genre and language you had entered before.")
        return True

    idx = matches[0]
    # Rank all other titles by similarity. The queried title is removed by
    # label rather than by assuming it sorts first: another title with an
    # identical bag of words also scores 1.0 and could take the first slot.
    # A stable sort keeps the order of tied scores deterministic.
    score_series = (
        pd.Series(cosine_sim[idx])
        .drop(idx)
        .sort_values(ascending=False, kind='mergesort')
    )
    top_indices = score_series.index[:max(num, 0)]

    # Materialise the title column once instead of once per recommendation.
    all_titles = dfn['title'].to_numpy()
    recommended_movies = [all_titles[i] for i in top_indices]

    print("\nGreat Choice. Here is the list of similar movies:")
    for rank, movie in enumerate(recommended_movies, start=1):
        print(rank, movie.title())
    return False

In [7]:
# Labelled reviews: the `review` column holds the text, `sentiment` the class label.
data = pd.read_csv("labeledTrainData.tsv",sep="\t")
X = data.review
y = data.sentiment

# Using CountVectorizer to convert text into tokens/features:
# English stop words removed, unigrams only, tokens found in more than 80% of
# the documents or in fewer than 4 documents are dropped.
vect = CountVectorizer(stop_words='english', ngram_range = (1,1), max_df = .80, min_df = 4)
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1, test_size= 0.2)

# The vocabulary is learnt from the training split only; both splits are then
# converted to document-term count matrices with that vocabulary.
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_test_dtm = vect.transform(X_test)

NB = MultinomialNB()
NB.fit(X_train_dtm, y_train)
y_pred = NB.predict(X_test_dtm)
#print('\nNaive Bayes')
#print('Accuracy Score: ',metrics.accuracy_score(y_test,y_pred)*100,'%',sep='')
#print('Confusion Matrix: ',metrics.confusion_matrix(y_test,y_pred), sep = '\n')

# `get_feature_names` was removed from scikit-learn in favour of
# `get_feature_names_out`; prefer the new method and fall back to the old one
# so the cell runs on either side of that change.
if hasattr(vect, 'get_feature_names_out'):
    tokens_words = vect.get_feature_names_out()
else:
    tokens_words = vect.get_feature_names()
#print('\nAnalysis')
#print('No. of tokens: ',len(tokens_words))

# Per-class token counts seen during fitting: row 0 is used as the negative
# class and row 1 as the positive class.
counts = NB.feature_count_
df_table = {'Token':tokens_words,'Negative': counts[0,:],'Positive': counts[1,:]}
tokens = pd.DataFrame(df_table, columns= ['Token','Positive','Negative'])
# Number of tokens counted more often in positive than in negative reviews.
positives = len(tokens[tokens['Positive']>tokens['Negative']])
#print('No. of positive tokens: ',positives)
#print('No. of negative tokens: ',len(tokens_words)-positives)
#Check positivity/negativity of specific tokens
token_search = ['horrendous']
#print('\nSearch Results for token/s:',token_search)
#print(tokens.loc[tokens['Token'].isin(token_search)])
#Analyse False Negatives (Actual: 1; Predicted: 0)(Predicted negative review for a positive review)
#print(X_test[ y_pred < y_test ])
#Analyse False Positives (Actual: 0; Predicted: 1)(Predicted positive review for a negative review)
#print(X_test[ y_pred > y_test ])

In [None]:
def _ask_for_count(prompt):
    """Prompt until the user types a whole number and return it as an int."""
    while True:
        try:
            return int(input(prompt))
        except ValueError:
            # A non-numeric answer used to abort the whole chat loop.
            print("ROBO: Please enter a whole number, for example 5.")

flag=True
print("ROBO: Hi there")
print("ROBO: My name is Robo. Type 'help' for guidance, 'review' to review a movie and let others know your opinion, 'recommend' so we can recommend you a movie based on your preference :)")
print("ROBO: Type Quit to exit.")

# The review classifier is fitted on the full labelled dataset the first time
# 'review' is requested and reused afterwards; the training data does not
# change inside the loop, so re-fitting for every review was wasted work.
trainingVector = None
NB_complete = None
tags = ['Negative','Positive']

while flag:
    print("\nROBO: Enter Recommend or Review or Quit[to quit lol :)]")
    # Surrounding whitespace is ignored so that e.g. "quit " is still understood.
    user_response = input().strip().lower()

    if user_response == 'recommend':
        # recommend() returns True while the title is unknown, so keep asking.
        ans = True
        while ans:
            print("ROBO: Please refer to Imdb for the exact movie Name.")
            user_res = input("Enter the movie which you have in mind.\n").strip().lower()
            num = _ask_for_count("How many such similar movies do you want??\n")
            ans = recommend(user_res,num)
        print("\nROBO: I think you will love to watch these movies.")
        print("ROBO: Do get back to us regarding your views on these films!!")

    elif user_response == 'review':
        print("ROBO: Alright...")
        print("ROBO: Give me a moment")
        if NB_complete is None:
            trainingVector = CountVectorizer(stop_words='english', ngram_range = (1,1), max_df = .80, min_df = 5)
            X_dtm = trainingVector.fit_transform(X)
            NB_complete = MultinomialNB()
            NB_complete.fit(X_dtm, y)
        #Input Review
        print('\nTest a custom review message')
        mov_str = input("Enter the Movie Name.\n").strip().lower()
        print('Enter review to be analysed: ', end=" ")
        test = [input()]
        test_dtm = trainingVector.transform(test)
        predLabel = NB_complete.predict(test_dtm)
        predicted_tag = tags[predLabel[0]]
        #Display Output
        print('\nThe review is predicted:',predicted_tag)

        if predicted_tag == 'Negative':
            print("\nROBO: Ohhh!! I don't think you liked that movie.")
            print("\nROBO: We'll take a note of that. Thanks for contributing to our database")
        else:
            print("\nROBO: Ohhh!! That's great I think you could take a look at our Movie Recommender to watch Similar movies.")
            print("ROBO: We shall search our database to suggest similar movies to the one you have watched")
            print("ROBO: Give me a second.....")
            time.sleep(1)
            # An unknown title is already reported by recommend() itself.
            ans = recommend(mov_str,10)

    elif user_response == 'quit':
        # 'quit' is not one of the greeting inputs, so testing it before the
        # greeting lookup does not change which branch is taken.
        print("ROBO: Thanks for calling me")
        print('ROBO: Bye! Have a good day')
        flag = False

    else:
        # Look the greeting up once; the reply is chosen at random per call.
        reply = greeting(user_response)
        if reply is not None:
            print("ROBO: "+reply)
        else:
            print("Ohh I'm sorry I didnt get you. Please enter something valid")