In [1]:
import pandas as pd
import io
import random
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
import re
import string
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from rake_nltk import Rake
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import warnings
from sklearn.feature_extraction.text import HashingVectorizer
warnings.filterwarnings('ignore')

#nltk.download('popular', quiet=True) # for downloading packages
#!pip3 install rake-nltk

In [3]:
df_movies = pd.read_csv("IMDb movies.csv")
df_ratings = pd.read_csv("IMDb ratings.csv")
with open('chatbot.txt','r', encoding='utf8', errors ='ignore') as fin:
    raw = fin.read().lower()

sent_tokens = nltk.sent_tokenize(raw)# converts to list of sentences 
word_tokens = nltk.word_tokenize(raw)# converts to list of words

lemmer = WordNetLemmatizer()
def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]
remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)
def LemNormalize(text):
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

GREETING_INPUTS = ("hello", "hi", "greetings", "sup", "how are you","hey","Hi Bot", "Hi")
GREETING_RESPONSES = ["Hi", "Hey", "*nods*", "Hi there", "Hello","Hi dude, Im good", "I am glad! You are talking to me"]

def greeting(sentence):
    """If user's input is a greeting, return a greeting response"""
    for word in sentence.split():
        if word.lower() in GREETING_INPUTS:
            return random.choice(GREETING_RESPONSES)

In [4]:
# Taking a look at all the columns in the dataframe(s)
#print("Columns for Movie Details: \n",df_movies.columns)
#print("Columns for Movie Rating statistics: \n", df_ratings.columns)

# Taking a look at the Movie Details Dataframe
#df_movies.head()

# Taking a look at the Movie Rating Statistics Dataframe
#df_ratings.head()

# Removing all the unwanted columns from the two Dataframes
df_movies = df_movies[['imdb_title_id','title', 'duration', 'year', 'genre', 'language', 'actors', 'director','description']]
df_ratings = df_ratings[['imdb_title_id', 'mean_vote', 'weighted_average_vote','median_vote', 'total_votes']]

#Again Taking a look at all the columns in the dataframe(s) after dropping unwanted columns
#print("Columns for Movie Details: \n",df_movies.columns)
#print("Columns for Movie Rating statistics: \n", df_ratings.columns)

# Merging the two dataframes and dropping all the nan values
df = pd.merge(df_movies, df_ratings, on='imdb_title_id')
#print("Shape, Before dropping Nan Values: ",df.shape)
df.dropna(inplace = True)
dfm = df.copy()
#print("Shape, After dropping Nan Values: ",df.shape)

df2 = df[df['language'].str.contains(r'English')]
#print(df2.shape)
df2 = df2[(df2['mean_vote'] >= 6) & (df['total_votes'] >= 1000)] # Take all English Movies with Rating greater than 6
#print(df2.shape)
df2 = df2[df2['year'] >= 2000]
#print(df2.shape)
df2[df2['title'].str.contains('123')]

df3 = df[df['language'].str.contains(r'Tamil|Kannada|Telugu|Hindi|Malayalam')]
#df3.shape
df3 = df3[(df3['mean_vote'] >= 5) & (df3['total_votes'] >= 500)]
df3[df3['title'].str.contains('Student')]
#df3.shape


df = pd.concat([df2,df3])
df = df.apply(lambda x: x.str.lower() if(x.dtype == 'O') else x)
df.shape

dfm = dfm[(dfm['mean_vote'] >= 5) & (dfm['total_votes'] >= 500)]
dfm = dfm.sort_values(by=['mean_vote'],ascending = False)
dfm = dfm.apply(lambda x: x.str.lower() if(x.dtype == 'O') else x)
print("Normal set:",dfm.shape)
print("Custom Set: ",df.shape)

Normal set: (36075, 13)
Custom Set:  (9540, 13)


In [10]:
print("So we already have a list of some of the popular movies since the 2000s in English, Hindi, Tamil and some other Indian regional languages.")
print("Do you have any preferences?")
print("Want to make your own list?")
ch = input().lower()
if ('yes' in ch) | ('yea' in ch) | ('ya' in ch) | ('ye' in ch):
    # Accepting user input to identify similar movies of their interest
    gen = input("Cool!!\nEnter Preferred genre(s) (if more than one please use a comma)(Type No if not): ").lower()
    df2 = dfm.copy()
    if gen != 'no':
        gen = [x.strip() for x in gen.split(',')]
        df2 = dfm[dfm['genre'].str.contains(gen[0])]
        
        for l in gen[1:]:
            df2 = df2.append(dfm[dfm['genre'].str.contains(l)])
        df2 = df2.drop_duplicates(subset=['title'], keep = False)
    lang = input("Any Preferred Language(s) (if more than one please use a comma)(Type No if not): ").lower()
    df3 = df2.copy()
    if lang != 'no' :
        lang = [x.strip() for x in lang.split(',')]
        df3 = df2[df2['language'].str.contains(lang[0])]
        for l in lang[1:]:
            df3 = df3.append(df2[df2['language'].str.contains(l)])
        df3 = df3.drop_duplicates(subset = ['title'],keep=False)
else:
    df3 = df
print(df3.shape)
df3 = df3.sort_values(by=['mean_vote'],ascending = False)
if df3.shape[0] > 10000:
    df3 = df3[:10000]
if df3.shape[0] == 0:
    print("I'm sorry but you have not selected any movies. Please try again")

So we already have a list of some of the popular movies since the 2000s in English, Hindi, Tamil and some other Indian regional languages.
Do you have any preferences?
Want to make your own list?
No
(9540, 13)


## Creating Matrix to Recommend

In [15]:
df3.reset_index(drop=True,inplace=True)


df3['Key_words'] = ''
r = Rake()
for index, row in df.iterrows():
    r.extract_keywords_from_text(row['description'])
    key_words_dict_scores = r.get_word_degrees()
    row['Key_words'] = list(key_words_dict_scores.keys())
    df3['Key_words'][index] = row['Key_words']

df3['genre'] = df3['genre'].map(lambda x: x.split(','))
for index, row in df3.iterrows():
    row['genre'] = [x.lower().replace(' ','') for x in row['genre']]

df3['Bag_of_words'] = ''
columns = ['genre', 'Key_words']
for index, row in df3.iterrows():
    words = ''
    for col in columns:
        words += ' '.join(row[col]) + ' '
    row['Bag_of_words'] = words
    df3['Bag_of_words'][index] = words

In [16]:
dfn = df3[['title','Bag_of_words']]

count = CountVectorizer()
count_matrix = count.fit_transform(dfn['Bag_of_words'])
from sklearn.metrics.pairwise import cosine_similarity

def cosine_similarity_n_space(m1, m2, batch_size=10000):
    assert m1.shape[1] == m2.shape[1]
    ret = np.ndarray((m1.shape[0], m2.shape[0]))
    for row_i in range(0, int(m1.shape[0] / batch_size) + 1):
        start = row_i * batch_size
        end = min([(row_i + 1) * batch_size, m1.shape[0]])
        if end <= start:
            break # cause I'm too lazy to elegantly handle edge cases
        rows = m1[start: end]
        sim = cosine_similarity(rows, m2) # rows is O(1) size
        ret[start: end] = sim
    return ret

csmain = cosine_similarity_n_space(count_matrix, count_matrix)
indices = pd.Series(df['title'])