In [None]:
import glob
import os
import json
import re
from itertools import chain
from string import punctuation
import random

import pandas as pd
import numpy as np

In [13]:
# GDrive for GCollab
file_path = 'drive/MyDrive/Colab Notebooks/moviedata.csv'
root_path = 'drive/MyDrive/Colab Notebooks/'

# Local File System
# file_path = 'dataset/moviedata.csv'
# root_path = 'dataset/'

In [None]:
#Data Cleaning
def pre_process(text):
  text = clean_text(text)
  text = remove_spaces(text)
  return text

def remove_spaces(text):
  text = re.sub(r" ","",text)
  return text

def remove_commas(text):
  text = re.sub(r","," ",text)
  text = re.sub(r"[-{}\"#/@:;<>+=~()|.!?]","",text)
  return text

def replace_commas(text,token):
  text = re.sub(r"[-{}\"#/@:;<>+=~()|.!?]","",text)
  text = re.sub(r",",token,text)
  return text+token

def extract_year(text,token):
  text = text.lower()
  match = re.match(r'.*([1-3][0-9]{3})',text)
  if match is not None:
    return match.group(1)+token
  else:
    return ""

def clean_text(text):
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"[-{}\"#/@:;<>+=~()|.?,]", "", text) 
    return text

In [None]:
df = pd.read_csv(file_path)
df = df[['Rank','Title','Genre','Director','Actors']]
df = df.dropna()
df = df.apply(lambda x: x.astype(str).str.lower())
df.tail()

Unnamed: 0,Rank,Title,Genre,Director,Actors
995,996,secret in their eyes,"crime,drama,mystery",billy ray,"chiwetel ejiofor, nicole kidman, julia roberts..."
996,997,hostel: part ii,horror,eli roth,"lauren german, heather matarazzo, bijou philli..."
997,998,step up 2: the streets,"drama,music,romance",jon m. chu,"robert hoffman, briana evigan, cassie ventura,..."
998,999,search party,"adventure,comedy",scot armstrong,"adam pally, t.j. miller, thomas middleditch,sh..."
999,1000,nine lives,"comedy,family,fantasy",barry sonnenfeld,"kevin spacey, jennifer garner, robbie amell,ch..."


In [None]:
genres = set()
persons = set()
numbers = set()
for index, row in df.iterrows():
  entry = row['Genre']
  for e in entry.split(','):
    genres.add(e.strip())

  entry = row['Director']
  for e in entry.split(','):
    persons.add(e.strip())
    
  entry = row['Actors']
  for e in entry.split(','):
    persons.add(e.strip())
  
  entry = row['Rank']
  for e in entry.split(','):
    numbers.add(e.strip())

genres_list = list(genres)
persons_list = list(persons)
numbers_list = list(numbers)

print("Found %d genres"%len(genres_list))
print("Found %d persons"%len(persons_list))
print("Found %d genres"%len(numbers_list))

Found 20 genres
Found 2593 persons
Found 1000 genres


In [None]:
data = df.copy()
data['Genre'] = data['Genre'].map(lambda x: remove_commas(x))
data['Director'] = data['Director'].map(lambda x: remove_commas(x))
data['Actors'] = data['Actors'].map(lambda x: remove_commas(x))
data['Rank'] = data['Rank'].map(lambda x: remove_commas(x))
input_text = data['Genre'].str.cat(data[['Director','Actors','Rank']].astype(str), sep=" ")

data = df.copy()
data['Genre'] = data['Genre'].map(lambda x: replace_commas(x,'> '))
data['Director'] = data['Director'].map(lambda x: replace_commas(x,'# '))
data['Actors'] = data['Actors'].map(lambda x: replace_commas(x,'# '))
data['Rank'] = data['Rank'].map(lambda x: replace_commas(x,'+ '))
target_text = data['Genre'].str.cat(data[['Director','Actors','Rank']].astype(str), sep=" ")

data['input_text'] = input_text
data['target_text'] = target_text
data['prefix'] = 'ner'
data = data[['prefix','input_text', 'target_text']]
data.tail()

Unnamed: 0,prefix,input_text,target_text
995,ner,crime drama mystery billy ray chiwetel ejiofor...,crime> drama> mystery> billy ray# chiwetel e...
996,ner,horror eli roth lauren german heather mataraz...,horror> eli roth# lauren german# heather ma...
997,ner,drama music romance jon m chu robert hoffman ...,drama> music> romance> jon m chu# robert hof...
998,ner,adventure comedy scot armstrong adam pally tj...,adventure> comedy> scot armstrong# adam pall...
999,ner,comedy family fantasy barry sonnenfeld kevin s...,comedy> family> fantasy> barry sonnenfeld# k...


In [None]:
# Generating data for NER
# > - Genres
# # - Persons
# + - Numbers
s1="Suggest me some %s movies directed by %s and acted by %s"
s2="What happened in the year %s"

In [None]:
train_data=[]
for i,entry in enumerate(persons_list):
  actor = random.choice(persons_list)
  director = random.choice(persons_list)
  genre = random.choice(genres_list)
  train_data.append({
      'input_text' : s1%(genre,director,actor),
      'target_text': genre + '> ' + director + '# ' + actor + '# '
  })

for entry in numbers:
  train_data.append({
      'input_text' : s2%(entry),
      'target_text': entry + '+ '
  })
train_data[-1],len(train_data)

({'input_text': 'What happened in the year 571', 'target_text': '571+ '}, 3593)

In [None]:
train_df = pd.DataFrame(train_data)
train_df['prefix'] = 'ner'
train_df.append(data)
train_df.reset_index(inplace=True,drop=True)
train_df = train_df.apply(lambda x:x.astype(str).str.lower())
train_df.to_csv(root_path+"ner_data.csv")

In [None]:
train_df.tail()

Unnamed: 0,input_text,target_text,prefix
3588,what happened in the year 235,235+,ner
3589,what happened in the year 764,764+,ner
3590,what happened in the year 395,395+,ner
3591,what happened in the year 418,418+,ner
3592,what happened in the year 571,571+,ner
