In [None]:
%matplotlib inline

## 0. Imports and config

### 0.0. Imports

In [None]:
import plotly.plotly as py
import plotly.tools as tools
import plotly.figure_factory as ff
import plotly.graph_objs as go

In [None]:
from http.client import ResponseNotReady, CannotSendRequest
from xmlrpc.client import ProtocolError
import requests
from bs4 import BeautifulSoup

In [None]:
from datetime import datetime
import dateparser
import pandas as pd
import itertools
import re
import imdb
from tqdm import tqdm
from tqdm._tqdm_notebook import tqdm_notebook
import time
from pythonopensubtitles.opensubtitles import OpenSubtitles
import gzip
import shutil
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
import sys
import io
import pickle
from collections import Counter

In [None]:
from bokeh.palettes import Spectral11
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource, Range1d, LabelSet, Label, HoverTool

In [None]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import LdaMulticore
from gensim import corpora
from nltk.corpus import stopwords
from string import punctuation
import spacy

In [None]:
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM, Dropout
from keras.optimizers import RMSprop
from keras.backend import clear_session
from keras.utils.data_utils import get_file
from keras.callbacks import ModelCheckpoint
from keras_tqdm import TQDMNotebookCallback

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

### 0.1. Config

In [None]:
# Plotly credentials are read from the environment so real keys never have to be saved
# in the notebook; the original placeholder values remain as fallbacks.
tools.set_credentials_file(
    username=os.environ.get("PLOTLY_USERNAME", "USERNAME"),
    api_key=os.environ.get("PLOTLY_API_KEY", "APIKEY"),
)

In [None]:
# Load the spaCy pipeline once; `nlp` is reused further down for DATE entity
# extraction, title lemmatisation and subtitle lemmatisation.
nlp = spacy.load("en_core_web_lg")

## 1. Title Scraping

### Wikipedia

In [None]:
# Wikipedia page that is scraped for movie titles and air dates.
url = "https://en.wikipedia.org/wiki/List_of_Hallmark_Channel_Original_Movies"
# The timeout keeps the cell from hanging indefinitely; raise_for_status() stops here on
# an HTTP error instead of letting an error page be parsed further down.
r = requests.get(url, timeout=30)
r.raise_for_status()

In [None]:
# Parse the downloaded HTML with the lxml parser; `soup` is what the scraping cells query.
data = r.text
soup = BeautifulSoup(data, "lxml")

### Soup analysis

#### 2000 - 2009

In [None]:
# Headline spans whose id starts with "200"; headlines containing an en dash are left out.
headlines_200x = soup.find_all(
    "span",
    {"class": "mw-headline", "id": lambda span_id: span_id and span_id.startswith('200')},
)
years_00_09 = [span.text for span in headlines_200x if "–" not in span.text]

In [None]:
# Keep as many <ol> elements (in document order) as there are collected year headlines.
all_ordered_lists = soup.find_all("ol")
ols = list(all_ordered_lists)[:len(years_00_09)]

In [None]:
# Collect (title, parsed air date) tuples; the i-th <ol> is paired with the i-th year headline.
movies_00_09 = []

for year_list, year in zip(ols, years_00_09):
    for li in year_list.find_all("li"):
        title_tag = li.find("i")
        if title_tag is None:
            # No italic title in this entry: report it and move on instead of
            # crashing on `None.text`.
            print("no title found in:", li.text.strip())
            continue
        name = title_tag.text

        # Text nodes sitting directly inside the <li>; spaCy looks for DATE entities
        # in the first one. An entry without direct text simply yields no date.
        direct_text = li.find_all(text=True, recursive=False)
        date = []
        if direct_text:
            date = [x.text for x in nlp(str(direct_text[0])).ents if x.label_ == "DATE"]

        if len(date) == 0:
            # No date recognised: fall back to the year headline alone.
            final_date = year
        else:
            final_date = date[0]

            # some of the earlier ones miss their year
            if year not in final_date:
                final_date = final_date + ", " + year

        # format the date
        date_formatted = dateparser.parse(final_date)

        if not date_formatted:
            # Surface strings dateparser could not interpret; the movie is still kept.
            print(final_date)

        movies_00_09.append((name, date_formatted))

#### 2010 - 2018

In [None]:
# Headline spans whose id starts with "201"; headlines containing an en dash are left out.
headlines_201x = soup.find_all(
    "span",
    {"class": "mw-headline", "id": lambda span_id: span_id and span_id.startswith('201')},
)
years = [span.text for span in headlines_201x if "–" not in span.text]

In [None]:
# Collect (title, parsed air date) tuples from a fixed selection of the sortable tables.
tables = soup.find_all('table', {'class':'wikitable sortable'})

# Every second table, from position 3 up to and including 19.
index_of_interest = list(range(3, 20, 2))
tables_of_interest = [tables[idx] for idx in index_of_interest]

wanted_headers = ("Movie", "Original airdate")
movies_10_18 = []

for table in tqdm_notebook(tables_of_interest):
    headers = [th.text for th in table.find_all('th')]

    for row in table.find_all("tr"):
        cells = row.find_all('td')
        if not cells:
            # Rows without any <td> cell contribute nothing.
            continue

        # Keep the text of the cells whose position lines up with a wanted header.
        cols = [
            cell.text.strip()
            for position, cell in enumerate(cells)
            if headers[position].strip("\n") in wanted_headers
        ]

        movies_10_18.append((cols[0], dateparser.parse(cols[1])))

#### One big happy dataframe

In [None]:
# Both scraped lists hold (title, date) tuples; stack them into one two-column frame.
all_movies = movies_00_09 + movies_10_18
df_movies = pd.DataFrame({
    "title": [title for title, _ in all_movies],
    "date": [air_date for _, air_date in all_movies],
})

In [None]:
# Peek at the last rows of the combined frame.
df_movies.tail()

### How different are they

In [None]:
# Theme words to look for in titles, plus every ordered pair of two distinct keywords.
# NOTE(review): is_subset compares the pairs as sets, so (a, b) and (b, a) are equivalent there.
keywords = ["royal","christmas","wedding","prince", "love"]
permutations = list(itertools.permutations(keywords, 2))

In [None]:
def get_lemmas(title):
    """Return the lower-cased spaCy lemma of every token in `title`, in token order."""
    lemmas = []
    for token in nlp(title):
        lemmas.append(token.lemma_.lower())
    return lemmas

In [None]:
def is_subset(a,b):
    """Return True if at least one element of `a`, taken as a set, is fully contained in `b`.

    `a` is an iterable of collections (e.g. keyword pairs); `b` is the collection
    they are checked against. An empty `a` gives False.
    """
    for candidate in a:
        if set(candidate).issubset(b):
            return True
    return False

In [None]:
# Lemmatise every title once; the resulting lists feed the keyword check and the word counts.
df_movies["title_lemma"] = df_movies["title"].apply(get_lemmas)

In [None]:
# Flag titles whose lemmas contain both words of at least one keyword pair.
df_movies["a_in_b"] = df_movies.apply(lambda x: is_subset(permutations, x['title_lemma']), axis=1)

In [None]:
# Show the titles that combine two of the keywords.
df_movies[df_movies["a_in_b"] == True]

Other movie name ideas:
- a christmas prince
- a prince's wedding

### How many movies per month

In [None]:
# Separate frame indexed by air date (as a DatetimeIndex) so it can be resampled per month;
# df_movies itself is left untouched.
df_movies_date = df_movies.set_index("date")
df_movies_date.index = pd.DatetimeIndex(df_movies_date.index)

In [None]:
# Per-month counts restricted to titles containing "Christmas".
df_movies_date_count_christmas = df_movies_date[df_movies_date.title.str.contains("Christmas")].resample('M').count()

In [None]:
# Per-month counts over all movies.
df_movies_date_count = df_movies_date.resample('M').count()

In [None]:
# Only months with a year after 2009 are plotted.
monthly_recent = df_movies_date_count[df_movies_date_count.index.year > 2009]
monthly_recent_christmas = df_movies_date_count_christmas[
    df_movies_date_count_christmas.index.year > 2009
]

# Both series share the same fill/mode settings.
area_style = dict(fill='tozeroy', mode="none")

trace1 = go.Scatter(
    x=monthly_recent.index,
    y=monthly_recent.title,
    name="... in general",
    **area_style
)

trace2 = go.Scatter(
    x=monthly_recent_christmas.index,
    y=monthly_recent_christmas.title,
    name="... with 'Christmas' in title",
    **area_style
)

# Same title font on both axes.
axis_title_font = dict(size=18, color='#7f7f7f')

layout = go.Layout(
    title='Movies launched per month',
    xaxis=dict(title='Month', titlefont=axis_title_font),
    yaxis=dict(title='Title launches', titlefont=axis_title_font),
)

data = [trace1,trace2]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='td_medium_nlp_1')

### Most popular words

In [None]:
# English stop-word list. The corpus file id is lower-case "english" (the same spelling
# the LDA section uses); "English" only resolves on case-insensitive file systems.
stop_en = stopwords.words("english")

In [None]:
# Flatten all title lemmas, dropping stop words, single punctuation characters and the
# "-pron-" lemma. The exclusions are merged into one set up front so the membership
# test no longer rebuilds list(punctuation) and scans lists for every lemma.
excluded_lemmas = set(stop_en) | set(punctuation) | {"-pron-"}
lemmas = [x for sublist in df_movies.title_lemma for x in sublist if x not in excluded_lemmas]

In [None]:
# Print the 20 most frequent remaining lemmas together with their counts.
top_lemmas = Counter(lemmas).most_common(20)
for lemma, count in top_lemmas:
    print(lemma,",",count)

### Persist

In [None]:
# Checkpoint: titles, dates, lemmas and the keyword flag.
df_movies.to_pickle("df_movies_base.pickle")

In [None]:
# Reload the checkpoint written in the previous cell.
df_movies = pd.read_pickle("df_movies_base.pickle")

## 2. Subtitles scraping

### Get IMDB id for every movie

In [None]:
# IMDb client; get_imdb_id uses it for title searches.
ia = imdb.IMDb()

In [None]:
def get_imdb_id(title, year):
    """Look up the IMDb movie id for a scraped title.

    Strategy:
      * exactly one search hit: that hit is accepted as-is (no year check);
      * several hits: the first hit whose title equals `title` (case-insensitive)
        is taken, and it is only accepted if its year - when IMDb reports one -
        is within one year of `year`;
      * no hits at all: the search is retried with "[...]" parts removed from the
        title, and then with "(...)" parts removed as well.

    Parameters
    ----------
    title : str
        Title as scraped; may contain "[...]" and "(...)" suffixes.
    year : int
        Expected release year, compared against IMDb's year.

    Returns
    -------
    The IMDb movie id of the accepted hit, or None when nothing acceptable was found.
    """
    movies = ia.search_movie(title)

    if movies:
        if len(movies) == 1:
            return movies[0].movieID

        wanted = title.lower()
        for mov in movies:
            if 'title' in mov.keys() and mov["title"].lower() == wanted:
                movieyear = mov["year"] if "year" in mov.keys() else None
                if movieyear and year not in [movieyear - 1, movieyear, movieyear + 1]:
                    # Same title but the wrong year: reject.
                    return None
                return mov.movieID

        # Several hits but none with exactly this title: report "not found" as None
        # (rather than 0) so pd.isnull() filters treat the row as missing.
        return None

    # Nothing found: retry with progressively simplified titles.
    # The first pattern stops at the first closing square bracket, so text between
    # two separate bracket groups is kept.
    title_nosquare = re.sub(r'\[[^\]]*\]', '', title).strip()
    title_nonothing = re.sub(r'\([^)]*\)', '', title_nosquare).strip()

    movieid = None

    # check if squares are a match
    if title != title_nosquare:
        movieid = get_imdb_id(title_nosquare, year)

    # check if nothing is a match
    if not movieid and title != title_nonothing:
        movieid = get_imdb_id(title_nonothing, year)

    return movieid
        

In [None]:
%%time
# Smoke test on a title that carries both a parenthesised part and a bracketed part.
print(get_imdb_id("A Royal Winter (Winterfest)[229]",2017))

In [None]:
# Set up tqdm's pandas integration; the next cell relies on it for `progress_apply`.
tqdm_notebook.pandas(desc="my bar!")

In [None]:
%%time
# One IMDb search (or more, via the fallbacks) per movie.
# NOTE(review): x['date'].year assumes every row has a parsed date - confirm for rows
# where dateparser returned nothing.
df_movies["imdb_id"] = df_movies.progress_apply(lambda x: get_imdb_id(x["title"], x['date'].year), axis=1)

In [None]:
# Checkpoint: frame including the imdb_id column.
df_movies.to_pickle("df_movies_imdbid.pickle")

In [None]:
# Reload the checkpoint written in the previous cell.
df_movies = pd.read_pickle("df_movies_imdbid.pickle")

### Get download links

In [None]:
# OpenSubtitles client; used for the login and the subtitle searches below.
ost = OpenSubtitles()

In [None]:
# Log in to OpenSubtitles. Credentials are read from the environment so they never
# have to be saved in the notebook; the original placeholders remain as fallbacks.
token = ost.login(
    os.environ.get("OPENSUBTITLES_EMAIL", "EMAIL"),
    os.environ.get("OPENSUBTITLES_PASSWORD", "PASSWORD"),
)

In [None]:
# Stop here with a readable message when the login did not hand back a token string.
assert isinstance(token, str), "OpenSubtitles login failed: no session token returned"

In [None]:
def get_download_link(imdb_id, delay=10):
    """Return a download link for an English subtitle of the given movie, or None.

    Parameters
    ----------
    imdb_id
        IMDb movie id; a falsy value (None, 0, "") returns None immediately, without
        querying the service and without waiting.
    delay : float, optional
        Seconds to sleep after a search before returning (default 10, the value that
        used to be hard-coded).

    Returns
    -------
    The "SubDownloadLink" of the last search result whose "ISO639" field is 'en',
    or None when there is no such result.
    """
    if not imdb_id:
        return None

    link = None
    data = ost.search_subtitles([{'sublanguageid': 'en', 'imdbid':str(imdb_id)}])

    # `data or []` covers both an empty and a missing result list.
    for y in data or []:
        # .get() so a result without a language field is skipped instead of raising.
        if y.get("ISO639") == 'en':
            print("found english sub for movie ", str(imdb_id))
            link = y["SubDownloadLink"]

    time.sleep(delay)
    return link

In [None]:
# imdb_id -> subtitle download link, filled by the sweep loop below.
subtitle_links = {}

In [None]:
# Sweep over the movies that have an IMDb id but no link yet, and repeat until one
# complete sweep finds nothing new.
retry_messages = (
    (ResponseNotReady, "response-not-ready-error"),
    (CannotSendRequest, "cannot-send-request-error"),
    (ProtocolError, "protocol-error"),
)

i_no_new_found = 0

while i_no_new_found < 1:

    xFound = False

    has_id = ~pd.isnull(df_movies["imdb_id"])
    no_link_yet = ~df_movies["imdb_id"].isin(subtitle_links.keys())

    for i, row in tqdm_notebook(df_movies[no_link_yet & has_id].iterrows()):
        try:
            result = get_download_link(row["imdb_id"])
        except (ResponseNotReady, CannotSendRequest, ProtocolError) as e:
            # Connection-level errors: log which one, wait a little, go on with the
            # next movie. Anything else (including KeyboardInterrupt) propagates.
            message = next(text for exc_type, text in retry_messages if isinstance(e, exc_type))
            print(message, time.ctime())
            time.sleep(5)
            continue

        if result:
            subtitle_links[row["imdb_id"]] = result
            xFound = True
            i_no_new_found = 0

    if not xFound:
        i_no_new_found += 1

In [None]:
# Checkpoint: the imdb_id -> download link dictionary.
with open("ost_subtitle_links_dict.pickle",'wb') as file:
    pickle.dump(subtitle_links,file)

In [None]:
# Reload the dictionary written in the previous cell (only unpickle files you created yourself).
with open("ost_subtitle_links_dict.pickle",'rb') as file:
    subtitle_links = pickle.load(file)

In [None]:
# Attach the collected link to each movie; ids without an entry in the dict get NaN.
df_movies["downloadlink"] = df_movies["imdb_id"].map(subtitle_links)

In [None]:
# Checkpoint: frame including the downloadlink column.
df_movies.to_pickle("df_movies_links.pickle")

In [None]:
# Reload the checkpoint written in the previous cell.
df_movies = pd.read_pickle("df_movies_links.pickle")

## 3. Download subs

In [None]:
# Only movies that have a download link, ordered by IMDb id.
df_movies_sorted = df_movies[~pd.isnull(df_movies['downloadlink'])].sort_values(by="imdb_id")

In [None]:
def download_unzip_subtitle(link, name, out_dir="downloaded_subtitles"):
    """Download a gzipped subtitle and store its decompressed content as `<name>.txt`.

    Parameters
    ----------
    link : str or null
        Download URL; a null value (None / NaN) makes the call a no-op.
    name : str
        Base file name: `<name>.gz` is written to the working directory and
        `<name>.txt` to `out_dir`.
    out_dir : str, optional
        Folder for the decompressed file (default: the folder that used to be
        hard-coded). It is created when missing.

    Raises
    ------
    Any exception from the download, decompression or file writes is printed together
    with `name` and `link`, then re-raised.
    """
    if pd.isnull(link):
        return

    try:
        time.sleep(1)
        os.makedirs(out_dir, exist_ok=True)

        # Fetch first and insist on a successful response, so an HTTP error page is
        # never written to disk as if it were an archive.
        r = requests.get(link, timeout=60)
        r.raise_for_status()

        gz_name = name + '.gz'
        with open(gz_name, "wb") as f:
            f.write(r.content)

        # Decompress completely before creating the .txt: a corrupt archive then leaves
        # no empty .txt behind that would later be mistaken for a finished download.
        with gzip.open(gz_name, 'rb') as f_in:
            payload = f_in.read()

        with open(os.path.join(out_dir, name + '.txt'), 'wb') as f_out:
            f_out.write(payload)
    except Exception as e:
        print(e)
        print(name)
        print(link)
        raise

In [None]:
%%time

# Base names of the subtitles already on disk, collected once instead of re-listing the
# folder for every row. os.path.splitext removes the extension; str.strip(".txt") would
# strip the characters '.', 't' and 'x' from both ends of the name instead.
already_downloaded = {os.path.splitext(x)[0] for x in os.listdir("downloaded_subtitles")}

for i, row in tqdm_notebook(df_movies_sorted.iterrows()):
    if row["imdb_id"] not in already_downloaded:
        download_unzip_subtitle(row["downloadlink"], row["imdb_id"])
        # Remember it so a repeated imdb_id is not fetched a second time.
        already_downloaded.add(row["imdb_id"])

## 4. Process the data
We need to read in all of the downloaded subs and process them a bit.

In [None]:
# Folder holding the decompressed subtitle .txt files.
basepath = "downloaded_subtitles/"

In [None]:
# Bracketed speaker tag followed by a colon, e.g. "[Nikki]:". Written as a raw string so
# that "\[" and "\S" are not treated as (invalid) string escapes; the redundant "{1}"
# quantifiers are dropped, the pattern matches the same text as before.
re_string = re.compile(r"\[\S+\]:")

In [None]:
def process_subtitle_text(txt, speaker_tag=None):
    """Turn the raw lines of a subtitle file into one cleaned-up string.

    Per line: the newline and a leading byte-order mark are removed, backslash-escaped
    apostrophes are unescaped, surrounding spaces/hyphens and <i> tags are stripped, and
    all-caps lines are lower-cased. Lines that are only a number or that look like a time
    range ("a --> b") are dropped. A leading "..." is removed (lower-casing the first
    remaining character), trailing dots of a closing "..." are removed, and bracketed
    speaker tags such as "[Nikki]:" are deleted. Lines that end up empty are skipped.

    Parameters
    ----------
    txt : iterable of str
        The lines of one subtitle file.
    speaker_tag : compiled regex, optional
        Pattern whose matches are deleted from every line. Defaults to the
        notebook-level `re_string`.

    Returns
    -------
    str
        The kept lines joined by single spaces, with runs of spaces collapsed and no
        leading or trailing whitespace.
    """
    # Looked up at call time, so the notebook-level pattern is only needed when no
    # explicit pattern is passed.
    tag_pattern = re_string if speaker_tag is None else speaker_tag
    cleanlines = []

    for line in txt:
        # A byte-order mark on the first line would otherwise make "1" fail isdigit().
        line = line.strip("\n").lstrip("\ufeff")
        # Unescape \' (the previous replace("\'", "'") replaced "'" with itself).
        line = line.replace("\\'", "'")
        # Strips any run of spaces and hyphens from both ends (e.g. dialogue dashes).
        line = line.strip(" -")
        line = line.replace("<i>", '').replace("</i>", "")
        if line.isupper():
            line = line.lower()

        # For each line: remove if it is just a number:
        if line.isdigit():
            continue

        # Also: if it is a time range: remove it
        if len(line.split(" --> ")) == 2:
            continue

        # If not: it is a valid spoken line, but some cleaning is still needed.
        # Leading dots indicate a run-on, so drop them and the capital that follows.
        if line.startswith('...'):
            # Drop the dots and any following whitespace; slicing a fixed 4 characters
            # ate the first letter of "...and" and crashed on a bare "...".
            line = line.lstrip(".").lstrip()
            if line:
                line = line[0].lower() + line[1:]

        if line.endswith('...'):
            line = line.strip(".")

        # Remove names in between brackets followed by a colon. Example: [Nikki]:
        for ref in tag_pattern.findall(line):
            line = line.replace(ref, "")

        if line:
            cleanlines.append(line)

    # Collapse runs of spaces of any length (two chained replace("  ", " ") calls left
    # doubles behind for longer runs).
    return re.sub(r" {2,}", " ", ' '.join(cleanlines)).strip()

In [None]:
# Read every .txt file in the subtitle folder and keep its cleaned text under the file
# name without the extension. A file that fails to process is reported and skipped.
sub_dict = {}
for sub in tqdm_notebook(os.listdir(basepath)):
    if not sub.endswith(".txt"):
        continue

    movie_key = sub[:-len(".txt")]
    with open(os.path.join(basepath,sub),"r",encoding='utf8', errors="replace") as file:
        try:
            sub_dict[movie_key] = process_subtitle_text(file.readlines())
        except Exception as e:
            # KeyboardInterrupt is not an Exception subclass, so it still propagates.
            print(e)
            print(sub)
In [None]:
# Attach the cleaned subtitle text by IMDb id; movies without a processed file get NaN.
df_movies["subtitles_text"] = df_movies.imdb_id.map(sub_dict)

In [None]:
# Checkpoint: frame including the subtitles_text column.
df_movies.to_pickle("df_movies_subtitles.pickle")

In [None]:
# Reload the checkpoint written in the previous cell.
df_movies = pd.read_pickle("df_movies_subtitles.pickle")

problems:
- camelcase
- non-eng
- sometimes still 'NICK: Hey! How's it going' > remove name, 'JENNIE: I\'m sure. B'
- still lots of \x00
- sometimes some </ i> tags
- \ufeff1
- <font color="#9ae965">sync and correction by solfieri www.addic7ed.com</font>'
- remove everything between brackets? not sure

## 5. LDA

The basic idea is that a movie probably consists of a number of diverse topics. I would definitely expect things like:
- love
- weddings
- christmas
- food?

to be distinct topics.

So let's see if LDA agrees

### 5.1. Document processing

In [None]:
# English stop-word list used to filter the lemmatised subtitles.
stopwords_en = stopwords.words("english")

In [None]:
# Three parallel arrays (text, IMDb id, title) covering only the movies that have subtitles.
movies_with_subtitles = df_movies[~pd.isnull(df_movies.subtitles_text)]
subtitles_text = movies_with_subtitles["subtitles_text"].values
subtitles_imdbid = movies_with_subtitles["imdb_id"].values
subtitles_titles = movies_with_subtitles["title"].values

In [None]:
# One list of lemmas per subtitle text; tokens spaCy tags as PERSON entities are left out.
processed_docs = []
for subtitle in tqdm_notebook(subtitles_text):
    processed_docs.append(
        [token.lemma_ for token in nlp(subtitle) if token.ent_type_ != 'PERSON']
    )

In [None]:
# Checkpoint: the lemmatised documents.
with open("processed_docs_spacy.pickle","wb") as file:
    pickle.dump(processed_docs, file)

In [None]:
# Reload the lemmatised documents written in the previous cell.
with open("processed_docs_spacy.pickle","rb") as file:
    processed_docs = pickle.load(file)

In [None]:
# Per document: drop stop words and lemmas shorter than two characters.
processed_docs_filtered = [
    [lemma for lemma in doc if lemma not in stopwords_en and len(lemma) >= 2]
    for doc in tqdm_notebook(processed_docs)
]

### 5.2. Gensim LDA

#### Process docs

In [None]:
# Number of topics requested from both LDA implementations below.
num_topics = 6

In [None]:
# Token <-> id dictionary, pruned by document frequency and capped at 1500 tokens.
# NOTE(review): this is built from processed_docs, not processed_docs_filtered - confirm
# that skipping the stop-word/length filter here is intended.
dictionary = corpora.Dictionary(processed_docs)
dictionary.filter_extremes(no_below=30, no_above=0.5, keep_n=1500)

#### Check if we didn't cut out too much

In [None]:
# Spot check: did a few expected theme words survive the dictionary pruning?
for word in ("christmas", "santa", "wed", "ho", "bride", "cake", "groom", "princess"):
    print(word in dictionary.token2id)

#### Launch algo

In [None]:
# Bag-of-words representation of every document, restricted to the pruned dictionary.
# NOTE(review): uses processed_docs (unfiltered), like the dictionary above - confirm.
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [None]:
# Seeded like the scikit-learn LDA further down, so re-running the notebook does not start
# from a different random initialisation each time.
# NOTE(review): with several workers the result may still vary slightly between runs - confirm.
lda_model = LdaMulticore(bow_corpus, id2word=dictionary, passes=2, workers=2, num_topics=num_topics, random_state=0)

In [None]:
# Print every topic with its weighted top words.
all_topics = lda_model.print_topics(-1)
for topic_id, topic_words in all_topics:
    print('Topic: {} \nWords: {}'.format(topic_id, topic_words))

### 5.3. SKlearn TFIDF + LDA

#### Process docs

In [None]:
def identity_tokenizer(text):
    """Return the input unchanged; passed as `tokenizer` because the documents are already token lists."""
    return text

In [None]:
# TF-IDF over the pre-tokenised documents: the identity tokenizer and lowercase=False
# leave the lemma lists as they are.
tfidf = TfidfVectorizer(max_df=0.5, min_df=30, tokenizer=identity_tokenizer, lowercase=False)
processed_docs_tfidf = tfidf.fit_transform(processed_docs_filtered)

# Newer scikit-learn releases only provide get_feature_names_out(); older ones only
# get_feature_names(). Either way the result ends up as a plain list of strings.
if hasattr(tfidf, "get_feature_names_out"):
    tf_feature_names = tfidf.get_feature_names_out().tolist()
else:
    tf_feature_names = tfidf.get_feature_names()
processed_docs_tfidf.shape

#### Check if we didn't cut out too much

In [None]:
# Spot check: did a few expected theme words survive the TF-IDF frequency cut-offs?
for word in ("christmas", "santa", "wed", "ho", "bride", "cake", "groom", "princess"):
    print(word in tf_feature_names)

#### Launch algo

In [None]:
%%time
# scikit-learn LDA fitted on the TF-IDF matrix, with a fixed random_state.
lda = LatentDirichletAllocation(n_components=num_topics, max_iter=10, learning_method='online', learning_offset=50.,random_state=0).fit(processed_docs_tfidf)

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print, for every row of `model.components_`, a header line and its top words.

    `feature_names[i]` must be the word belonging to column i of `model.components_`;
    the `no_top_words` highest-weighted words are printed, heaviest first.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Column indices by descending weight, cut off after the requested number.
        top_indices = weights.argsort()[::-1][:no_top_words]
        top_words = [feature_names[i] for i in top_indices]
        print ("Topic %d:" % (topic_idx))
        print (" ".join(top_words))

# Show the ten heaviest words of every scikit-learn LDA topic.
no_top_words = 10
display_topics(lda, tf_feature_names, no_top_words)

Okay, so LDA didn't exactly surface the topics I was hoping for. I did a lot of playing around afterwards as well, trying different hyperparameters, preprocessing methods and topic modelling algorithms, but none of the results were really satisfactory.

## 6. Clustering

Basic idea: since topic modelling didn't really allow us to select a subset of movies, perhaps document clustering will. We will represent each document as a TF-IDF vector and perform K-Means clustering on those vectors.

In [None]:
# Number of K-Means clusters; also the number of traces in the cluster plot.
num_clusters = 7

### 6.1. K-means

In [None]:
# Seeded so the cluster ids stay the same between runs - the hand-written id -> name
# mapping further down depends on that.
# NOTE(review): the existing names were presumably picked from an unseeded run, so re-check
# them against the clusters this seed produces.
km = KMeans(n_clusters=num_clusters, max_iter=1000, random_state=0)

In [None]:
%%time 
# Cluster the TF-IDF document vectors.
km.fit(processed_docs_tfidf)

In [None]:
# One cluster id per row of the TF-IDF matrix.
clusters = km.labels_.tolist()

In [None]:
# imdb_id -> cluster id; subtitles_imdbid was taken from the same rows as the texts fed to the vectoriser.
mapping = dict(zip(subtitles_imdbid,clusters ))

In [None]:
# Attach the cluster id to each movie; ids that are not in the mapping get NaN.
df_movies["cluster"] = df_movies.imdb_id.map(mapping)

### 6.2. Visualize clusters

#### t-SNE dimensionality reduction

In [None]:
# Project the dense TF-IDF vectors to 2-D for plotting. The fixed random_state keeps the
# layout the same between runs.
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=1000, random_state=0)
tsne_results = tsne.fit_transform(processed_docs_tfidf.toarray())

#### Color dict

In [None]:
# Generate color dict

# One palette colour per document, picked by its cluster id.
spectral = Spectral11
colors = [spectral[cluster_id] for cluster_id in clusters]

#### Visualize

In [None]:
# Pair every 2-D t-SNE point with the title, cluster id and colour of its movie.
clustered_movies = df_movies[~pd.isnull(df_movies.cluster)]
data = list(zip(
    tsne_results,
    clustered_movies.title.tolist(),
    clustered_movies.cluster.tolist(),
    colors,
))

# One scatter trace per cluster, so every cluster gets its own colour and legend entry.
traces = []
for i in range(num_clusters):
    members = [entry for entry in data if entry[2] == i]
    point_slice = np.asarray([entry[0] for entry in members])
    title_slice = [entry[1] for entry in members]

    traces.append(go.Scatter(
        x=point_slice[:, 0],
        y=point_slice[:, 1],
        text=title_slice,
        hoverinfo='text',
        marker=dict(size=16, color=spectral[i], showscale=False),
        mode="markers",
        name="cluster {}".format(i),
    ))

# Same title font on both axes.
axis_title_font = dict(size=18, color='#7f7f7f')

layout = go.Layout(
    title='Clustering of Hallmark movies',
    xaxis=dict(title='X tsne value', titlefont=axis_title_font),
    yaxis=dict(title='Y tsne value', titlefont=axis_title_font),
)

fig = go.Figure(data=traces, layout=layout)
py.iplot(fig, filename='td_medium_nlp_2')

#### Cluster naming

In [None]:
# Hand-assigned names for the seven cluster ids.
# NOTE(review): the ids come from the K-Means fit above; if a re-fit numbers the clusters
# differently these names no longer line up - verify against the plot.
cluster_dict = {
    0:"wedding",
    1:"lovestory_1",
    2:"fantasy",
    3:"seasonal_love",
    4:"christmas",
    5:"lovestory_2",
    6:"food"
}

In [None]:
# Translate cluster ids into the names above; movies without a cluster get NaN.
df_movies["cluster_name"] = df_movies.cluster.map(cluster_dict)

In [None]:
# Checkpoint: frame including the cluster and cluster_name columns.
df_movies.to_pickle("df_movies_cluster.pickle")

In [None]:
# Reload the checkpoint written in the previous cell.
df_movies = pd.read_pickle("df_movies_cluster.pickle")