In [1]:
import numpy as np
import pandas as pd
import re

## Refine The Data

Here we are mainly computing three things:
* Items
* Users 
* Interaction-Matrix

### Creation Of the Items

#### **Items** :- item_id + Metadata features

Now focussing on the items section :-

* movie_id
* title
* year(release)
* genre categories

Shifting to the item_feature dataframe:-
* overview
* language (original_language)
* runtime
* vote_average
* vote_count

### Coding  

In [2]:
# Read the raw items CSV from the data directory into a DataFrame.
items_raw = pd.read_csv(filepath_or_buffer="data/items_raw.csv")

In [3]:
# Read the additional per-item features CSV from the data directory.
item_features = pd.read_csv(filepath_or_buffer="data/item_features.csv")

1. Extract the year feature from the release_date column

In [4]:
# Parse release_date into datetimes. The `infer_datetime_format` flag is no
# longer passed: it is deprecated in pandas 2.x (format inference is the
# default there) and only triggers a warning.
items_raw["release_date"] = pd.to_datetime(items_raw["release_date"])

# Derive the release year as a string column.
# NOTE(review): rows with a missing release date presumably end up as the
# literal string "nan" here - confirm whether that is acceptable downstream.
items_raw["year"] = items_raw["release_date"].apply(lambda ts: str(ts.year))

2. Drop IMDb_URL and video_release_date (release_date is kept alongside the derived year)

In [5]:
# List the column names of items_raw (now including the derived `year`).
items_raw.columns

Index(['movie_id', 'movie_title', 'release_date', 'video_release_date',
       'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children's',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-fi', 'Thriller', 'War',
       'Western', 'year'],
      dtype='object')

In [6]:
# Build the main item table: everything from items_raw except the IMDb URL
# and the video release date.
items_main = (
    items_raw
    .drop(columns=["IMDb_URL", "video_release_date"])
    .copy()
)

In [7]:
# Preview the first rows of items_main after the column drop.
items_main.head()

Unnamed: 0,movie_id,movie_title,release_date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-fi,Thriller,War,Western,year
0,1,Toy Story (1995),1995-01-01,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,1995
1,2,GoldenEye (1995),1995-01-01,0,1,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1995
2,3,Four Rooms (1995),1995-01-01,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1995
3,4,Get Shorty (1995),1995-01-01,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1995
4,5,Copycat (1995),1995-01-01,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,1995


3. Get additional features from item_features

In [8]:
# Take an independent copy of the descriptive/rating columns plus the
# movie_id key needed to join them back onto the items.
items_addtl = item_features.loc[
    :,
    [
        "overview",
        "original_language",
        "runtime",
        "vote_average",
        "vote_count",
        "movie_id",
    ],
].copy()

In [9]:
# Display the selected feature columns together with their movie_id key.
items_addtl

Unnamed: 0,overview,original_language,runtime,vote_average,vote_count,movie_id
0,"Led by Woody, Andy's toys live happily in his ...",en,81,7.9,13765,1
1,When a powerful satellite system falls into th...,en,130,6.9,2722,2
2,It's Ted the Bellhop's first night on the job....,en,98,5.7,1912,3
3,Chili Palmer is a Miami mobster who gets sent ...,en,105,6.5,678,4
4,An agoraphobic psychologist and a female detec...,en,124,6.5,601,5
...,...,...,...,...,...,...
1732,When wistful introvert Alan Furnace meets quic...,en,94,5.9,56,1679
1733,"Gwyneth Paltrow plays London publicist Helen, ...",en,99,6.6,922,1429
1734,"Gwyneth Paltrow plays London publicist Helen, ...",en,99,6.6,922,1680
1735,"Stand up comedy by Martin Lawrence, filmed in ...",en,84,6.6,10,1681


## Merging the two Dataframes

In [9]:
# Attach the additional features to each item via a left join on movie_id,
# so every row of items_main is kept even when no features are available.
#
# NOTE(review): the items_addtl preview shows 1736 rows while the movie_id
# values shown top out near 1681, so items_addtl presumably repeats some
# movie_ids - confirm. A left join against a non-unique right key would
# duplicate those items, so only the first feature row per movie_id is used.
items = pd.merge(
    left=items_main,
    right=items_addtl.drop_duplicates(subset="movie_id", keep="first"),
    on="movie_id",
    how="left",
    validate="many_to_one",  # assert the right side is unique on the key
)

In [10]:
# Preview the first rows of the merged item table.
items.head()

Unnamed: 0,movie_id,movie_title,release_date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Sci-fi,Thriller,War,Western,year,overview,original_language,runtime,vote_average,vote_count
0,1,Toy Story (1995),1995-01-01,0,0,0,1,1,1,0,...,0,0,0,0,1995,"Led by Woody, Andy's toys live happily in his ...",en,81.0,7.9,13765.0
1,2,GoldenEye (1995),1995-01-01,0,1,1,0,0,0,0,...,0,1,0,0,1995,When a powerful satellite system falls into th...,en,130.0,6.9,2722.0
2,3,Four Rooms (1995),1995-01-01,0,0,0,0,0,0,0,...,0,1,0,0,1995,It's Ted the Bellhop's first night on the job....,en,98.0,5.7,1912.0
3,4,Get Shorty (1995),1995-01-01,0,1,0,0,0,1,0,...,0,0,0,0,1995,Chili Palmer is a Miami mobster who gets sent ...,en,105.0,6.5,678.0
4,5,Copycat (1995),1995-01-01,0,0,0,0,0,0,1,...,0,1,0,0,1995,An agoraphobic psychologist and a female detec...,en,124.0,6.5,601.0


In [11]:
# Persist the merged item table without the row index. `index` is documented
# as a boolean, so pass False explicitly instead of relying on None being falsy.
items.to_csv("data/items.csv", index=False)

### Using the NLP libraries for Understanding The Function ##

In [12]:
import spacy

In [13]:
from spacy.lang.en.examples import sentences 

In [14]:
import spacy
from spacy.lang.en.examples import sentences 

# Load the 'en_core_web_lg' pipeline and run it on the first of spaCy's
# bundled English example sentences.
nlp = spacy.load('en_core_web_lg')
doc = nlp(sentences[0])
print(doc.text)

# One line per token: text, part-of-speech tag, dependency label.
for token in doc:
    print(f"{token.text} {token.pos_} {token.dep_}")

Apple is looking at buying U.K. startup for $1 billion
Apple PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj


In [15]:
import spacy

# Reuses the `nlp` pipeline loaded in the earlier cell ('en_core_web_lg');
# as the original note says, make sure to use one of the larger models here.
tokens = nlp('dog cat banana')

# Print the similarity for every ordered pair of tokens, each token paired
# with itself included.
for token1, token2 in ((first, second) for first in tokens for second in tokens):
    print(token1.text, token2.text, token1.similarity(token2))

dog dog 1.0
dog cat 0.80168545
dog banana 0.24327648
cat dog 0.80168545
cat cat 1.0
cat banana 0.2815437
banana dog 0.24327648
banana cat 0.2815437
banana banana 1.0


In [16]:
import umap
from sklearn.datasets import load_digits

# Load scikit-learn's bundled digits dataset.
digits = load_digits()

# Embed the digit feature vectors with UMAP using its default settings.
# UMAP is stochastic, so the seed is pinned to keep the embedding
# reproducible across a fresh Restart & Run All.
embedding = umap.UMAP(random_state=42).fit_transform(digits.data)