# Feature Engineering

Workflow: 7 

Goal: Load the master movies from the database and create features for the algorithm.

Result: The file `movie_feature.csv` is created.

In [1]:
import os, sys
import time
import pandas as PD
import numpy as NP

In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one;
# later cells rely on this to show several results (e.g. four `.columns` listings) at once.
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
# Bootstrap Django so the project's models can be imported from this notebook.
# PWD is presumably the project root the notebook server was started from; fall back
# to the current working directory when the variable is not set (e.g. on Windows),
# because os.chdir(None) would raise a TypeError.
project_root = os.getenv('PWD') or os.getcwd()
os.chdir(project_root)
sys.path.insert(0, project_root)
# DJANGO_SETTINGS_MODULE must be a dotted module path, not a file name. The previous
# default "settings.py" only worked because the variable was already exported as
# 'app_proj.settings' (the value echoed below this cell); use that as the default.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "app_proj.settings")
import django
django.setup()

'app_proj.settings'

In [4]:
import movies.models.analysis as NL

## Load Data

In [5]:
# Load the master movie table into one dataframe, then list each column's dtype and
# non-null count to see where values are missing.
master_df = NL.FeatureEngineer.GetFullDataframe()
master_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319 entries, 0 to 318
Data columns (total 15 columns):
Movie_ID     319 non-null int64
Title        319 non-null object
Year         319 non-null int64
Rating       149 non-null object
Companies    298 non-null object
Country      319 non-null object
Language     319 non-null object
RunTime      319 non-null int64
Crew         301 non-null object
Cast         256 non-null object
Genres       318 non-null object
Budget       84 non-null float64
Gross        102 non-null float64
ScoreImdb    312 non-null float64
VotesImdb    312 non-null float64
dtypes: float64(4), int64(3), object(8)
memory usage: 37.5+ KB


In [6]:
# Divide the master dataframe into four sub-dataframes, one per kind of feature
# engineering: identity (kept as-is), numeric, one-hot and multi-hot.

identity_df, numeric_df, onehot_df, multihot_df = NL.FeatureEngineer.GetTypeDataframes(master_df)
# Show which columns landed in each group.
identity_df.columns
numeric_df.columns
onehot_df.columns
multihot_df.columns
# The master frame is not used again below; note that re-running this cell on its own
# will fail until the load cell has been re-run.
del master_df

Index(['Movie_ID', 'Title'], dtype='object')

Index(['Year', 'RunTime', 'Budget', 'Gross', 'ScoreImdb', 'VotesImdb'], dtype='object')

Index(['Rating', 'Companies', 'Country', 'Language', 'Crew', 'Cast'], dtype='object')

Index(['Genres'], dtype='object')

In [7]:
# Replace 'Companies' with a 'Companies_sng' column (see the column listing further down).
# NOTE(review): presumably this reduces a multi-valued field to a single value - confirm
# in FeatureEngineer.SingularColumn. Re-running this cell without re-running the split
# cell may fail because 'Companies' is no longer present.
onehot_df = NL.FeatureEngineer.SingularColumn(onehot_df, 'Companies')
# To inspect the result, run: onehot_df['Companies_sng'].value_counts()

In [8]:
# Replace 'Crew' with a 'Crew_sng' column (see the column listing further down).
# NOTE(review): presumably this reduces a multi-valued field to a single value - confirm
# in FeatureEngineer.SingularColumn.
onehot_df = NL.FeatureEngineer.SingularColumn(onehot_df, 'Crew')
# To inspect the result, run: onehot_df['Crew_sng'].value_counts()

In [9]:
# Replace 'Cast' with a 'Cast_sng' column (see the column listing further down).
# NOTE(review): presumably this reduces a multi-valued field to a single value - confirm
# in FeatureEngineer.SingularColumn.
onehot_df = NL.FeatureEngineer.SingularColumn(onehot_df, 'Cast')
# To inspect the result, run: onehot_df['Cast_sng'].value_counts()

In [10]:
# Confirm that Companies, Crew and Cast were replaced by their '*_sng' counterparts.
onehot_df.columns

Index(['Rating', 'Country', 'Language', 'Companies_sng', 'Crew_sng',
       'Cast_sng'],
      dtype='object')

In [11]:
# One-hot encode every categorical column. The first level of each column is dropped
# (it acts as the reference category) and missing values get no indicator of their own.
onehot_dummy_df = PD.get_dummies(
    onehot_df,
    columns=[
        'Rating',
        'Country',
        'Language',
        'Companies_sng',
        'Crew_sng',
        'Cast_sng',
    ],
    drop_first=True,
    dummy_na=False,
)
# Number of indicator columns produced.
onehot_dummy_df.shape[1]

97

In [12]:
# Genres are stored as one comma-separated string per movie, so counting raw values
# counts genre combinations rather than individual genres.
multihot_df['Genres'].value_counts()

Drama                      31
Documentary                26
Comedy                     16
Drama, Comedy              13
Horror                      8
                           ..
Action, Adventure           1
Fantasy, Romance, Drama     1
Documentary, Biography      1
Romance, Comedy, Drama      1
Comedy, Drama, Crime        1
Name: Genres, Length: 172, dtype: int64

In [13]:
# Expand the comma-separated 'Genres' strings into one 0/1 indicator column per genre.
# NOTE(review): the 'None' column presumably marks movies with no genre recorded - confirm
# in FeatureEngineer.MultiHotEncode.
multihot_dummy_df = NL.FeatureEngineer.MultiHotEncode(multihot_df, 'Genres')
multihot_dummy_df.columns
multihot_dummy_df

Index(['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 'Horror',
       'Music', 'Mystery', 'None', 'Romance', 'Sci-Fi', 'Sport', 'Thriller',
       'War', 'Western'],
      dtype='object')

Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,...,Horror,Music,Mystery,None,Romance,Sci-Fi,Sport,Thriller,War,Western
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
314,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
315,1,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
316,0,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
317,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [14]:
# Merge the sub-dataframes back together side by side (axis=1). The concat aligns rows
# on the index, so all four frames are assumed to still share the original row index.

feature_df = PD.concat([identity_df, numeric_df, onehot_dummy_df, multihot_dummy_df], axis=1)
# Preview the first rows, then check the overall shape and dtypes of the feature table.
feature_df.head()
feature_df.info()

Unnamed: 0,Movie_ID,Title,Year,RunTime,Budget,Gross,ScoreImdb,VotesImdb,Rating_PG,Rating_PG-13,...,Horror,Music,Mystery,None,Romance,Sci-Fi,Sport,Thriller,War,Western
0,217316,1,2013,112,,,8.0,3535.0,0,1,...,0,0,0,0,0,0,1,0,0,0
1,386666,13 Times Evil,2016,90,,,,,0,0,...,0,0,0,0,0,0,0,0,0,0
2,341957,21 Days,2014,89,,,4.5,493.0,0,0,...,1,0,0,0,0,0,0,1,0,0
3,441453,36 Hour Layover,2016,88,1000000.0,,5.8,74.0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,270802,666: Devilish Charm,2014,78,1000000.0,,3.4,110.0,0,0,...,1,0,0,0,0,0,0,1,0,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319 entries, 0 to 318
Columns: 126 entries, Movie_ID to Western
dtypes: float64(4), int64(24), object(1), uint8(97)
memory usage: 102.6+ KB


In [15]:
# Write the feature table to disk (movie_feature.csv, per the notebook header).
NL.FeatureEngineer.OutputCSV(feature_df)

In [16]:
# Create synthetic user votes (the call prints one line per vote it produces), then
# report how many vote rows are stored.
# NOTE(review): the printed sample looks randomly chosen - confirm whether a fixed seed
# is needed for reproducible runs.
import movies.models.models as MD
MD.Editor.CreateSyntheticVotes()
MD.UserVotes.objects.count()

Spike (2008) 3.2 Fantasy, Romance, Horror, Sci-Fi : 1
Uuf Kya Jadoo Mohobbat Hai (2004) 4.9 Romance, Comedy : 1
The Deal (2005) 5.0 Thriller, Drama, Crime : 1
The Pig Who Cried Werewolf (2011) 5.9 Animation, Comedy : 1
Memoir of War (2017) 6.1 Drama : 1
Appetite (1998) 4.3 Thriller, Mystery, Horror : 1
Post impact (2004) 3.2 Adventure, Sci-Fi, Action : 2
Svengali (1931) 6.8 Drama, Romance, Horror : 1
Honor and Glory (1993) 4.2 Action : 1
Chandni Bar (2001) 7.6 Crime, Drama : 1


10