# Feature Engineering

Workflow: 7 

Goal: Load master-movies from database and create features for the algorithm.

Result: The file ```clean_feature.csv``` is created.

In [1]:
import os, sys
import time
import pandas as PD
import numpy as NP

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
os.chdir(os.getenv('PWD'))
sys.path.insert(0, os.getenv('PWD'))
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings.py")
import django
django.setup()

'app_proj.settings'

In [4]:
import movies.models.models as MD
import movies.models.analysis as NL

## Load Data

In [5]:
master_df = NL.FeatureEngineer.GetFullDataframe()
master_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17668 entries, 0 to 17667
Data columns (total 15 columns):
Movie_ID     17668 non-null int64
Title        17668 non-null object
Year         17668 non-null int64
Rating       8454 non-null object
Companies    16431 non-null object
Country      17627 non-null object
Language     17667 non-null object
RunTime      17664 non-null Int64
Crew         16375 non-null object
Cast         14064 non-null object
Genres       17554 non-null object
Budget       4782 non-null Int64
Gross        5530 non-null Int64
ScoreImdb    17211 non-null float64
VotesImdb    17211 non-null Int64
dtypes: Int64(4), float64(1), int64(2), object(8)
memory usage: 2.1+ MB


In [6]:
# divide into sub-dataframes based on engineering type

identity_df, numeric_df, onehot_df, multihot_df = NL.FeatureEngineer.GetTypeDataframes(master_df)
identity_df.columns
numeric_df.columns
onehot_df.columns
multihot_df.columns
del master_df

Index(['Movie_ID', 'Title'], dtype='object')

Index(['Year', 'RunTime', 'Budget', 'Gross', 'ScoreImdb', 'VotesImdb'], dtype='object')

Index(['Rating', 'Companies', 'Country', 'Language', 'Crew', 'Cast'], dtype='object')

Index(['Genres'], dtype='object')

In [7]:
# get the number of observations (voted movies) to scale the number of different singular values

max_companies, max_crew, max_cast = NL.FeatureEngineer.GetSingularMaximums()
max_companies, max_crew, max_cast

(200, 100, 100)

In [8]:
onehot_df = NL.FeatureEngineer.SingularColumn(onehot_df, 'Companies', max_companies)
len(onehot_df['Companies_sng'].value_counts())

197

In [9]:
onehot_df = NL.FeatureEngineer.SingularColumn(onehot_df, 'Crew', max_crew)
len(onehot_df['Crew_sng'].value_counts())

101

In [10]:
onehot_df = NL.FeatureEngineer.SingularColumn(onehot_df, 'Cast', max_cast)
len(onehot_df['Cast_sng'].value_counts())

101

In [11]:
onehot_df.columns

Index(['Rating', 'Country', 'Language', 'Companies_sng', 'Crew_sng',
       'Cast_sng'],
      dtype='object')

In [12]:
onehot_dummy_df = PD.get_dummies(onehot_df, columns=['Rating', 'Country', 'Language', 'Companies_sng',
       'Crew_sng', 'Cast_sng'], drop_first=True, dummy_na=False, )
len(onehot_dummy_df.columns)

596

In [13]:
multihot_df['Genres'].value_counts()

Documentary                          1471
Drama                                1315
Comedy                                928
Horror                                452
Horror, Thriller                      358
                                     ... 
Drama, Adventure, History, Action       1
Comedy, Horror, Mystery                 1
Comedy, Horror, Romance, Drama          1
Romance, Action, Comedy, Drama          1
Western, Comedy, Adventure              1
Name: Genres, Length: 2618, dtype: int64

In [14]:
multihot_dummy_df = NL.FeatureEngineer.MultiHotEncode(multihot_df, 'Genres')
multihot_dummy_df.columns
multihot_dummy_df

Index(['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 'Horror',
       'Music', 'Mystery', 'None', 'Romance', 'Sci-Fi', 'Sport', 'Thriller',
       'War', 'Western'],
      dtype='object')

Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,...,Horror,Music,Mystery,None,Romance,Sci-Fi,Sport,Thriller,War,Western
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,1,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17663,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
17664,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
17665,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
17666,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# merge sub-dataframes back together

feature_df = PD.concat([identity_df, numeric_df, onehot_dummy_df, multihot_dummy_df], axis=1)
feature_df[1000:1010]
feature_df.info()

Unnamed: 0,Movie_ID,Title,Year,RunTime,Budget,Gross,ScoreImdb,VotesImdb,Rating_PG,Rating_PG-13,...,Horror,Music,Mystery,None,Romance,Sci-Fi,Sport,Thriller,War,Western
1000,283311,Amit Sahni Ki List,2014,110,,,5.5,329,0,0,...,0,0,0,0,1,0,0,0,0,0
1001,41671,Amityville 1992: It's About Time,1992,95,,,4.5,2107,0,0,...,1,0,0,0,0,0,0,0,0,0
1002,33519,Amityville: A New Generation,1993,91,1500000.0,,3.7,1535,0,0,...,1,0,0,0,0,0,0,1,0,0
1003,16235,Amityville II: The Possession,1982,104,5000000.0,12534817.0,5.5,9451,0,0,...,1,0,0,0,0,0,0,0,0,0
1004,225866,Amnesiac,2013,90,,,2.5,109,0,0,...,1,0,0,0,0,0,0,1,0,0
1005,351043,Amnesiac,2015,90,3000000.0,61192.0,4.3,3230,0,0,...,1,0,1,0,0,0,0,1,0,0
1006,169904,Amnesty,2012,83,,,6.3,182,0,0,...,0,0,0,0,0,0,0,0,0,0
1007,160862,A Model Daughter: The Killing of Caroline Byrne,2009,94,,,6.4,313,0,0,...,0,0,0,0,0,0,0,0,0,0
1008,73107,Among Dead Men,2008,94,,,4.0,120,0,0,...,1,0,0,0,0,0,0,1,0,0
1009,274480,Among Ravens,2014,103,,,4.6,319,0,0,...,0,0,0,0,0,0,0,0,0,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17668 entries, 0 to 17667
Columns: 625 entries, Movie_ID to Western
dtypes: Int64(4), float64(1), int64(23), object(1), uint8(596)
memory usage: 14.0+ MB


In [16]:
NL.FeatureEngineer.OutputCSV(feature_df)

## Investigate PCA

In [21]:
# PCA should be used since dataset is extremely sparse
# it has to be put into the pipeline, since it fits based on data available
# won't run with NaN's, so impute first

import sklearn.decomposition as DC
import sklearn.impute as IM

In [22]:
feature_only_df = feature_df.drop(columns=['Movie_ID', 'Title'])

In [27]:
imputer = IM.SimpleImputer(missing_values=NP.nan, strategy='mean')
impute_df = imputer.fit_transform(feature_only_df)
impute_df.shape

(17668, 623)

In [37]:
pca = DC.PCA(n_components=0.999, svd_solver='full')
pca.fit(impute_df)
pca.explained_variance_ratio_

PCA(copy=True, iterated_power='auto', n_components=0.999, random_state=None,
    svd_solver='full', tol=0.0, whiten=False)

array([0.96749529, 0.03250402])

In [38]:
pca_df = pca.transform(impute_df)
pca_df.shape

(17668, 2)