# Feature Engineering

Workflow: 1 

Goal: Load master-movies from database and create features for the algorithm.

Result: The file ```clean_feature.csv``` is created.

In [1]:
import os, sys
import time
import pandas as PD
import numpy as NP

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
os.chdir(os.getenv('PWD'))
sys.path.insert(0, os.getenv('PWD'))
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings.py")
import django
django.setup()

'app_proj.settings'

In [4]:
import movies.models.models as MD
import movies.models.analysis as NL

## Load Data

In [5]:
master_df = NL.FeatureEngineer.GetFullDataframe()
master_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17668 entries, 0 to 17667
Data columns (total 15 columns):
Movie_ID     17668 non-null int64
Title        17668 non-null object
Year         17668 non-null int64
Rating       8454 non-null object
Companies    16431 non-null object
Country      17627 non-null object
Language     17667 non-null object
RunTime      17664 non-null Int64
Crew         16375 non-null object
Cast         14064 non-null object
Genres       17554 non-null object
Budget       4782 non-null Int64
Gross        5530 non-null Int64
ScoreImdb    17211 non-null float64
VotesImdb    17211 non-null Int64
dtypes: Int64(4), float64(1), int64(2), object(8)
memory usage: 2.1+ MB


In [6]:
# divide into sub-dataframes based on engineering type

identity_df, numeric_df, onehot_df, multihot_df = NL.FeatureEngineer.GetTypeDataframes(master_df)
identity_df.columns
numeric_df.columns
onehot_df.columns
multihot_df.columns
del master_df

Index(['Movie_ID', 'Title'], dtype='object')

Index(['Year', 'RunTime', 'Budget', 'Gross', 'ScoreImdb', 'VotesImdb'], dtype='object')

Index(['Rating', 'Companies', 'Country', 'Language', 'Crew', 'Cast'], dtype='object')

Index(['Genres'], dtype='object')

In [7]:
# get the number of observations (voted movies) to scale the number of different singular values

max_companies, max_crew, max_cast = NL.FeatureEngineer.GetSingularMaximums()
max_companies, max_crew, max_cast

(330, 165, 165)

In [8]:
onehot_df = NL.FeatureEngineer.SingularColumn(onehot_df, 'Companies', max_companies)
len(onehot_df['Companies_sng'].value_counts())

322

In [9]:
onehot_df = NL.FeatureEngineer.SingularColumn(onehot_df, 'Crew', max_crew)
len(onehot_df['Crew_sng'].value_counts())

166

In [10]:
onehot_df = NL.FeatureEngineer.SingularColumn(onehot_df, 'Cast', max_cast)
len(onehot_df['Cast_sng'].value_counts())

165

In [11]:
onehot_df.columns

Index(['Rating', 'Country', 'Language', 'Companies_sng', 'Crew_sng',
       'Cast_sng'],
      dtype='object')

In [12]:
onehot_dummy_df = PD.get_dummies(onehot_df, columns=['Rating', 'Country', 'Language', 'Companies_sng',
       'Crew_sng', 'Cast_sng'], drop_first=True, dummy_na=False, )
len(onehot_dummy_df.columns)

850

In [13]:
# onehot_dummy_df = PD.get_dummies(onehot_df, columns=['Rating', 'Country', 'Language'], drop_first=True, dummy_na=False, )
# onehot_dummy_df = onehot_dummy_df.drop(columns=['Companies_sng', 'Crew_sng', 'Cast_sng'])
# len(onehot_dummy_df.columns)

In [14]:
multihot_df['Genres'].value_counts()

Documentary                           1471
Drama                                 1315
Comedy                                 928
Horror                                 452
Drama, Comedy                          322
                                      ... 
Thriller, Romance, Mystery, Comedy       1
Adventure, Sci-Fi, Action, Comedy        1
Adventure, Comedy, Drama, Fantasy        1
Mystery, Drama, Comedy, Romance          1
Thriller, Crime, Mystery, Drama          1
Name: Genres, Length: 2602, dtype: int64

In [15]:
multihot_dummy_df = NL.FeatureEngineer.MultiHotEncode(multihot_df, 'Genres')
multihot_dummy_df.columns
multihot_dummy_df

Index(['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 'Horror',
       'Music', 'Mystery', 'None', 'Romance', 'Sci-Fi', 'Sport', 'Thriller',
       'War', 'Western'],
      dtype='object')

Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,...,Horror,Music,Mystery,None,Romance,Sci-Fi,Sport,Thriller,War,Western
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,1,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17663,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17664,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
17665,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
17666,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [16]:
# merge sub-dataframes back together

feature_df = PD.concat([identity_df, numeric_df, onehot_dummy_df, multihot_dummy_df], axis=1)
feature_df[1000:1010]
feature_df.info()

Unnamed: 0,Movie_ID,Title,Year,RunTime,Budget,Gross,ScoreImdb,VotesImdb,Rating_PG,Rating_PG-13,...,Horror,Music,Mystery,None,Romance,Sci-Fi,Sport,Thriller,War,Western
1000,540548,Amazed By You,2018,116,600000.0,,5.9,48.0,0,0,...,0,0,0,0,0,0,0,0,0,0
1001,356199,Amazing Grace,2018,89,,7072258.0,7.5,2125.0,0,0,...,0,1,0,0,0,0,0,0,0,0
1002,300234,Amazonia: A Perilous Journey,2011,91,,,,,0,0,...,0,0,0,0,0,0,0,0,0,0
1003,135670,Amber Alert,2012,81,,,4.6,2725.0,0,0,...,1,0,1,0,0,0,0,1,0,0
1004,414067,Amber Alert,2016,90,,,5.7,679.0,0,0,...,0,0,0,0,0,0,0,1,0,0
1005,169730,Amber Lake,2011,81,,,5.7,381.0,0,0,...,0,0,0,0,0,0,0,1,0,0
1006,414509,Ambulance/Gaza,2017,80,,,7.5,59.0,0,1,...,0,0,0,0,0,0,0,0,0,0
1007,126891,Ambush at Tomahawk Gap,1953,73,,,5.9,367.0,0,0,...,0,0,0,0,1,0,0,0,0,1
1008,216541,Ambushed,2013,96,,,3.9,1410.0,0,0,...,0,0,0,0,0,0,0,0,0,0
1009,460228,Amelia: A Tale of Two Sisters,2017,43,,,,,0,0,...,0,0,0,0,0,0,0,0,0,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17668 entries, 0 to 17667
Columns: 879 entries, Movie_ID to Western
dtypes: Int64(4), float64(1), int64(23), object(1), uint8(850)
memory usage: 18.3+ MB


In [17]:
NL.FeatureEngineer.OutputCSV(feature_df)

## Investigate PCA

In [18]:
# PCA should be used since dataset is extremely sparse
# it has to be put into the pipeline, since it fits based on data available
# won't run with NaN's, so impute first

import sklearn.prepro cessing as PP
import sklearn.decomposition as DC
import sklearn.impute as IM

SyntaxError: invalid syntax (<ipython-input-18-2271832db417>, line 5)

In [None]:
feature_only_df = feature_df.drop(columns=['Movie_ID', 'Title'])
feature_only_df.head()

In [None]:
imputer = IM.SimpleImputer(missing_values=NP.nan, strategy='mean')
impute_np = imputer.fit_transform(feature_only_df)
impute_np.shape

In [None]:
scaler = PP.StandardScaler()
scale_np = scaler.fit_transform(impute_np)
scale_np.shape

In [None]:
pca = DC.PCA(n_components=0.95, svd_solver='full')
pca.fit(scale_np)
pca.explained_variance_ratio_

In [None]:
pca_df = pca.transform(impute_df)
pca_df.shape

In [None]:
pca_df[:10]

In [None]:
NP.round(pca.components_[0], 4)