# Feature Engineering

Workflow: 1 

Goal: Load master-movies from database and create features for the algorithm.

Result: The file ```clean_feature.csv``` is created.

In [1]:
import os, sys
import time
import pandas as PD
import numpy as NP

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
os.chdir(os.getenv('PWD'))
sys.path.insert(0, os.getenv('PWD'))
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings.py")
import django
django.setup()

'app_proj.settings'

In [4]:
import recommend.models.analysis as NL

In [5]:
NL.FeatureEngineer.RunFeatures()

## Load Data

In [6]:
master_df = NL.Feature Engineer.GetFullDataframe()
master_df.info()

SyntaxError: invalid syntax (<ipython-input-6-a70f315a57c3>, line 1)

In [None]:
# divide into sub-dataframes based on engineering type

identity_df, numeric_df, onehot_df, multihot_df = NL.FeatureEngineer.GetTypeDataframes(master_df)
identity_df.columns
numeric_df.columns
onehot_df.columns
multihot_df.columns
del master_df

In [None]:
# get the number of observations (voted movies) to scale the number of different singular values

max_companies, max_crew, max_cast = NL.FeatureEngineer.GetSingularMaximums()
max_companies, max_crew, max_cast

In [None]:
onehot_df = NL.FeatureEngineer.SingularColumn(onehot_df, 'Companies', max_companies)
len(onehot_df['Companies_sng'].value_counts())

In [None]:
onehot_df = NL.FeatureEngineer.SingularColumn(onehot_df, 'Crew', max_crew)
len(onehot_df['Crew_sng'].value_counts())

In [None]:
onehot_df = NL.FeatureEngineer.SingularColumn(onehot_df, 'Cast', max_cast)
len(onehot_df['Cast_sng'].value_counts())

In [None]:
onehot_df.columns

In [None]:
onehot_dummy_df = PD.get_dummies(onehot_df, columns=['Rating', 'Country', 'Language', 'Companies_sng',
       'Crew_sng', 'Cast_sng'], drop_first=True, dummy_na=False, )
len(onehot_dummy_df.columns)

In [None]:
# onehot_dummy_df = PD.get_dummies(onehot_df, columns=['Rating', 'Country', 'Language'], drop_first=True, dummy_na=False, )
# onehot_dummy_df = onehot_dummy_df.drop(columns=['Companies_sng', 'Crew_sng', 'Cast_sng'])
# len(onehot_dummy_df.columns)

In [None]:
multihot_df['Genres'].value_counts()

In [None]:
multihot_dummy_df = NL.FeatureEngineer.MultiHotEncode(multihot_df, 'Genres')
multihot_dummy_df.columns
multihot_dummy_df

In [None]:
# merge sub-dataframes back together

feature_df = PD.concat([identity_df, numeric_df, onehot_dummy_df, multihot_dummy_df], axis=1)
feature_df[1000:1010]
feature_df.info()

In [None]:
NL.FeatureEngineer.OutputCSV(feature_df)

## Investigate PCA

In [None]:
# PCA should be used since dataset is extremely sparse
# it has to be put into the pipeline, since it fits based on data available
# won't run with NaN's, so impute first

import sklearn.prepro cessing as PP
import sklearn.decomposition as DC
import sklearn.impute as IM

In [None]:
feature_only_df = feature_df.drop(columns=['Movie_ID', 'Title'])
feature_only_df.head()

In [None]:
imputer = IM.SimpleImputer(missing_values=NP.nan, strategy='mean')
impute_np = imputer.fit_transform(feature_only_df)
impute_np.shape

In [None]:
scaler = PP.StandardScaler()
scale_np = scaler.fit_transform(impute_np)
scale_np.shape

In [None]:
pca = DC.PCA(n_components=0.95, svd_solver='full')
pca.fit(scale_np)
pca.explained_variance_ratio_

In [None]:
pca_df = pca.transform(impute_df)
pca_df.shape

In [None]:
pca_df[:10]

In [None]:
NP.round(pca.components_[0], 4)