In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.layers import Input, Dense
from keras.models import Model
from sklearn.preprocessing import MinMaxScaler

Using TensorFlow backend.


In [2]:
%%time
# Load the film table; the 'id_of_the_film' column becomes the row index.
df = pd.read_csv(
    'genre_300_sample_plot.csv',
    index_col='id_of_the_film',
)

Wall time: 1min 44s


In [3]:
def split_features_label(data_df):
    """Separate a film table into feature columns and the genre label.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Table that contains a 'genre_of_the_film' column.

    Returns
    -------
    tuple
        ``(features, label)``: a copy of ``data_df`` without the
        'genre_of_the_film' column, and that column on its own.
    """
    label_column = 'genre_of_the_film'
    features = data_df.drop(columns=[label_column])
    label = data_df[label_column]
    return features, label

In [4]:
def scale_split(df):
    """Split a film table 80/20 and min-max scale its feature columns.

    Side effects: the unscaled train and test partitions (features plus the
    label column) are written to "movies_for_svd_train.csv" and
    "movies_for_svd_test.csv" in the current working directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Table accepted by ``split_features_label``.

    Returns
    -------
    tuple
        ``(X_train_sc, X_test_sc, y_train, y_test)`` where the two feature
        frames are scaled and keep the column names and row index of the
        corresponding unscaled partition.
    """
    features, labels = split_features_label(df)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, train_size=0.8, random_state=42
    )

    # Save each unscaled partition: feature columns followed by the label.
    for raw_features, raw_labels, csv_path in (
        (X_train, y_train, "movies_for_svd_train.csv"),
        (X_test, y_test, "movies_for_svd_test.csv"),
    ):
        pd.concat([raw_features, raw_labels], axis=1).to_csv(
            csv_path, index_label='id_of_the_film'
        )

    # The scaler is fitted on the training rows only and then applied to both
    # partitions, so the test rows do not influence the scaling.
    scaler = MinMaxScaler().fit(X_train)

    def _scaled_frame(part):
        # Re-wrap the transformed values with the original labels and index.
        return pd.DataFrame(
            scaler.transform(part), columns=part.columns, index=part.index
        )

    return _scaled_frame(X_train), _scaled_frame(X_test), y_train, y_test

In [5]:
def build_encoder_films(encoding_dim, X_tr, X_te, hidden_dim=5000, epochs=1, batch_size=100):
    """Train a dense autoencoder on ``X_tr`` and return its encoder half.

    The network is: input -> ``hidden_dim`` (relu) -> ``encoding_dim`` (relu)
    -> ``hidden_dim`` (relu) -> input width (sigmoid). It is compiled with the
    'adadelta' optimizer and mean-squared-error loss and fitted to reconstruct
    its own input. The model summary is printed before training.

    Parameters
    ----------
    encoding_dim : int
        Number of units in the bottleneck layer.
    X_tr : 2-D table
        Training rows; ``X_tr.shape[1]`` sets the input/output width. Used as
        both the input and the reconstruction target.
    X_te : 2-D table
        Rows used only as validation data (input and target).
    hidden_dim : int, default 5000
        Units in the layers on either side of the bottleneck.
    epochs : int, default 1
        Number of training epochs.
    batch_size : int, default 100
        Mini-batch size passed to ``fit``.

    Returns
    -------
    keras.models.Model
        Model from the input layer to the bottleneck activations. It is built
        from the same layers as the trained autoencoder.
    """
    ncol = X_tr.shape[1]
    input_dim = Input(shape=(ncol,))

    encoded1 = Dense(hidden_dim, activation='relu')(input_dim)
    encoded2 = Dense(encoding_dim, activation='relu')(encoded1)
    decoded1 = Dense(hidden_dim, activation='relu')(encoded2)
    # NOTE(review): a sigmoid output presumes the inputs lie in [0, 1]
    # (e.g. min-max scaled) - confirm for other callers.
    output = Dense(ncol, activation='sigmoid')(decoded1)

    autoencoder = Model(inputs=input_dim, outputs=output)
    autoencoder.compile(optimizer='adadelta', loss='mean_squared_error')
    autoencoder.summary()

    # `epochs` is the Keras 2 name of this argument; the former `nb_epoch`
    # spelling is deprecated and is rejected by newer Keras releases.
    autoencoder.fit(
        X_tr,
        X_tr,
        epochs=epochs,
        batch_size=batch_size,
        shuffle=False,
        validation_data=(X_te, X_te),
    )

    # Reuse the trained layers up to the bottleneck as a stand-alone encoder.
    encoder = Model(inputs=input_dim, outputs=encoded2)
    return encoder

In [6]:
def encode_train_test_films_to_csv(encoder, X_train_sc, X_test_sc, y_train, y_test, encod_dim):
    """Encode the train and test features and write each, with labels, to CSV.

    For each partition the encoder's predictions become columns named
    'feature_0', 'feature_1', ..., indexed like the input frame; the label
    series is appended as the last column and the result is written to
    "train_compressed_to_<encod_dim>_features.csv" or
    "test_compressed_to_<encod_dim>_features.csv" in the working directory.

    Parameters
    ----------
    encoder : object with a ``predict`` method
        Called once on each feature frame.
    X_train_sc, X_test_sc : pandas.DataFrame
        Feature frames; their indexes are reused for the encoded frames.
    y_train, y_test : pandas.Series
        Labels, concatenated column-wise onto the encoded features.
    encod_dim : int
        Only used to build the output file names.

    Returns
    -------
    str
        Always the literal 'hoi'.
    """
    partitions = (
        ("train_compressed_to_", X_train_sc, y_train),
        ("test_compressed_to_", X_test_sc, y_test),
    )
    for name_prefix, scaled_features, labels in partitions:
        compressed = pd.DataFrame(encoder.predict(scaled_features)).add_prefix('feature_')
        compressed.index = scaled_features.index
        with_labels = pd.concat([compressed, labels], axis=1)
        csv_path = name_prefix + str(encod_dim) + "_features.csv"
        with_labels.to_csv(csv_path, index_label='id_of_the_film')
    return 'hoi'

In [7]:
# Split `df` into scaled train/test features and their genre labels
# (scale_split also writes the unscaled partitions to CSV).
X_train_sc, X_test_sc, y_train, y_test = scale_split(df)

  return self.partial_fit(X, y)


In [8]:
# Keep 85% of the scaled training rows for fitting the autoencoder; the
# remaining rows are passed to it as validation data.
X_train_ae, X_test_ae = train_test_split(
    X_train_sc,
    train_size=0.85,
    random_state=42,
)



In [9]:
# Size of the autoencoder bottleneck; also used in the output CSV file names.
encod_dim = 500

In [10]:
%%time
# Train the autoencoder and keep only its encoder half.
encoder = build_encoder_films(
    encoding_dim=encod_dim,
    X_tr=X_train_ae,
    X_te=X_test_ae,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 50005)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 5000)              250030000 
_________________________________________________________________
dense_2 (Dense)              (None, 500)               2500500   
_________________________________________________________________
dense_3 (Dense)              (None, 5000)              2505000   
_________________________________________________________________
dense_4 (Dense)              (None, 50005)             250075005 
Total params: 505,110,505
Trainable params: 505,110,505
Non-trainable params: 0
_________________________________________________________________


  # This is added back by InteractiveShellApp.init_path()


Train on 2856 samples, validate on 504 samples
Epoch 1/1
Wall time: 6min 7s


In [None]:
# Encode the full scaled train and test sets and write them, with labels, to CSV.
encode_train_test_films_to_csv(
    encoder=encoder,
    X_train_sc=X_train_sc,
    X_test_sc=X_test_sc,
    y_train=y_train,
    y_test=y_test,
    encod_dim=encod_dim,
)