## 1. Load the packages

In [1]:
# Standard library
import ast
import time
from collections import Counter

# Data processing packages
import numpy as np
import pandas as pd

# Machine learning packages
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MultiLabelBinarizer, FunctionTransformer
from sklearn.pipeline import Pipeline
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import SequentialFeatureSelector, RFE, SelectPercentile, chi2, mutual_info_regression, SelectFromModel
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
import torch

# Visualization packages
import seaborn as sns
import matplotlib.pyplot as plt

## 2. Read the data

In [2]:
# Load the two feature tables (X1, X2) and the values in Y1.csv.
X1 = pd.read_csv("X1.csv")
# Y1.csv is read without a header row and its column is named 'revenue '
# (presumably the target for the rows of X1 - TODO confirm).
# NOTE(review): the column name ends with a trailing space - confirm this is intentional.
Y1 = pd.read_csv("Y1.csv", header=None, names=['revenue '])
X2 = pd.read_csv("X2.csv")

In [3]:
X1

Unnamed: 0.1,Unnamed: 0,title,img_url,description,ratings,n_votes,is_adult,production_year,runtime,genres,release_year,studio,img_embeddings,text_embeddings
0,2502,Letters to Juliet,https://m.media-amazon.com/images/M/MV5BMjg0OT...,Letters to Juliet: Directed by Gary Winick. Wi...,6.5,92937.0,0,2010,105,"Adventure,Comedy,Drama",2010.0,Sum.,"[0.25030804, 2.4058464, 1.0431569, 0.030648155...","[-0.6795498, 0.35658365, 0.9994932, -0.9793934..."
1,6238,Veil of Tears,https://m.media-amazon.com/images/M/MV5BZjMxOD...,Veil of Tears: Directed by William Gereghty. W...,7.9,11.0,0,1996,\N,"Action,Crime,Drama",2014.0,WF,"[0.51250213, 2.8152602, 0.46308166, 0.29031387...","[-0.6202415, 0.31657028, 0.9992422, -0.9703722..."
2,1800,International Velvet,https://m.media-amazon.com/images/M/MV5BOGVkYj...,International Velvet: Directed by Bryan Forbes...,5.9,1345.0,0,1978,127,"Drama,Family,Sport",1978.0,MGM,"[0.18073043, 0.24735461, 0.63652813, 0.2496522...","[-0.709996, 0.4233521, 0.99980927, -0.98892415..."
3,2675,8 Seconds,https://m.media-amazon.com/images/M/MV5BYjY4Nz...,8 Seconds: Directed by John G. Avildsen. With ...,6.6,4851.0,0,1994,105,"Biography,Drama,Sport",1994.0,NL,"[0.025015268, 0.9105338, 0.3878257, 0.3421247,...","[-0.7416838, 0.38435012, 0.9998453, -0.9874693..."
4,3674,Penitentiary II,https://m.media-amazon.com/images/M/MV5BNjQyZW...,Penitentiary II: Directed by Jamaa Fanaka. Wit...,4.1,549.0,0,1982,108,"Crime,Drama,Sport",1982.0,MGM,"[0.19079691, 1.9068279, 0.29114372, 0.19527505...","[-0.65501904, 0.3845747, 0.9996712, -0.9766391..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3535,2787,Stiff Upper Lips,https://m.media-amazon.com/images/M/MV5BNGRiMz...,Stiff Upper Lips: Directed by Gary Sinyor. Wit...,6.3,973.0,0,1997,94,Comedy,1999.0,Cow.,"[0.19991912, 1.3718543, 1.1531808, 0.052152418...","[-0.55224955, 0.26953417, 0.99950045, -0.98031..."
3536,2319,Twenty Two,https://m.media-amazon.com/images/M/MV5BMjMyMz...,Twenty Two: Directed by Jack Smight. With Barb...,8.0,2190.0,0,1961,25,"Drama,Fantasy,Horror",2017.0,CL,"[0.38660493, 0.81947947, 1.9521054, 0.23831718...","[-0.52293366, 0.35181606, 0.9992165, -0.974448..."
3537,1856,Girlfight,https://m.media-amazon.com/images/M/MV5BMTMzMz...,Girlfight: Directed by Lawrence Trilling. With...,7.6,34.0,0,2001,44,"Drama,Romance",2000.0,SGem,"[0.15014637, 2.0139444, 1.0844889, 0.17270318,...","[-0.61463475, 0.4013893, 0.99981415, -0.985390..."
3538,1531,This Film Is Not Yet Rated,https://m.media-amazon.com/images/M/MV5BMTk0ND...,This Film Is Not Yet Rated: Directed by Kirby ...,7.5,27379.0,0,2006,98,Documentary,2006.0,IFC,"[0.25391683, 3.8994913, 0.23327282, 0.17322594...","[-0.74685824, 0.43955636, 0.99979496, -0.98784..."


## 3. Data Preprocessing

### STEP 1: Data Cleaning (dropping unused columns and handling missing values)

In [4]:
def data_cleaning_process(df):
    """
    Clean a raw feature dataframe without modifying the input.

    Missing entries are marked in the data with the literal text "\\N":
    in `runtime` they are replaced by the median of the known runtimes (the column is then
    cast to int64, so a fractional median is truncated), and in `genres` they are replaced
    by the label "Others". The columns "Unnamed: 0", "title", "img_url" and "description"
    are removed.
    :param df: A dataframe (X1 or X2)
    :return: A new cleaned dataframe
    """
    missing_token = '\\N'
    cleaned = df.copy()

    # runtime: the median is computed over the rows that actually carry a value
    runtime_col = cleaned['runtime']
    known_runtimes = runtime_col[runtime_col != missing_token].astype(np.int64)
    fill_value = np.median(known_runtimes)
    cleaned['runtime'] = np.where(runtime_col == missing_token, fill_value, runtime_col).astype(np.int64)

    # genres: missing entries get their own "Others" label
    cleaned['genres'] = cleaned['genres'].mask(cleaned['genres'] == missing_token, "Others")

    # columns that are not used as features
    unused_columns = ["Unnamed: 0", "title", "img_url", "description"]
    return cleaned.drop(columns=unused_columns)

In [5]:
# Clean X1: fill missing runtime / genres values and drop the unused columns.
X1_cleaned = data_cleaning_process(X1)

### STEP 2: Data Type Split (Numerical, Categorical, Embeddings)

In [6]:
def data_type_split(df):
    """
    Split a cleaned dataframe into three sub-dataframes by column kind.

    Numeric columns are detected by dtype, except `is_adult`, which is treated as a
    categorical flag. The two embedding columns are non-numeric (stringified lists) and are
    returned separately from the remaining categorical columns.
    :param df: A dataframe containing `is_adult`, `img_embeddings` and `text_embeddings`
    :return: three dataframes, in the order numerical, categorical, embeddings
    """
    frame = df.copy()
    embedding_features = ['img_embeddings', 'text_embeddings']

    # numeric by dtype, minus the binary flag
    numeric_features = frame.select_dtypes(include="number").columns.tolist()
    numeric_features.remove('is_adult')

    # everything non-numeric except the embeddings, plus the binary flag at the end
    categorical_features = frame.select_dtypes(exclude="number").columns.tolist()
    for embedding_column in embedding_features:
        categorical_features.remove(embedding_column)
    categorical_features.append('is_adult')

    numeric_part = frame.loc[:, numeric_features]
    categorical_part = frame.loc[:, categorical_features]
    embedding_part = frame.loc[:, embedding_features]
    return numeric_part, categorical_part, embedding_part

In [7]:
# Split the cleaned X1 into its numerical, categorical and embedding sub-frames.
df_num, df_cat, df_emb = data_type_split(X1_cleaned)

In [8]:
df_num.head()

Unnamed: 0,ratings,n_votes,production_year,runtime,release_year
0,6.5,92937.0,2010,105,2010.0
1,7.9,11.0,1996,95,2014.0
2,5.9,1345.0,1978,127,1978.0
3,6.6,4851.0,1994,105,1994.0
4,4.1,549.0,1982,108,1982.0


In [9]:
df_cat.head()

Unnamed: 0,genres,studio,is_adult
0,"Adventure,Comedy,Drama",Sum.,0
1,"Action,Crime,Drama",WF,0
2,"Drama,Family,Sport",MGM,0
3,"Biography,Drama,Sport",NL,0
4,"Crime,Drama,Sport",MGM,0


In [10]:
df_emb.head()

Unnamed: 0,img_embeddings,text_embeddings
0,"[0.25030804, 2.4058464, 1.0431569, 0.030648155...","[-0.6795498, 0.35658365, 0.9994932, -0.9793934..."
1,"[0.51250213, 2.8152602, 0.46308166, 0.29031387...","[-0.6202415, 0.31657028, 0.9992422, -0.9703722..."
2,"[0.18073043, 0.24735461, 0.63652813, 0.2496522...","[-0.709996, 0.4233521, 0.99980927, -0.98892415..."
3,"[0.025015268, 0.9105338, 0.3878257, 0.3421247,...","[-0.7416838, 0.38435012, 0.9998453, -0.9874693..."
4,"[0.19079691, 1.9068279, 0.29114372, 0.19527505...","[-0.65501904, 0.3845747, 0.9996712, -0.9766391..."


### STEP 3: Categorical Columns Processing (Genres --> multilabel binary columns, Studio --> studio frequency)

In [11]:
# Load the studio lookup that categorical_process uses to turn a studio name into a number.
# NOTE(review): torch.load unpickles the file, so only load "studio_freq" from a trusted source.
dict_cat_freq = torch.load("studio_freq")

In [12]:
# Fit the genre encoder once on the genres present in df_cat; categorical_process reuses
# this fitted `mlb` object instead of fitting a new encoder on every call.
mlb = MultiLabelBinarizer()
# NOTE: this adds a `genres_split` helper column to df_cat itself (an in-place change that
# persists into later cells).
df_cat['genres_split'] = df_cat['genres'].apply(lambda x: x.split(","))
mlb.fit(df_cat['genres_split'])

MultiLabelBinarizer()

In [13]:
def categorical_process(df):
    """
    Encode the categorical columns `genres`, `studio` and `is_adult`.

    `genres` (a comma-separated string) is expanded into one binary indicator column per
    genre with the notebook-level, already fitted `mlb` encoder;
    `studio` is replaced by its value in the notebook-level `dict_cat_freq` lookup, and
    studios that are not in the lookup get the smallest value stored in it;
    `is_adult` is passed through unchanged.
    :param df: A categorical dataframe with `genres`, `studio` and `is_adult` columns
    :return: A new dataframe (genre indicator columns, `studio_freq`, `is_adult`) that keeps
             the row index of `df`
    """

    new_df = df.copy()

    # processing on `genres` column
    genres_split = new_df['genres'].apply(lambda x: x.split(","))
    # Build the indicator frame on the caller's index. With a fresh 0..n-1 index the
    # column-wise concat below would align rows by label and silently produce NaN-padded,
    # misaligned rows whenever `df` does not have a default RangeIndex.
    genre_encoder_df = pd.DataFrame(
        mlb.transform(genres_split),
        columns=mlb.classes_.tolist(),
        index=new_df.index,
    )

    # processing on `studio` column
    # The fallback for unknown studios is the same for every row, so compute it once.
    fallback_freq = min(dict_cat_freq.values())
    studio_freq_df = (
        new_df['studio']
        .apply(lambda x: dict_cat_freq.get(x, fallback_freq))
        .to_frame('studio_freq')
    )

    processed_cat_df = pd.concat([genre_encoder_df, studio_freq_df, new_df['is_adult']], axis=1)

    return processed_cat_df

In [15]:
categorical_process(df_cat).columns

Index(['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'Game-Show',
       'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Others',
       'Reality-TV', 'Romance', 'Sci-Fi', 'Short', 'Sport', 'Thriller', 'War',
       'Western', 'studio_freq', 'is_adult'],
      dtype='object')

### STEP 4: Embedding Column Processing

In [12]:
def listToDF(df, column_name):
    """
    Expand a column of stringified lists into one dataframe column per list element.

    Every cell of `df[column_name]` must hold the text form of a Python list,
    e.g. "[0.25, 2.40, 1.04]".
    :param df: A dataframe containing `column_name`
    :param column_name: Name of the column that holds the stringified lists
    :return: A dataframe with one row per row of `df` (same index as `df`) and integer
             column labels 0..n-1, one per list element
    :raises ValueError, SyntaxError: if a cell is not a valid Python literal
    """
    # The cells come from a CSV file, so they are parsed with literal_eval, which only accepts
    # Python literals; eval() would execute whatever expression the file happens to contain.
    rows = [list(ast.literal_eval(cell)) for cell in df[column_name]]

    # Keep the caller's index so the result stays row-aligned when it is concatenated
    # column-wise with other frames derived from `df`.
    return pd.DataFrame(rows, index=df.index)

In [13]:
def embedding_process(df):
    """
    Expand the two embedding columns into a single wide numeric dataframe.

    `img_embeddings` and `text_embeddings` are each parsed with `listToDF` and the two
    results are placed side by side, image columns first. Both parts keep their own
    0..n-1 column labels, so the combined frame contains repeated column labels.
    :param df: A dataframe with `img_embeddings` and `text_embeddings` columns
    :return: A new dataframe holding the image embedding columns followed by the text ones
    """
    source = df.copy()
    expanded_parts = [
        listToDF(source, embedding_name)
        for embedding_name in ('img_embeddings', 'text_embeddings')
    ]
    return pd.concat(expanded_parts, axis=1)

In [14]:
embedding_process(X1.loc[:, ['img_embeddings', 'text_embeddings']])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.250308,2.405846,1.043157,0.030648,0.349497,0.320495,0.901077,0.674996,0.310549,0.533353,...,0.555539,-0.173167,0.399331,-0.378869,-0.409315,-0.344284,-0.073671,0.999885,-0.495468,0.959063
1,0.512502,2.815260,0.463082,0.290314,1.080451,0.333979,0.909705,0.817344,0.340002,0.507425,...,0.534597,-0.320714,0.385184,-0.552606,-0.399212,-0.445087,0.006780,0.999800,-0.508911,0.954674
2,0.180730,0.247355,0.636528,0.249652,0.100523,0.055299,0.850117,0.015236,0.232086,0.558831,...,0.697693,-0.035854,0.356325,-0.388621,-0.344016,-0.322001,-0.085762,0.999952,-0.458579,0.976136
3,0.025015,0.910534,0.387826,0.342125,0.449293,0.517494,0.213289,0.463569,0.863764,0.544507,...,0.692328,-0.009719,0.076474,-0.214084,-0.177248,0.025969,-0.112050,0.999954,-0.054416,0.977869
4,0.190797,1.906828,0.291144,0.195275,0.525999,0.502706,0.359901,0.074025,0.379786,0.182895,...,0.593455,-0.336046,0.406313,-0.384353,-0.334238,-0.377167,-0.131084,0.999879,-0.166202,0.951990
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3535,0.199919,1.371854,1.153181,0.052152,0.536693,0.082999,0.327779,0.067693,0.368952,0.440536,...,0.618991,-0.185828,0.461786,-0.595909,-0.495862,-0.498804,0.012033,0.999901,-0.685967,0.966164
3536,0.386605,0.819479,1.952105,0.238317,0.502970,0.193758,1.336698,0.286949,0.489620,0.567320,...,0.372036,-0.195845,0.200071,-0.582929,-0.504679,-0.504943,-0.131687,0.999818,-0.662812,0.968601
3537,0.150146,2.013944,1.084489,0.172703,0.609013,0.387161,0.484712,0.408787,0.177421,0.471901,...,0.567243,-0.106148,0.392279,-0.381554,-0.285943,-0.322065,-0.178053,0.999946,-0.321442,0.979969
3538,0.253917,3.899491,0.233273,0.173226,1.474815,0.347801,0.720919,0.262827,0.411901,0.196077,...,0.739504,-0.176527,0.220067,-0.451745,-0.179833,-0.007636,-0.181650,0.999941,-0.309001,0.972790


### STEP 5: Combine Everything

In [15]:
def data_combine(df_num, df_cat, df_emb):
    """
    Place the three feature dataframes side by side.

    Rows are matched on the index labels of the inputs.
    :param df_num: Dataframe of numerical features
    :param df_cat: Dataframe of categorical features
    :param df_emb: Dataframe of embedding features
    :return: A new dataframe with the columns of `df_num`, `df_cat` and `df_emb`, in that order
    """
    parts = [df_num, df_cat, df_emb]
    return pd.concat(parts, axis="columns")

In [16]:
# Combine the numeric columns, the encoded categorical columns and the embedding columns.
# Note that df_emb is passed as-is here, i.e. the embeddings are still unparsed strings.
df_processed = data_combine(df_num, categorical_process(df_cat), df_emb)

In [17]:
df_processed

Unnamed: 0,ratings,n_votes,production_year,runtime,release_year,Action,Adventure,Animation,Biography,Comedy,...,Sci-Fi,Short,Sport,Thriller,War,Western,studio_freq,is_adult,img_embeddings,text_embeddings
0,6.5,92937.0,2010,105,2010.0,0,1,0,0,1,...,0,0,0,0,0,0,0.001695,0,"[0.25030804, 2.4058464, 1.0431569, 0.030648155...","[-0.6795498, 0.35658365, 0.9994932, -0.9793934..."
1,7.9,11.0,1996,95,2014.0,1,0,0,0,0,...,0,0,0,0,0,0,0.000565,0,"[0.51250213, 2.8152602, 0.46308166, 0.29031387...","[-0.6202415, 0.31657028, 0.9992422, -0.9703722..."
2,5.9,1345.0,1978,127,1978.0,0,0,0,0,0,...,0,0,1,0,0,0,0.025141,0,"[0.18073043, 0.24735461, 0.63652813, 0.2496522...","[-0.709996, 0.4233521, 0.99980927, -0.98892415..."
3,6.6,4851.0,1994,105,1994.0,0,0,0,1,0,...,0,0,1,0,0,0,0.016949,0,"[0.025015268, 0.9105338, 0.3878257, 0.3421247,...","[-0.7416838, 0.38435012, 0.9998453, -0.9874693..."
4,4.1,549.0,1982,108,1982.0,0,0,0,0,0,...,0,0,1,0,0,0,0.025141,0,"[0.19079691, 1.9068279, 0.29114372, 0.19527505...","[-0.65501904, 0.3845747, 0.9996712, -0.9766391..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3535,6.3,973.0,1997,94,1999.0,0,0,0,0,1,...,0,0,0,0,0,0,0.002825,0,"[0.19991912, 1.3718543, 1.1531808, 0.052152418...","[-0.55224955, 0.26953417, 0.99950045, -0.98031..."
3536,8.0,2190.0,1961,25,2017.0,0,0,0,0,0,...,0,0,0,0,0,0,0.001130,0,"[0.38660493, 0.81947947, 1.9521054, 0.23831718...","[-0.52293366, 0.35181606, 0.9992165, -0.974448..."
3537,7.6,34.0,2001,44,2000.0,0,0,0,0,0,...,0,0,0,0,0,0,0.005650,0,"[0.15014637, 2.0139444, 1.0844889, 0.17270318,...","[-0.61463475, 0.4013893, 0.99981415, -0.985390..."
3538,7.5,27379.0,2006,98,2006.0,0,0,0,0,0,...,0,0,0,0,0,0,0.026836,0,"[0.25391683, 3.8994913, 0.23327282, 0.17322594...","[-0.74685824, 0.43955636, 0.99979496, -0.98784..."


### STEP 6: Normalization and Standardization

In [18]:
def data_scaling(df):
    """
    Scale the leading numerical columns of the combined feature dataframe.

    Columns are addressed by position: column 0 (`ratings` in this notebook's column
    layout) is min-max normalized, and columns 1 to 4 (the other numerical columns) are
    standardized. All remaining columns are returned unchanged.

    Both scalers are fitted inside this function on the dataframe that is passed in, so
    every call scales with the statistics of its own input.
    :param df: The combined feature dataframe, numerical columns first
    :return: A numpy array with the same shape as `df`, with columns 0 to 4 scaled
    """
    values = df.copy().to_numpy()

    # min-max normalization needs a 2-D input, hence the single-column reshape
    ratings_column = values[:, 0].reshape([-1, 1])
    values[:, 0] = MinMaxScaler().fit_transform(ratings_column).ravel()

    # zero-mean / unit-variance scaling for the remaining numerical columns
    values[:, 1:5] = StandardScaler().fit_transform(values[:, 1:5])

    return values

In [19]:
# Exploratory call: df_processed still holds the embedding columns as raw strings, which is
# why the array shown below mixes numbers and text. The DataEngineering function defined
# further down parses the embeddings before scaling.
data_scaling(df_processed)

array([[0.6067415730337079, 0.8091997164658064, 1.1113535552086757, ...,
        0,
        '[0.25030804, 2.4058464, 1.0431569, 0.030648155, 0.34949675, 0.32049546, 0.9010769, 0.67499596, 0.3105492, 0.53335327, 1.1203631, 1.1870352, 0.16469067, 0.08631029, 0.2536717, 0.8805131, 0.99093795, 0.5192096, 0.22073326, 1.1222858, 0.16405053, 0.43223396, 0.13318907, 0.15435342, 0.6851507, 0.29961434, 0.1571368, 0.36041096, 0.35249132, 0.41207317, 0.10110004, 0.8365802, 0.5149405, 0.956604, 0.9495678, 2.4414942, 0.3832077, 1.360116, 0.5607003, 0.21099378, 0.3635454, 0.9016296, 0.21133274, 0.80351084, 0.07196415, 0.7504479, 0.7727491, 0.6297995, 0.29605088, 0.6241131, 0.31069583, 0.47231564, 0.16313902, 0.20088163, 0.5571188, 0.38998926, 0.6070499, 0.28628415, 0.2539219, 0.7498281, 0.43571132, 0.5879961, 0.8000516, 0.35587543, 0.09601559, 0.13888705, 0.5510557, 0.22644973, 0.43565476, 0.5604339, 1.297827, 1.6763825, 0.26593068, 0.94142264, 1.0117874, 1.1187711, 0.62566483, 0.2033474, 0.4790194, 

### Construct Data Engineering Pipeline

In [20]:
def DataEngineering(df):
    """
    Run the complete preprocessing chain on a raw feature dataframe.

    The steps are: cleaning, split by column kind, categorical encoding, embedding
    expansion, recombination, and scaling of the numerical columns.
    :param df: A raw dataframe (X1 or X2)
    :return: The array returned by `data_scaling` for the fully processed features
    """
    cleaned = data_cleaning_process(df)
    numeric_part, categorical_part, embedding_part = data_type_split(cleaned)

    combined = data_combine(
        numeric_part,
        categorical_process(categorical_part),
        embedding_process(embedding_part),
    )
    return data_scaling(combined)

In [21]:
# Wrap the preprocessing function as a scikit-learn transformer so it can be used as a
# pipeline step.
preprocess_transformer = FunctionTransformer(DataEngineering)

In [22]:
# Single-step pipeline that exposes the preprocessing through the scikit-learn Pipeline API.
p1 = Pipeline([
    ('Preprocessor', preprocess_transformer)
])

In [23]:
# Persist the pipeline object to the file "preprocessor".
# NOTE(review): the pipeline wraps DataEngineering, which is defined in this notebook and
# relies on the notebook-level `mlb` and `dict_cat_freq` objects - presumably these must be
# defined again in any session that loads this file; verify before reusing it elsewhere.
torch.save(p1, "preprocessor")

In [24]:
# Run the full preprocessing on X1; the result is the numpy array returned by data_scaling.
X1_ready = p1.fit_transform(X1)

In [25]:
X1_ready

array([[ 0.60674157,  0.80919972,  1.11135356, ...,  0.99988544,
        -0.49546754,  0.95906293],
       [ 0.76404494, -0.27177631, -0.07338902, ...,  0.9998001 ,
        -0.5089115 ,  0.9546743 ],
       [ 0.53932584, -0.25625834, -1.59662947, ...,  0.9999524 ,
        -0.45857945,  0.9761356 ],
       ...,
       [ 0.73033708, -0.27150875,  0.34973333, ...,  0.99994576,
        -0.3214418 ,  0.97996914],
       [ 0.71910112,  0.04658617,  0.77285568, ...,  0.9999413 ,
        -0.3090013 ,  0.9727902 ],
       [ 0.33707865, -0.24519569,  0.26510886, ...,  0.9998354 ,
        -0.75486994,  0.9661582 ]])

In [26]:
# Save the preprocessed X1 array to the file 'X1_ready'.
torch.save(X1_ready, 'X1_ready')

In [27]:
# Sanity check: number of values in the first image embedding.
len(eval(X1.img_embeddings[0]))

2048

In [28]:
# Sanity check: number of values in the first text embedding.
len(eval(X1.text_embeddings[0]))

768

In [29]:
X1

Unnamed: 0.1,Unnamed: 0,title,img_url,description,ratings,n_votes,is_adult,production_year,runtime,genres,release_year,studio,img_embeddings,text_embeddings
0,2502,Letters to Juliet,https://m.media-amazon.com/images/M/MV5BMjg0OT...,Letters to Juliet: Directed by Gary Winick. Wi...,6.5,92937.0,0,2010,105,"Adventure,Comedy,Drama",2010.0,Sum.,"[0.25030804, 2.4058464, 1.0431569, 0.030648155...","[-0.6795498, 0.35658365, 0.9994932, -0.9793934..."
1,6238,Veil of Tears,https://m.media-amazon.com/images/M/MV5BZjMxOD...,Veil of Tears: Directed by William Gereghty. W...,7.9,11.0,0,1996,\N,"Action,Crime,Drama",2014.0,WF,"[0.51250213, 2.8152602, 0.46308166, 0.29031387...","[-0.6202415, 0.31657028, 0.9992422, -0.9703722..."
2,1800,International Velvet,https://m.media-amazon.com/images/M/MV5BOGVkYj...,International Velvet: Directed by Bryan Forbes...,5.9,1345.0,0,1978,127,"Drama,Family,Sport",1978.0,MGM,"[0.18073043, 0.24735461, 0.63652813, 0.2496522...","[-0.709996, 0.4233521, 0.99980927, -0.98892415..."
3,2675,8 Seconds,https://m.media-amazon.com/images/M/MV5BYjY4Nz...,8 Seconds: Directed by John G. Avildsen. With ...,6.6,4851.0,0,1994,105,"Biography,Drama,Sport",1994.0,NL,"[0.025015268, 0.9105338, 0.3878257, 0.3421247,...","[-0.7416838, 0.38435012, 0.9998453, -0.9874693..."
4,3674,Penitentiary II,https://m.media-amazon.com/images/M/MV5BNjQyZW...,Penitentiary II: Directed by Jamaa Fanaka. Wit...,4.1,549.0,0,1982,108,"Crime,Drama,Sport",1982.0,MGM,"[0.19079691, 1.9068279, 0.29114372, 0.19527505...","[-0.65501904, 0.3845747, 0.9996712, -0.9766391..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3535,2787,Stiff Upper Lips,https://m.media-amazon.com/images/M/MV5BNGRiMz...,Stiff Upper Lips: Directed by Gary Sinyor. Wit...,6.3,973.0,0,1997,94,Comedy,1999.0,Cow.,"[0.19991912, 1.3718543, 1.1531808, 0.052152418...","[-0.55224955, 0.26953417, 0.99950045, -0.98031..."
3536,2319,Twenty Two,https://m.media-amazon.com/images/M/MV5BMjMyMz...,Twenty Two: Directed by Jack Smight. With Barb...,8.0,2190.0,0,1961,25,"Drama,Fantasy,Horror",2017.0,CL,"[0.38660493, 0.81947947, 1.9521054, 0.23831718...","[-0.52293366, 0.35181606, 0.9992165, -0.974448..."
3537,1856,Girlfight,https://m.media-amazon.com/images/M/MV5BMTMzMz...,Girlfight: Directed by Lawrence Trilling. With...,7.6,34.0,0,2001,44,"Drama,Romance",2000.0,SGem,"[0.15014637, 2.0139444, 1.0844889, 0.17270318,...","[-0.61463475, 0.4013893, 0.99981415, -0.985390..."
3538,1531,This Film Is Not Yet Rated,https://m.media-amazon.com/images/M/MV5BMTk0ND...,This Film Is Not Yet Rated: Directed by Kirby ...,7.5,27379.0,0,2006,98,Documentary,2006.0,IFC,"[0.25391683, 3.8994913, 0.23327282, 0.17322594...","[-0.74685824, 0.43955636, 0.99979496, -0.98784..."


In [30]:
# Run the same preprocessing on X2.
# NOTE(review): data_scaling fits new MinMax / Standard scalers on whatever frame it
# receives, so X2 is scaled with its own statistics rather than those of X1 - confirm
# this is intended before feeding X1_ready and X2_ready to the same model.
X2_ready = p1.fit_transform(X2)

In [31]:
X1_ready[:5, :]

array([[ 0.60674157,  0.80919972,  1.11135356, ...,  0.99988544,
        -0.49546754,  0.95906293],
       [ 0.76404494, -0.27177631, -0.07338902, ...,  0.9998001 ,
        -0.5089115 ,  0.9546743 ],
       [ 0.53932584, -0.25625834, -1.59662947, ...,  0.9999524 ,
        -0.45857945,  0.9761356 ],
       [ 0.61797753, -0.21547426, -0.24263796, ...,  0.99995446,
        -0.05441597,  0.97786885],
       [ 0.33707865, -0.26551794, -1.25813159, ...,  0.99987864,
        -0.16620223,  0.95198965]])

In [32]:
X2_ready[:5, :]

array([[ 0.7       , -0.01345092,  0.5264698 , ...,  0.99991196,
        -0.4991867 ,  0.97200704],
       [ 0.5       , -0.22142813, -2.54367057, ...,  0.9999604 ,
        -0.21902409,  0.9811001 ],
       [ 0.55      , -0.22178896, -3.0947214 , ...,  0.9998148 ,
        -0.6797892 ,  0.9658753 ],
       [ 0.75      , -0.22115321, -1.91389819, ...,  0.9997429 ,
        -0.6106547 ,  0.93203384],
       [ 0.4875    , -0.21016499, -0.18202413, ...,  0.99988383,
        -0.52106243,  0.9676674 ]])

In [35]:
# Save the preprocessed X2 array to the file 'X2_ready'.
torch.save(X2_ready, 'X2_ready')