# Let's create a recommendation system for the Henry coefficients of porous materials

In [2]:
import json
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import pprint
import os
import pygaps
import scipy
from scipy.sparse.linalg import svds

## We will use the Henry Coefficient matrix, created in `HenryMatrix.ipynb`

In [3]:
# Read the Henry coefficient matrix and promote the first CSV column to the row index.
henry_df = (
    pd.read_csv('data/henry_matrix_df.csv')
    .pipe(lambda frame: frame.set_index(frame.columns[0]))
)
# Preview the first ten rows.
henry_df.head(10)

Unnamed: 0_level_0,CURLTUGMZLYLDI-UHFFFAOYSA-N_298K,IJGRMHOSHXDMSA-UHFFFAOYSA-N_298K,VNWKTOKETHGBQD-UHFFFAOYSA-N_298K,MYMOFIZGZYHOMD-UHFFFAOYSA-N_298K,QQONPFPTGQHPMA-UHFFFAOYSA-N_298K,ATUOYWHBWRKTHZ-UHFFFAOYSA-N_298K,HSFWRNGVRCDJHI-UHFFFAOYSA-N_298K,UFHFLCQGNIYNRP-UHFFFAOYSA-N_298K,MWUXSHHQAYIFBG-UHFFFAOYSA-N_298K,XLYOFNOQVPJJNP-UHFFFAOYSA-N_298K,...,VGGSQFUCUMXWEO-UHFFFAOYSA-N_298K,QGZKDVFQNNGYKY-UHFFFAOYSA-N_298K,CSCPPACGZOOCGX-UHFFFAOYSA-N_298K,IJDNQMDRQITEOD-UHFFFAOYSA-N_298K,NNPPMTNAJDCUHE-UHFFFAOYSA-N_298K,BDERNNFJNOPAEC-UHFFFAOYSA-N_298K,YXFVVABEGXRONW-UHFFFAOYSA-N_298K,OFBQJSOFQDEBGM-UHFFFAOYSA-N_298K,RAHZWNYVWXNFOC-UHFFFAOYSA-N_298K,VXNZUUAINFGPBY-UHFFFAOYSA-N_298K
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NIST-MATDB-b2dc02871a66ac38e4b195796cf3e00b,0.627878,0.208338,0.240586,,13.210824,15.439162,0.001502,3.488718,,9.316021,...,1.370187,0.958055,,,,,,,18.192574,
NIST-MATDB-035dc75dddd00241bd76627f78cbef2d,14.439543,0.613339,0.770274,0.096864,52.095266,80.817935,,,,,...,3.11604,442.583688,,257.522347,,,,,,97.061748
NIST-MATDB-453276ee992ed07dc207ed08b2b221ce,0.982443,0.286726,0.058081,0.188162,,,,3.694219,,2.344581,...,40.043567,1.40946,1.462035,,,,,,,
NIST-MATDB-1b85b1862d4e9cc2e0f46d0e2698ab79,2.198198,0.206994,0.735486,0.097778,,,,,,,...,,66.204409,,,,,,,,
NIST-MATDB-991daf7313251e7e607e2bab2da57e33,4.358242,0.250075,0.606177,0.238141,478.099058,25.5288,86.774658,0.077938,10.083501,2315.49944,...,36.576317,362.456884,,0.267295,386.886107,,,,34.533257,
NIST-MATDB-acbbe796d49416e999dc819f2ae65fe9,3.040278,,14.129264,,,,51.060147,7.839395,,,...,,,,,,,,,55.104883,
NIST-MATDB-2c9380bddc59f547484ff729eab7d7cb,1.015902,0.175303,0.351781,,,0.219119,1.15668,0.072082,0.171605,0.306919,...,,37.412808,,,,,,,16.823514,
NIST-MATDB-5ab0dbe0639729711750ef4b97715f0f,2.449089,0.195309,0.599994,0.122395,,,,0.720796,,493.780606,...,,,,,,,,,,
NIST-MATDB-0d52b4312e8dd98a53fcfaba14cd0e82,,,,,124.576242,88.002547,,1.429994,,52.073153,...,,,,224.890244,,,,,,125.484691
NIST-MATDB-b848f05ba9fb26cdd50755863b377c56,3.401754,0.415943,0.818044,,,,,0.008472,,0.709734,...,,,,,,,,,24.879337,


## Let's also define a function that will randomly split our data into <i>k</i> folds

In [None]:
def k_fold_split(henry_df, k, max_iter=1000, seed=None):
    """Randomly assign every observed (non-NaN) entry of `henry_df` to one of `k` folds.

    Within each row the observed entries are shuffled and dealt out to the folds
    in turn, so every fold receives the same number of entries from that row; the
    fewer-than-`k` entries left over are each given a uniformly random fold. The
    whole assignment is redrawn until every column contains all `k` fold labels.

    Parameters
    ----------
    henry_df : pandas.DataFrame
        Two-dimensional table of floats in which NaN marks a missing entry.
    k : int
        Number of folds; must be greater than 1.
    max_iter : int, optional
        Maximum number of complete random assignments to try.
    seed : int or None, optional
        If given, a private ``np.random.RandomState(seed)`` drives the split so
        the result is reproducible. If None, the global ``np.random`` state is
        used.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``henry_df.values``. Observed entries
        hold their fold label (0 .. k-1); missing entries stay NaN.

    Raises
    ------
    ValueError
        If some column has fewer than `k` observed entries, in which case no
        assignment can ever place all `k` folds in that column.
    Exception
        If no valid assignment is found within `max_iter` attempts.
    """
    assert k > 1, 'Choose a `k` greater than 1, otherwise this function is pointless'
    matrix = henry_df.values
    observed = ~np.isnan(matrix)

    # A column with fewer than `k` observed entries cannot contain all `k` folds,
    # so fail right away instead of retrying `max_iter` times.
    sparse_cols = np.flatnonzero(observed.sum(axis=0) < k)
    if len(sparse_cols) > 0:
        raise ValueError(
            'Cannot split into {} folds: column(s) {} have fewer than {} data points.'.format(
                k, sparse_cols.tolist(), k))

    # Both the `np.random` module and `RandomState` provide `permutation` and `randint`.
    rng = np.random if seed is None else np.random.RandomState(seed)
    fold_matrix = np.full_like(matrix, np.nan, dtype=np.double)

    for n_iter in range(1, max_iter + 1):
        for i_row in range(matrix.shape[0]):
            # Shuffle the available column indices of this row.
            j_indices = rng.permutation(np.flatnonzero(observed[i_row]))
            # The largest multiple of `k` is dealt out evenly (0, 1, ..., k-1, 0, 1, ...).
            n_even = (len(j_indices) // k) * k
            fold_matrix[i_row, j_indices[:n_even]] = np.arange(n_even) % k
            # Leftover data points (fewer than `k`) are assigned to random folds.
            n_left = len(j_indices) - n_even
            fold_matrix[i_row, j_indices[n_even:]] = rng.randint(0, k, size=n_left)

        # Accept the assignment only if every column holds every fold label.
        # NaN never compares equal, so missing entries are ignored here.
        if all((fold_matrix == fold).any(axis=0).all() for fold in range(k)):
            print('Number of iterations required to split data into {}-folds: {}'.format(k, n_iter))
            return fold_matrix

    # If too many iterations are performed an Exception is raised.
    raise Exception('Maximum number of iterations reached. Try changing the `max_iter` argument.')

# Label every observed Henry coefficient with one of five folds.
fold_matrix = k_fold_split(henry_df, k=5, max_iter=100000)

In [153]:
# Keep every entry whose fold label is not 0; entries assigned to fold 0 become NaN.
henry_df.where(fold_matrix != 0).values

array([[        nan,  0.20833811,         nan, ...,         nan,
        18.19257397,         nan],
       [14.43954267,         nan,  0.77027446, ...,         nan,
                nan,         nan],
       [        nan,  0.28672616,         nan, ...,         nan,
                nan,         nan],
       ...,
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan]])

### Let's start setting up the Recommender System algorithm

In [155]:
def recsys_rmse(r, p, q, lambd):
    """Regularised squared error of one observed entry under a latent-factor model.

    Evaluates ``(r - q^T p)**2 + lambd * (||p||**2 + ||q||**2)``.

    NOTE: despite the name, no mean or square root is taken here; the value is
    the per-entry squared error plus the L2 penalty.

    Parameters
    ----------
    r : float
        Observed value.
    p, q : array_like
        Latent factor vectors of equal length.
    lambd : float
        Weight of the L2 penalty on `p` and `q`.

    Returns
    -------
    float
        The regularised squared error. For 1-D `p` and `q` this is a scalar.
    """
    # `np.dot` gives the inner product q^T p. The previous `np.transpose(q) * p`
    # multiplied element-wise for ndarrays and so did not reduce to one prediction.
    prediction = np.dot(np.transpose(q), p)
    penalty = lambd * (np.linalg.norm(p)**2 + np.linalg.norm(q)**2)
    return (r - prediction)**2 + penalty

#### ALS methods implemented according to http://stanford.edu/~rezab/classes/cme323/S15/notes/lec14.pdf

In [None]:
def ALS(henry_matrix, threshold):
    rmse = np.inf
    while rmse < threshold:
        for indices in np.where(~np.isnan(henry_matrix)):
            