# Lets create a Recommendation system for the Henry coefficients of porous materials

In [102]:
import json
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import pprint
import os
import pygaps
import scipy
from scipy.sparse.linalg import svds

## We will use the Henry Coefficient matrix, created in `HenryMatrix.ipynb`

In [14]:
henry_df = pd.read_csv('henry_matrix_median.csv')
henry_df.set_index(list(henry_df)[0], inplace=True)
henry_df[henry_df.columns[-5:]][-5:]

Unnamed: 0_level_0,URLKBWYHVLBVBO-UHFFFAOYSA-N_298K,RAHZWNYVWXNFOC-UHFFFAOYSA-N_298K,SWQJXJOGLNCZEY-UHFFFAOYSA-N_298K,VXNZUUAINFGPBY-UHFFFAOYSA-N_298K,QWTDNUCVQCZILF-UHFFFAOYSA-N_298K
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NIST-MATDB-d049583f6d607e59dc804c4029176dcb,,,,,
NIST-MATDB-8fad47f65fa08c7ac5157aa92c218b1d,,,,,
NIST-MATDB-b16ee19c4a2d5d7d2f7e09cb7263392f,,,,,
NIST-MATDB-655315f090663b4723680a6683f06403,,,,,
NIST-MATDB-7fdf5997d47b59af2318c9cf774e49ca,,,,,


## Lets also define a function that will randomly split our data into a testing and training set

In [98]:
# Will create a mask that will split the sparse matrix into two matrices. 
#  `train_frac` is the fraction of the data (excluding the NaN's) that will be used for training.
#  the fraction of the data used for testing will be `1 - train_frac`.
#
# train_matrix, test_matrix = split_matrix(henry_matrix, 0.8)
def split_matrix(henry_matrix, train_frac):
    matrix_size = np.size(henry_matrix)
    matrix_nan_size = np.sum(np.sum(np.isnan(henry_matrix)))
    henry_data_size = matrix_size - matrix_nan_size
    train_mask = np.zeros(henry_matrix.shape, dtype=bool)
    n_retained = np.floor(train_frac * henry_data_size)
    n_current = 0
    while True:
        i_material = random.choice(np.arange(len(henry_matrix.index)))
        j_gas = random.choice(np.arange(len(henry_matrix.columns)))
        if not np.isnan(henry_matrix.iat[i_material, j_gas]) and not train_mask[i_material, j_gas]:
            train_mask[i_material, j_gas] = True
            n_current += 1
        if n_current >= n_retained:
            break
    train_matrix = henry_matrix.mask(train_mask)
    test_matrix = henry_matrix.mask(~train_mask)
    #print(henry_data_size)
    #print(np.sum(np.sum(~np.isnan(train_matrix))) + np.sum(np.sum(~np.isnan(test_matrix))))
    assert henry_data_size == np.sum(np.sum(~np.isnan(train_matrix))) + np.sum(np.sum(~np.isnan(test_matrix)))
    return train_matrix, test_matrix
tr_m, te_m = split_matrix(henry_df, 0.8)

In [99]:
np.sum(np.sum(np.isnan(tr_m)))

11144

In [100]:
henry_df[henry_df.columns[:5]][:5]

Unnamed: 0_level_0,CURLTUGMZLYLDI-UHFFFAOYSA-N_298K,IJGRMHOSHXDMSA-UHFFFAOYSA-N_298K,VNWKTOKETHGBQD-UHFFFAOYSA-N_298K,MYMOFIZGZYHOMD-UHFFFAOYSA-N_298K,IJGRMHOSHXDMSA-UHFFFAOYSA-N_77K
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NIST-MATDB-b2dc02871a66ac38e4b195796cf3e00b,0.643047,0.077438,0.239119,,1672.743597
NIST-MATDB-035dc75dddd00241bd76627f78cbef2d,73.595342,0.303794,0.770303,0.0910124,3379.72946
NIST-MATDB-453276ee992ed07dc207ed08b2b221ce,1.76034,0.271928,0.32673,2e-10,1227.679975
NIST-MATDB-1b85b1862d4e9cc2e0f46d0e2698ab79,2.172201,0.200809,0.690172,0.182601,1875.572443
NIST-MATDB-991daf7313251e7e607e2bab2da57e33,4.277288,0.216257,0.545481,0.2298642,1857.12642


In [101]:
tr_m[henry_df.columns[:5]][:5]

Unnamed: 0_level_0,CURLTUGMZLYLDI-UHFFFAOYSA-N_298K,IJGRMHOSHXDMSA-UHFFFAOYSA-N_298K,VNWKTOKETHGBQD-UHFFFAOYSA-N_298K,MYMOFIZGZYHOMD-UHFFFAOYSA-N_298K,IJGRMHOSHXDMSA-UHFFFAOYSA-N_77K
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NIST-MATDB-b2dc02871a66ac38e4b195796cf3e00b,,,,,1672.743597
NIST-MATDB-035dc75dddd00241bd76627f78cbef2d,73.595342,0.303794,,,
NIST-MATDB-453276ee992ed07dc207ed08b2b221ce,,0.271928,,,
NIST-MATDB-1b85b1862d4e9cc2e0f46d0e2698ab79,2.172201,0.200809,,,
NIST-MATDB-991daf7313251e7e607e2bab2da57e33,,,,,
