# Lets create a Recommendation system for the Henry coefficients of porous materials

In [102]:
import json
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import pprint
import os
import pygaps
import scipy
from scipy.sparse.linalg import svds

## We will use the Henry Coefficient matrix, created in `HenryMatrix.ipynb`

In [103]:
henry_df = pd.read_csv('data/henry_matrix_median.csv')
henry_df.set_index(list(henry_df)[0], inplace=True)
henry_df.head(10)

Unnamed: 0_level_0,CURLTUGMZLYLDI-UHFFFAOYSA-N_298K,IJGRMHOSHXDMSA-UHFFFAOYSA-N_298K,VNWKTOKETHGBQD-UHFFFAOYSA-N_298K,MYMOFIZGZYHOMD-UHFFFAOYSA-N_298K,IJGRMHOSHXDMSA-UHFFFAOYSA-N_77K,CURLTUGMZLYLDI-UHFFFAOYSA-N_195K,UFHFLCQGNIYNRP-UHFFFAOYSA-N_77K,QQONPFPTGQHPMA-UHFFFAOYSA-N_298K,ATUOYWHBWRKTHZ-UHFFFAOYSA-N_298K,HSFWRNGVRCDJHI-UHFFFAOYSA-N_298K,...,WEVYAHXRMPXWCK-UHFFFAOYSA-N_298K,YXFVVABEGXRONW-UHFFFAOYSA-N_298K,RWSOTUBLDIXVET-UHFFFAOYSA-N_298K,OFBQJSOFQDEBGM-UHFFFAOYSA-N_298K,LRHPLDYGYMQRHN-UHFFFAOYSA-N_298K,URLKBWYHVLBVBO-UHFFFAOYSA-N_298K,RAHZWNYVWXNFOC-UHFFFAOYSA-N_298K,SWQJXJOGLNCZEY-UHFFFAOYSA-N_298K,VXNZUUAINFGPBY-UHFFFAOYSA-N_298K,QWTDNUCVQCZILF-UHFFFAOYSA-N_298K
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NIST-MATDB-b2dc02871a66ac38e4b195796cf3e00b,0.643047,0.07743846,0.239119,,1672.743597,,160.335019,13.210848,18.987584,0.001506,...,,,,,,184.515114,65.502827,,,
NIST-MATDB-035dc75dddd00241bd76627f78cbef2d,73.595342,0.3037941,0.770303,0.0910124,3379.72946,,,52.095266,80.817935,,...,,,,,,,,0.000432,96.786201,
NIST-MATDB-453276ee992ed07dc207ed08b2b221ce,1.76034,0.2719285,0.32673,2e-10,1227.679975,91.53744,1208.198529,,,,...,,,12.899584,,,,,,,
NIST-MATDB-1b85b1862d4e9cc2e0f46d0e2698ab79,2.172201,0.2008086,0.690172,0.182601,1875.572443,,112.247437,,,,...,,,,,,2545.488306,,,,
NIST-MATDB-991daf7313251e7e607e2bab2da57e33,4.277288,0.2162572,0.545481,0.2298642,1857.12642,2e-10,1000.396903,638.464909,48.655815,86.774658,...,,,,5039.506517,,,94.944295,,,4128.514098
NIST-MATDB-9f02824618e94d86caedc9fb497366ba,0.226441,,0.00367,,1184.815284,,850.010218,,,5.888601,...,,,,,,,,,,
NIST-MATDB-acbbe796d49416e999dc819f2ae65fe9,3.040278,,14.129264,,2770.241788,,1249.618229,,,51.060141,...,,,,,,,57.137198,,,
NIST-MATDB-e23e0f3921a269533d74caae8c5a12bf,2.021117,2e-10,33.817715,,982.709651,0.3430361,1963.214804,,,,...,,,,1.402827,,,,,,
NIST-MATDB-9d023a670a2aedb5225d9cd365e24210,2.220104,,0.551645,,18491.161118,25.79633,25.17785,,,,...,,,,,,,17.700403,,,
NIST-MATDB-612dffd0588cbe4108374b80475f83a1,0.254916,,,,4114.442681,13.67906,18.092799,,,,...,,,,,,,,,,


## Lets also define a function that will randomly split our data into a testing and training set

In [98]:
# Will create a mask that will split the sparse matrix into two matrices. 
#  `train_frac` is the fraction of the data (excluding the NaN's) that will be used for training.
#  the fraction of the data used for testing will be `1 - train_frac`.
#
# train_matrix, test_matrix = split_matrix(henry_matrix, 0.8)
def split_matrix(henry_matrix, train_frac):
    matrix_size = np.size(henry_matrix)
    matrix_nan_size = np.sum(np.sum(np.isnan(henry_matrix)))
    henry_data_size = matrix_size - matrix_nan_size
    train_mask = np.zeros(henry_matrix.shape, dtype=bool)
    n_retained = np.floor(train_frac * henry_data_size)
    n_current = 0
    while True:
        i_material = random.choice(np.arange(len(henry_matrix.index)))
        j_gas = random.choice(np.arange(len(henry_matrix.columns)))
        if not np.isnan(henry_matrix.iat[i_material, j_gas]) and not train_mask[i_material, j_gas]:
            train_mask[i_material, j_gas] = True
            n_current += 1
        if n_current >= n_retained:
            break
    train_matrix = henry_matrix.mask(train_mask)
    test_matrix = henry_matrix.mask(~train_mask)
    #print(henry_data_size)
    #print(np.sum(np.sum(~np.isnan(train_matrix))) + np.sum(np.sum(~np.isnan(test_matrix))))
    assert henry_data_size == np.sum(np.sum(~np.isnan(train_matrix))) + np.sum(np.sum(~np.isnan(test_matrix)))
    return train_matrix, test_matrix
tr_m, te_m = split_matrix(henry_df, 0.8)

In [99]:
np.sum(np.sum(np.isnan(tr_m)))

11144

In [104]:
henry_df.head(10)

Unnamed: 0_level_0,CURLTUGMZLYLDI-UHFFFAOYSA-N_298K,IJGRMHOSHXDMSA-UHFFFAOYSA-N_298K,VNWKTOKETHGBQD-UHFFFAOYSA-N_298K,MYMOFIZGZYHOMD-UHFFFAOYSA-N_298K,IJGRMHOSHXDMSA-UHFFFAOYSA-N_77K,CURLTUGMZLYLDI-UHFFFAOYSA-N_195K,UFHFLCQGNIYNRP-UHFFFAOYSA-N_77K,QQONPFPTGQHPMA-UHFFFAOYSA-N_298K,ATUOYWHBWRKTHZ-UHFFFAOYSA-N_298K,HSFWRNGVRCDJHI-UHFFFAOYSA-N_298K,...,WEVYAHXRMPXWCK-UHFFFAOYSA-N_298K,YXFVVABEGXRONW-UHFFFAOYSA-N_298K,RWSOTUBLDIXVET-UHFFFAOYSA-N_298K,OFBQJSOFQDEBGM-UHFFFAOYSA-N_298K,LRHPLDYGYMQRHN-UHFFFAOYSA-N_298K,URLKBWYHVLBVBO-UHFFFAOYSA-N_298K,RAHZWNYVWXNFOC-UHFFFAOYSA-N_298K,SWQJXJOGLNCZEY-UHFFFAOYSA-N_298K,VXNZUUAINFGPBY-UHFFFAOYSA-N_298K,QWTDNUCVQCZILF-UHFFFAOYSA-N_298K
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NIST-MATDB-b2dc02871a66ac38e4b195796cf3e00b,0.643047,0.07743846,0.239119,,1672.743597,,160.335019,13.210848,18.987584,0.001506,...,,,,,,184.515114,65.502827,,,
NIST-MATDB-035dc75dddd00241bd76627f78cbef2d,73.595342,0.3037941,0.770303,0.0910124,3379.72946,,,52.095266,80.817935,,...,,,,,,,,0.000432,96.786201,
NIST-MATDB-453276ee992ed07dc207ed08b2b221ce,1.76034,0.2719285,0.32673,2e-10,1227.679975,91.53744,1208.198529,,,,...,,,12.899584,,,,,,,
NIST-MATDB-1b85b1862d4e9cc2e0f46d0e2698ab79,2.172201,0.2008086,0.690172,0.182601,1875.572443,,112.247437,,,,...,,,,,,2545.488306,,,,
NIST-MATDB-991daf7313251e7e607e2bab2da57e33,4.277288,0.2162572,0.545481,0.2298642,1857.12642,2e-10,1000.396903,638.464909,48.655815,86.774658,...,,,,5039.506517,,,94.944295,,,4128.514098
NIST-MATDB-9f02824618e94d86caedc9fb497366ba,0.226441,,0.00367,,1184.815284,,850.010218,,,5.888601,...,,,,,,,,,,
NIST-MATDB-acbbe796d49416e999dc819f2ae65fe9,3.040278,,14.129264,,2770.241788,,1249.618229,,,51.060141,...,,,,,,,57.137198,,,
NIST-MATDB-e23e0f3921a269533d74caae8c5a12bf,2.021117,2e-10,33.817715,,982.709651,0.3430361,1963.214804,,,,...,,,,1.402827,,,,,,
NIST-MATDB-9d023a670a2aedb5225d9cd365e24210,2.220104,,0.551645,,18491.161118,25.79633,25.17785,,,,...,,,,,,,17.700403,,,
NIST-MATDB-612dffd0588cbe4108374b80475f83a1,0.254916,,,,4114.442681,13.67906,18.092799,,,,...,,,,,,,,,,


In [105]:
tr_m.head(10)

Unnamed: 0_level_0,CURLTUGMZLYLDI-UHFFFAOYSA-N_298K,IJGRMHOSHXDMSA-UHFFFAOYSA-N_298K,VNWKTOKETHGBQD-UHFFFAOYSA-N_298K,MYMOFIZGZYHOMD-UHFFFAOYSA-N_298K,IJGRMHOSHXDMSA-UHFFFAOYSA-N_77K,CURLTUGMZLYLDI-UHFFFAOYSA-N_195K,UFHFLCQGNIYNRP-UHFFFAOYSA-N_77K,QQONPFPTGQHPMA-UHFFFAOYSA-N_298K,ATUOYWHBWRKTHZ-UHFFFAOYSA-N_298K,HSFWRNGVRCDJHI-UHFFFAOYSA-N_298K,...,WEVYAHXRMPXWCK-UHFFFAOYSA-N_298K,YXFVVABEGXRONW-UHFFFAOYSA-N_298K,RWSOTUBLDIXVET-UHFFFAOYSA-N_298K,OFBQJSOFQDEBGM-UHFFFAOYSA-N_298K,LRHPLDYGYMQRHN-UHFFFAOYSA-N_298K,URLKBWYHVLBVBO-UHFFFAOYSA-N_298K,RAHZWNYVWXNFOC-UHFFFAOYSA-N_298K,SWQJXJOGLNCZEY-UHFFFAOYSA-N_298K,VXNZUUAINFGPBY-UHFFFAOYSA-N_298K,QWTDNUCVQCZILF-UHFFFAOYSA-N_298K
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NIST-MATDB-b2dc02871a66ac38e4b195796cf3e00b,,,,,1672.743597,,,,18.987584,,...,,,,,,,65.502827,,,
NIST-MATDB-035dc75dddd00241bd76627f78cbef2d,73.595342,0.303794,,,,,,52.095266,,,...,,,,,,,,0.000432,96.786201,
NIST-MATDB-453276ee992ed07dc207ed08b2b221ce,,0.271928,,,,,1208.198529,,,,...,,,,,,,,,,
NIST-MATDB-1b85b1862d4e9cc2e0f46d0e2698ab79,2.172201,0.200809,,,,,,,,,...,,,,,,,,,,
NIST-MATDB-991daf7313251e7e607e2bab2da57e33,,,,,,,1000.396903,,,,...,,,,,,,,,,
NIST-MATDB-9f02824618e94d86caedc9fb497366ba,,,,,1184.815284,,,,,,...,,,,,,,,,,
NIST-MATDB-acbbe796d49416e999dc819f2ae65fe9,,,,,,,,,,,...,,,,,,,,,,
NIST-MATDB-e23e0f3921a269533d74caae8c5a12bf,2.021117,,33.817715,,,,,,,,...,,,,,,,,,,
NIST-MATDB-9d023a670a2aedb5225d9cd365e24210,,,,,,,,,,,...,,,,,,,,,,
NIST-MATDB-612dffd0588cbe4108374b80475f83a1,,,,,,,,,,,...,,,,,,,,,,
