In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from time import time
from typing import List
from sklearn.model_selection import train_test_split
import pandas as pd
from typing import List
import warnings
warnings.filterwarnings('ignore')

In [2]:
# `data` is the frame every later cell filters by `feedback` and `case_id`.
data = pd.read_csv(filepath_or_buffer='data/train.csv')
# Second CSV from the same folder; presumably the hold-out submission set (not used
# by the cells visible here).
submission_test_data = pd.read_csv(filepath_or_buffer='data/test.csv')

In [3]:
# Columns fed to the one-hot encoder / nearest-neighbour search.
X_features = [
    "action_recommendation_id",
    "action_recommendation_type",
    "action_recommendation_category",
    "equipment_area",
    "usage_type",
    "speed_category",
    "load_category",
    "floors_category",
    "equipment_category",
]

# Rows with feedback == 1 become the reference set that gets searched; rows with
# feedback == 0 are the cases that are looked up against it.
train_data = data.loc[data.feedback == 1, X_features]

# Kept with every column (including case_id) so a case id can be mapped to a row position.
test_data_all = data[data.feedback == 0]
test_data = test_data_all.loc[:, X_features]

In [4]:
# Must be strings
def to_string(data, columns=("speed_category", "load_category", "floors_category")):
    """Return a copy of ``data`` with the given columns cast to ``str``.

    The cast makes ``pd.get_dummies`` treat these columns as categorical labels
    instead of passing them through as numbers.

    Unlike the previous version, the input frame is left untouched, so the
    function is safe to call on a filtered slice and safe to re-run.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing every column named in ``columns``.
    columns : iterable of str, optional
        Columns to convert. Defaults to the three category columns that were
        previously hard-coded.

    Returns
    -------
    pd.DataFrame
        New frame; all other columns keep their dtype.

    Raises
    ------
    KeyError
        If one of ``columns`` is missing from ``data``.
    """
    # astype with a mapping converts only the listed columns and returns a new frame.
    return data.astype({column: str for column in columns})

# Cast the category columns of both splits to strings before one-hot encoding.
train_data, test_data = to_string(train_data), to_string(test_data)

In [5]:
from scipy import sparse
def convert_to_sparse(train_df : pd.DataFrame, test_df : pd.DataFrame):
    """One-hot encode both frames onto the training column layout as CSR matrices.

    The training frame defines the feature space. The encoded test frame is
    aligned to it: dummy columns that only exist in training are added to the
    test matrix as zeros, and dummy columns that only exist in the test frame
    are dropped. Both matrices therefore have identical width and column order.

    Parameters
    ----------
    train_df : pd.DataFrame
        Reference rows; its encoded columns define the output layout.
    test_df : pd.DataFrame
        Query rows, encoded and aligned to the training layout.

    Returns
    -------
    (scipy.sparse.csr_matrix, scipy.sparse.csr_matrix)
        ``(train_matrix, test_matrix)`` with rows in the same order as the inputs.
    """
    # Pin the dummy dtype: pandas >= 2.0 emits bool dummies by default, and bool
    # columns mixed with integer fill columns turn into an object array that
    # scipy cannot convert to a sparse matrix.
    encoded_train = pd.get_dummies(train_df, dtype="uint8")
    encoded_test = pd.get_dummies(test_df, dtype="uint8")

    # A single reindex adds the missing training columns (as 0), drops test-only
    # columns and fixes the column order.
    encoded_test = encoded_test.reindex(columns=encoded_train.columns, fill_value=0)

    sparse_knn_train = sparse.csr_matrix(encoded_train.to_numpy())
    sparse_knn_test = sparse.csr_matrix(encoded_test.to_numpy())
    return sparse_knn_train, sparse_knn_test

# Encode both splits once; the nearest-neighbour search fits on the first matrix
# and takes its query rows from the second.
train_sparse, test_sparse = convert_to_sparse(train_df=train_data, test_df=test_data)

In [41]:
from sklearn.neighbors import NearestNeighbors

def find_closest_claim(case_id, n_neighbors=5):
    """Find the reference rows most similar to one case from the feedback == 0 set.

    Parameters
    ----------
    case_id
        Value compared with ``test_data_all.case_id``.
        NOTE(review): the widget passes the text-box string, so a match requires
        the column to hold strings - confirm against the CSV.
    n_neighbors : int, default 5
        Number of neighbours requested; capped at the number of rows in
        ``train_sparse`` so a small reference set does not raise.

    Returns
    -------
    (distance, nearest_indices)
        The two arrays returned by ``NearestNeighbors.kneighbors``. The indices
        are row positions in ``train_sparse`` (i.e. in the feedback == 1 subset),
        not positions in the full ``data`` frame.
        If ``case_id`` is not among the searchable cases, the sentinel pair
        ``("empty", "empty")`` is returned instead.
    """
    # Look the id up in the frame that test_sparse was built from. Checking the
    # full `data` frame would let ids of feedback == 1 rows through and then fail
    # when no row is found here.
    # Positions (not index labels) are needed because the filtered frame keeps
    # the index labels of the unfiltered one.
    positions = np.flatnonzero((test_data_all.case_id == case_id).to_numpy())
    if positions.size == 0:
        return "empty", "empty"

    # The model is fitted only after the cheap existence check has passed.
    neigh = NearestNeighbors(n_neighbors=min(n_neighbors, train_sparse.shape[0]))
    neigh.fit(train_sparse)

    # First match wins, as before.
    sparse_sample = test_sparse[int(positions[0]), :]
    distance, nearest_indices = neigh.kneighbors(sparse_sample)

    return distance, nearest_indices

In [40]:
import ipywidgets
from IPython.display import display, Markdown, clear_output
# from IPython.core.display import HTML
from ipywidgets import HTML

# ANSI escape sequences used to emphasise text printed into the Output widgets.
BOLD = '\033[1m'
UNDERLINE = '\033[4m'  # NOTE(review): not referenced by the cells visible here
END = '\033[0m'

# ------------------------------------------------------------------
# Widgets
# ------------------------------------------------------------------

# Layout of the top-level container: a centred flex column spanning the full width.
box_layout = ipywidgets.Layout(
    display='flex',
    flex_flow='column',
    align_items='center',
    width='100%',
)

# Output areas the click handlers print into.
# NOTE(review): `title` is created but not used by the cells visible here.
out, rnd_out, output, title = (ipywidgets.Output() for _ in range(4))

# Buttons and text boxes share one width.
btn_layout = ipywidgets.Layout(width='30%')
rnd_btn, btn, btn2 = (
    ipywidgets.Button(description=label, layout=btn_layout)
    for label in ('Take random case from test', 'Search similar cases', 'Find the case')
)

# txt feeds the similarity search, txt2 feeds the case lookup.
txt, txt2 = (
    ipywidgets.Text(description=label, layout=btn_layout)
    for label in ('case ID: ', 'Case ID: ')
)

# Static HTML fragments.
html1 = HTML(value="""<h1><center><u>Master of Maintenance</u></center></h1><h3><center>Similarity search engine</center></h3>""")
# NOTE(review): "<hline>" is not a standard HTML element ("<hr>" is the horizontal
# rule), and this widget is never placed in the container - confirm intent.
hline = HTML(value="<hline>")
br = HTML(value="<br>")
html2 = HTML(value="""<br><h3><center>Case search engine</center></h3>""")

##
# FUNCTIONS
##
def generate_random(b):
    """Click handler: put the id of a random feedback == 0 case into the search box.

    Parameters
    ----------
    b
        The clicked button, passed in by ipywidgets; not used.
    """
    # Filter once; the previous version built this subset twice and discarded one copy.
    candidates = data[data.feedback == 0]

    with rnd_out:
        clear_output()
        if len(candidates) == 0:
            # DataFrame.sample() raises on an empty frame; report it instead.
            print("No cases without feedback are available.")
            return
        random_row = candidates.sample()
        txt.value = str(random_row.case_id.values[0])

def on_button_clicked(b):
    """Click handler: list the reference cases closest to the id typed into ``txt``.

    Parameters
    ----------
    b
        The clicked button, passed in by ipywidgets; not used.
    """
    with output:
        clear_output()
        distances, row_nums = find_closest_claim(txt.value)

        # The "not found" sentinel is a plain string. Test the type rather than
        # using ``==``: on a successful search `distances` is an ndarray, and an
        # elementwise comparison cannot be used as an `if` condition.
        if isinstance(distances, str):
            print("No match found!")
            return

        # Neighbour positions refer to the rows the model was fitted on, i.e. the
        # feedback == 1 subset in its original order - not to rows of the full frame.
        reference_rows = data[data.feedback == 1]

        print("Closest case numbers are: ")
        for rank, position in enumerate(row_nums[0], start=1):
            # Column 0 is shown as the case number, as before.
            # NOTE(review): presumably this is case_id - confirm against the CSV header.
            print(f"{BOLD}{rank}.{END}{reference_rows.iloc[position, 0]}")

def find_case(b):
    """Click handler: print every column of the case whose id is typed into ``txt2``.

    Only the first row matching the id is shown. ``b`` is the clicked button
    passed in by ipywidgets and is not used.
    """
    with out:
        clear_output()
        matches = data[data.case_id == txt2.value]

        if not len(matches):
            print("No match found!")
        else:
            first_row = matches.values[0]
            # One "column:value" line per field, column name in bold.
            for col, val in zip(matches.columns.values, first_row):
                print(f"{BOLD}{col}:{END}{val}")

        
##
# LISTENERS
##
# Wire each button to its click handler.
rnd_btn.on_click(callback=generate_random)
btn.on_click(callback=on_button_clicked)
btn2.on_click(callback=find_case)

# ------------------------------------------------------------------
# Run
# ------------------------------------------------------------------
# box_layout sets flex_flow='column', so the children are laid out top to bottom.
box = ipywidgets.HBox(
    children=[
        # Similarity search section
        html1, txt, br, rnd_btn, rnd_out, btn, br, output,
        # Case lookup section
        html2, txt2, br, btn2, br, out,
    ],
    layout=box_layout,
)
display(box)

HBox(children=(HTML(value='<h1><center><u>Master of Maintenance</u></center></h1><h3><center>Similarity search…