<a href="https://colab.research.google.com/github/SRI-CSL/CoProver/blob/main/src/notebooks/220629_metitarski/coprover_metitarski_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CoProver 

## &#10024; `MetiTarski`- problem

**Description:** Updated notebook to process v3 and v4 versions of MetiTarski data.

**Copyright 2022 SRI International.**

## &#9776; Import `needed` libraries

In [1]:
import os
import sys
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from matplotlib import rc
from tqdm import tqdm

In [2]:
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [3]:
try:
    from google.colab import data_table
    data_table.disable_dataframe_formatter()
    
    from google.colab import output
    output.enable_custom_widget_manager()
except Exception:
    print("Launched notebook locally")

In [4]:
from typing import List, Any, Dict

In [5]:
# install gdown library for .csv files download
try:
    import gdown
except ImportError:
    !pip install gdown

## &#9997; Set `needed` configuration

In [6]:
warnings.filterwarnings('ignore')

In [7]:
# origins of the .csv data files used
# True: originating within the signal-public GitHub repository
# False: need to be downloaded from a provided URL, especially useful if running on Colab
IS_LOCAL_FILE = False

In [8]:
# dictionary of files for this notebook to work
# the dictionary is composed of (filename, Google ID) key-value pairs

# v3: https://drive.google.com/file/d/1uC0WDg7fyZxwpc9UIgJznDAgT5WPqtA9/view?usp=sharing
# v4: https://drive.google.com/file/d/1uIoGOoHPsugXszScyU4HS9RKznX6tFeO/view?usp=sharing
DATASET_DICT = {
    'metitarski_dataset_v3.csv': '1uC0WDg7fyZxwpc9UIgJznDAgT5WPqtA9',
    'metitarski_dataset_v4.csv': '1uIoGOoHPsugXszScyU4HS9RKznX6tFeO'
    }

## &#9881; Define `needed` functions

In [9]:
def path_exists(input_path: str) -> bool:
    return os.path.exists(input_path)

In [10]:
def check_file_status(input_path: str):
    if path_exists(input_path=input_path):
        print(f"- File {input_path.split('/')[-1]} exists locally at {input_path}!")
    else:
        if IS_LOCAL_FILE:
            print("- Something went wrong with the download. Please try again!")
        else:
            print(f"- IS_LOCAL_FILE is set to {IS_LOCAL_FILE}. The file is accessed via a public GitHub link!")

In [11]:
def download_dataset_from_google_drive(google_file_id: str, output_file_name: str, quiet_download: bool) -> str:
    file_path = f'./{output_file_name}'

    if not os.path.exists(file_path):
        gdown.download(id=google_file_id, output=output_file_name, quiet=quiet_download)
    else:
        print(f"{output_file_name} already exists!")
        
    return file_path

In [12]:
def get_dataset(dataset_name: str, is_local_file: bool) -> str:
    file_path = f'./{dataset_name}'

    if is_local_file:
        file_path = f'https://raw.githubusercontent.com/SRI-CSL/CoProver/main/data/{dataset_name}'
    else:
        if dataset_name in DATASET_DICT:
            file_path = download_dataset_from_google_drive(google_file_id=DATASET_DICT[dataset_name], output_file_name=dataset_name, quiet_download=False)
        else:
            print(f"{dataset_name} is not present in dataset dictionary! Please ensure the file name is correct!")
            return

    return file_path

## &#9749; Download datasets

In [13]:
# metitarski_dataset_v1.csv
path_metitarski_original = get_dataset(dataset_name='metitarski_dataset_v3.csv', is_local_file=IS_LOCAL_FILE)

metitarski_dataset_v3.csv already exists!


In [14]:
check_file_status(input_path=path_metitarski_original)

- File metitarski_dataset_v3.csv exists locally at ./metitarski_dataset_v3.csv!


In [15]:
# metitarski_dataset_v4.csv
path_metitarski = get_dataset(dataset_name='metitarski_dataset_v4.csv', is_local_file=IS_LOCAL_FILE)

metitarski_dataset_v4.csv already exists!


In [16]:
check_file_status(input_path=path_metitarski)

- File metitarski_dataset_v4.csv exists locally at ./metitarski_dataset_v4.csv!


## &#128722; Load data

In [17]:
df_metitarski = pd.read_csv(path_metitarski, sep='\t')

In [18]:
df_metitarski.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41369 entries, 0 to 41368
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        41369 non-null  int64  
 1   file_id           41369 non-null  int64  
 2   input_file        41369 non-null  object 
 3   label_file        41369 non-null  object 
 4   nr_polynomials    41369 non-null  int64  
 5   max_total_degree  41369 non-null  int64  
 6   max_x1            41369 non-null  int64  
 7   max_x2            41369 non-null  int64  
 8   max_x3            41369 non-null  int64  
 9   prop_x1           41369 non-null  float64
 10  prop_x2           41369 non-null  float64
 11  prop_x3           41369 non-null  float64
 12  prop_mon_x1       41369 non-null  float64
 13  prop_mon_x2       41369 non-null  float64
 14  prop_mon_x3       41369 non-null  float64
 15  label             41369 non-null  int64  
dtypes: float64(6), int64(8), object(2)
memor

In [19]:
df_metitarski.head()

Unnamed: 0.1,Unnamed: 0,file_id,input_file,label_file,nr_polynomials,max_total_degree,max_x1,max_x2,max_x3,prop_x1,prop_x2,prop_x3,prop_mon_x1,prop_mon_x2,prop_mon_x3,label
0,0,415,poly415-perm0.txt.ml,comp_times415-perm0.txt,10,2,2,2,1,0.5,0.5,0.5,0.25,0.25,0.25,4
1,1,2230,poly2230-perm2.txt.ml,comp_times2230-perm2.txt,6,4,1,2,2,0.333333,0.666667,0.5,0.333333,0.47619,0.428571,0
2,2,6506,poly6506-perm3.txt.ml,comp_times6506-perm3.txt,6,16,16,1,1,0.5,0.333333,0.5,0.5,0.1,0.15,5
3,3,3998,poly3998-perm5.txt.ml,comp_times3998-perm5.txt,9,3,3,3,3,0.555556,0.555556,0.555556,0.35,0.35,0.35,1
4,4,3730,poly3730-perm2.txt.ml,comp_times3730-perm2.txt,14,9,1,9,3,0.214286,0.785714,0.142857,0.166667,0.611111,0.111111,3


In [20]:
df_metitarski.label.unique()

array([4, 0, 5, 1, 3, 2])

In [21]:
df_metitarski_original = pd.read_csv(path_metitarski_original, sep='\t')

In [22]:
df_metitarski_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6895 entries, 0 to 6894
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        6895 non-null   int64  
 1   file_id           6895 non-null   int64  
 2   input_file        6895 non-null   object 
 3   label_file        6895 non-null   object 
 4   nr_polynomials    6895 non-null   int64  
 5   max_total_degree  6895 non-null   int64  
 6   max_x1            6895 non-null   int64  
 7   max_x2            6895 non-null   int64  
 8   max_x3            6895 non-null   int64  
 9   prop_x1           6895 non-null   float64
 10  prop_x2           6895 non-null   float64
 11  prop_x3           6895 non-null   float64
 12  prop_mon_x1       6895 non-null   float64
 13  prop_mon_x2       6895 non-null   float64
 14  prop_mon_x3       6895 non-null   float64
 15  label             6895 non-null   int64  
dtypes: float64(6), int64(8), object(2)
memory 

In [23]:
df_metitarski_original.head()

Unnamed: 0.1,Unnamed: 0,file_id,input_file,label_file,nr_polynomials,max_total_degree,max_x1,max_x2,max_x3,prop_x1,prop_x2,prop_x3,prop_mon_x1,prop_mon_x2,prop_mon_x3,label
0,0,3940,poly3940.txt.ml,comp_times3940.txt,4,1,1,1,1,0.5,0.5,0.25,0.4,0.4,0.2,0
1,1,5554,poly5554.txt.ml,comp_times5554.txt,12,10,10,9,1,0.666667,0.666667,0.25,0.380952,0.52381,0.071429,4
2,2,4063,poly4063.txt.ml,comp_times4063.txt,9,1,1,1,1,0.444444,0.444444,0.555556,0.181818,0.181818,0.227273,5
3,3,4732,poly4732.txt.ml,comp_times4732.txt,7,8,4,2,1,0.428571,0.285714,0.428571,0.285714,0.142857,0.214286,2
4,4,5205,poly5205.txt.ml,comp_times5205.txt,6,18,12,6,1,0.5,0.333333,0.5,0.55,0.55,0.15,5


In [24]:
df_metitarski_original.label.unique()

array([0, 4, 5, 2, 1, 3])

## &#129504; MetiTarski RTF

In [25]:
FEATURE_COLUMNS = ['nr_polynomials', 'max_total_degree', 'max_x1', 'max_x2', 'max_x3', 'prop_x1', 'prop_x2', 'prop_x3', 'prop_mon_x1', 'prop_mon_x2', 'prop_mon_x3']

FEATURE_COLUMNS

['nr_polynomials',
 'max_total_degree',
 'max_x1',
 'max_x2',
 'max_x3',
 'prop_x1',
 'prop_x2',
 'prop_x3',
 'prop_mon_x1',
 'prop_mon_x2',
 'prop_mon_x3']

In [26]:
def training_set_scaler(input_df: pd.DataFrame):
    scaler = StandardScaler()
    scaler = scaler.fit(input_df)

    return scaler

In [27]:
def scale_data(input_df: pd.DataFrame, scaler):
    df_scaled = pd.DataFrame(scaler.transform(input_df), index=input_df.index, columns=input_df.columns)

    return df_scaled

### D1: Original MetiTarski Data

In [28]:
# original metitarski dataset features
df_features_original = df_metitarski_original[FEATURE_COLUMNS].copy().reset_index()

In [29]:
df_features_original.drop(['index'], axis=1, inplace=True)

In [30]:
X_train_original, X_test_original, y_train_original, y_test_original = train_test_split(df_features_original, df_metitarski_original.label, test_size=0.1)
X_train_original.shape, X_test_original.shape, y_train_original.shape, y_test_original.shape

((6205, 11), (690, 11), (6205,), (690,))

In [31]:
# scale original training set
scaler_original = training_set_scaler(input_df=X_train_original)

In [32]:
# rescale data
X_train_D1 = scale_data(input_df=X_train_original, scaler=scaler_original)
X_train_D1.head()

Unnamed: 0,nr_polynomials,max_total_degree,max_x1,max_x2,max_x3,prop_x1,prop_x2,prop_x3,prop_mon_x1,prop_mon_x2,prop_mon_x3
5719,0.660058,0.225138,0.044509,-0.130596,-0.441678,0.236028,1.322997,0.00492,1.502599,1.478539,1.770495
3751,-0.537591,1.513419,1.647402,-0.608678,-0.441678,-0.40884,-0.095131,-0.46523,0.563515,-1.086798,-0.987013
2028,-1.136415,-0.741073,-0.596648,-0.130596,-0.441678,-0.40884,-0.095131,0.710144,-1.349434,-0.570844,0.208895
1923,-0.537591,-0.741073,-0.596648,-0.608678,-0.441678,-0.40884,-0.095131,-0.46523,-1.290574,-0.509785,-0.460145
5292,0.360646,-0.580038,-0.756938,-0.608678,-0.441678,1.38246,-0.489055,-0.46523,0.114387,0.015859,0.161626


In [33]:
X_test_D1 = scale_data(input_df=X_test_original, scaler=scaler_original)
X_test_D1.head()

Unnamed: 0,nr_polynomials,max_total_degree,max_x1,max_x2,max_x3,prop_x1,prop_x2,prop_x3,prop_mon_x1,prop_mon_x2,prop_mon_x3
5306,-1.435827,-0.902108,-0.756938,-0.608678,-0.441678,0.66594,-1.276904,-0.46523,-0.125147,-0.7296,-0.008543
6001,-0.238179,0.225138,-0.27607,0.825567,-0.441678,-0.86946,-0.601605,1.213876,0.019834,0.097598,0.094454
2820,-1.136415,2.8017,2.929717,-0.608678,-0.441678,-0.40884,-0.095131,-1.052917,2.093874,-1.36462,-1.421889
1036,-0.537591,1.513419,1.647402,-0.608678,-0.441678,-0.40884,-0.095131,-0.46523,0.744741,-1.030399,-0.935515
2104,-0.238179,0.064103,0.044509,-0.130596,1.742078,0.05178,0.411343,1.213876,1.358124,1.138826,1.797865


### D2: New (Larger) MetiTarski Data

In [34]:
df_features = df_metitarski[FEATURE_COLUMNS].copy().reset_index()

In [35]:
df_features.head()

Unnamed: 0,index,nr_polynomials,max_total_degree,max_x1,max_x2,max_x3,prop_x1,prop_x2,prop_x3,prop_mon_x1,prop_mon_x2,prop_mon_x3
0,0,10,2,2,2,1,0.5,0.5,0.5,0.25,0.25,0.25
1,1,6,4,1,2,2,0.333333,0.666667,0.5,0.333333,0.47619,0.428571
2,2,6,16,16,1,1,0.5,0.333333,0.5,0.5,0.1,0.15
3,3,9,3,3,3,3,0.555556,0.555556,0.555556,0.35,0.35,0.35
4,4,14,9,1,9,3,0.214286,0.785714,0.142857,0.166667,0.611111,0.111111


In [36]:
df_features.drop(['index'], axis=1, inplace=True)

In [37]:
X_train, X_test, y_train, y_test = train_test_split(df_features, df_metitarski.label, test_size=0.1)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((37232, 11), (4137, 11), (37232,), (4137,))

In [38]:
# scale the new training set
scaler_new_dataset = training_set_scaler(input_df=X_train)

In [39]:
# rescale data
X_train_D2 = scale_data(input_df=X_train, scaler=scaler_new_dataset)
X_train_D2.head()

Unnamed: 0,nr_polynomials,max_total_degree,max_x1,max_x2,max_x3,prop_x1,prop_x2,prop_x3,prop_mon_x1,prop_mon_x2,prop_mon_x3
31658,-0.533866,-0.584127,-0.017542,-0.483675,-0.019644,-0.979982,0.05051,0.047988,-1.03611,-0.584845,0.311387
3924,0.666991,-0.744384,-0.251803,-0.250312,-0.482845,0.05255,0.05051,0.047988,-0.1837,-0.184604,-0.188133
4229,1.567634,-0.103355,-0.017542,0.683143,-0.482845,0.767379,1.719449,-1.615424,0.167293,1.63649,-1.30706
31880,-1.134294,-0.423869,0.216719,-0.483675,-0.482845,0.05255,-1.499218,0.047988,0.167293,-1.235236,-0.537798
7182,0.366777,-0.744384,-0.486064,-0.250312,-0.251244,-0.291627,0.394895,0.391231,-0.608586,-0.276765,-0.280151


In [40]:
# rescale data
X_test_D2 = scale_data(input_df=X_test, scaler=scaler_new_dataset)
X_test_D2.head()

Unnamed: 0,nr_polynomials,max_total_degree,max_x1,max_x2,max_x3,prop_x1,prop_x2,prop_x3,prop_mon_x1,prop_mon_x2,prop_mon_x3
23568,-1.134294,1.499219,-0.486064,-0.483675,2.991159,0.05255,-1.499218,0.047988,-1.03611,-1.485386,2.109663
11580,1.567634,0.858189,1.622286,-0.483675,2.064758,0.767379,-1.618428,1.236139,0.30769,-1.515404,1.420324
30702,-1.734722,-0.584127,-0.017542,-0.483675,-0.482845,3.150145,0.05051,0.047988,1.671546,-1.035115,-1.037319
24807,-0.533866,-0.584127,-0.017542,-0.483675,-0.019644,0.05255,0.05051,-0.981744,0.167293,-0.359709,-0.887463
15318,0.366777,-0.584127,-0.251803,-0.483675,-0.482845,1.773436,-0.293874,-0.981744,0.768994,-0.134574,-0.437894


### Scaling D2 on D1 scaler

In [41]:
X_train_D2_on_D1 = scale_data(input_df=X_train, scaler=scaler_original)
X_train_D2_on_D1.head()

Unnamed: 0,nr_polynomials,max_total_degree,max_x1,max_x2,max_x3,prop_x1,prop_x2,prop_x3,prop_mon_x1,prop_mon_x2,prop_mon_x3
31658,-0.537591,-0.580038,-0.436359,-0.608678,3.925834,-1.48362,-0.095131,0.710144,-1.895991,-0.627543,1.529054
3924,0.660058,-0.741073,-0.596648,-0.130596,-0.441678,-0.40884,-0.095131,0.710144,-0.966845,-0.173957,0.75249
4229,1.558294,-0.096933,-0.436359,1.781731,-0.441678,0.335238,1.813887,-1.188537,-0.584255,1.889859,-0.987013
31880,-1.136415,-0.419003,-0.27607,-0.608678,-0.441678,-0.40884,-1.867791,0.710144,-0.584255,-1.36462,0.208895
7182,0.360646,-0.741073,-0.756938,-0.130596,1.742078,-0.7671,0.298793,1.101936,-1.42998,-0.278401,0.609439


In [42]:
X_test_D2_on_D1 = scale_data(input_df=X_test, scaler=scaler_original)
X_test_D2_on_D1.head()

Unnamed: 0,nr_polynomials,max_total_degree,max_x1,max_x2,max_x3,prop_x1,prop_x2,prop_x3,prop_mon_x1,prop_mon_x2,prop_mon_x3
23568,-1.136415,1.513419,-0.756938,-0.608678,32.314664,-0.40884,-1.867791,0.710144,-1.895991,-1.648111,4.324685
11580,1.558294,0.869278,0.685666,-0.608678,23.579639,0.335238,-2.004149,2.066345,-0.431219,-1.68213,3.253026
30702,-1.735239,-0.580038,-0.436359,-0.608678,-0.441678,2.8155,-0.095131,0.710144,1.055416,-1.137827,-0.567669
24807,-0.537591,-0.580038,-0.436359,-0.608678,3.925834,-0.40884,-0.095131,-0.46523,-0.584255,-0.372401,-0.3347
15318,0.360646,-0.580038,-0.596648,-0.608678,-0.441678,1.38246,-0.489055,-0.46523,0.071613,-0.117259,0.364208


### Scaling D1 on D2 scaler

In [43]:
X_train_D1_on_D2 = scale_data(input_df=X_train_original, scaler=scaler_new_dataset)
X_train_D1_on_D2.head()

Unnamed: 0,nr_polynomials,max_total_degree,max_x1,max_x2,max_x3,prop_x1,prop_x2,prop_x3,prop_mon_x1,prop_mon_x2,prop_mon_x3
5719,0.666991,0.21716,0.685241,-0.250312,-0.482845,0.672069,1.290294,-0.569851,2.081797,1.273545,0.466693
3751,-0.533866,1.499219,3.027852,-0.483675,-0.482845,0.05255,0.05051,-0.981744,1.22027,-0.990088,-1.30706
2028,-1.134294,-0.744384,-0.251803,-0.250312,-0.482845,0.05255,0.05051,0.047988,-0.534692,-0.534815,-0.537798
1923,-0.533866,-0.744384,-0.251803,-0.483675,-0.482845,0.05255,0.05051,-0.981744,-0.480694,-0.480936,-0.968155
5292,0.366777,-0.584127,-0.486064,-0.483675,-0.482845,1.773436,-0.293874,-0.981744,0.808235,-0.017112,-0.568204


In [44]:
X_test_D1_on_D2 = scale_data(input_df=X_test_original, scaler=scaler_new_dataset)
X_test_D1_on_D2.head()

Unnamed: 0,nr_polynomials,max_total_degree,max_x1,max_x2,max_x3,prop_x1,prop_x2,prop_x3,prop_mon_x1,prop_mon_x2,prop_mon_x3
5306,-1.434508,-0.904641,-0.486064,-0.483675,-0.482845,1.085082,-0.982642,-0.981744,0.588484,-0.674899,-0.677664
6001,-0.233651,0.21716,0.216719,0.216415,-0.482845,-0.389964,-0.392269,0.489301,0.721491,0.055014,-0.611412
2820,-1.134294,2.781277,4.901941,-0.483675,-0.482845,0.05255,0.05051,-1.496609,2.62424,-1.235236,-1.586792
1036,-0.533866,1.499219,3.027852,-0.483675,-0.482845,0.05255,0.05051,-0.981744,1.38653,-0.940322,-1.273934
2104,-0.233651,0.056903,0.685241,-0.250312,-0.251244,0.495063,0.49329,0.489301,1.949254,0.973784,0.484299


### Training and Testing on Original Metitarski Data (D1)

#### SVM

In [45]:
D1_svm = svm.SVC(C=316, kernel='rbf', gamma=0.08, tol=0.0316)

D1_svm.fit(X_train_D1, y_train_original)

SVC(C=316, gamma=0.08, tol=0.0316)

In [46]:
D1_svm_score = D1_svm.score(X_test_D1, y_test_original)
D1_svm_score

0.5840579710144927

In [47]:
D1_svm_score_D2_data = D1_svm.score(X_test_D2_on_D1, y_test)
D1_svm_score_D2_data

0.30215131737974377

#### K-NN

In [48]:
D1_knn = KNeighborsClassifier(weights='distance', algorithm='ball_tree')

D1_knn.fit(X_train_D1, y_train_original)

KNeighborsClassifier(algorithm='ball_tree', weights='distance')

In [49]:
D1_knn_score = D1_knn.score(X_test_D1, y_test_original)
D1_knn_score

0.5869565217391305

In [50]:
D1_knn_score_D2_data = D1_knn.score(X_test_D2_on_D1, y_test)
D1_knn_score_D2_data

0.3287406333091612

#### Decision Trees

In [51]:
D1_dt = DecisionTreeClassifier(max_depth=17)

D1_dt.fit(X_train_D1, y_train_original)

DecisionTreeClassifier(max_depth=17)

In [52]:
D1_dt_score = D1_dt.score(X_test_D1, y_test_original)
D1_dt_score

0.5855072463768116

In [53]:
D1_knn_score_D2_data = D1_dt.score(X_test_D2_on_D1, y_test)
D1_knn_score_D2_data

0.29804205946337925

#### RF

In [54]:
D1_rf = RandomForestClassifier()
D1_rf.fit(X_train_D1, y_train_original)

RandomForestClassifier()

In [55]:
D1_rf_score = D1_rf.score(X_test_D1, y_test_original)
D1_rf_score

0.5971014492753624

In [56]:
D1_rf_score_D2_data = D1_rf.score(X_test_D2_on_D1, y_test)
D1_rf_score_D2_data

0.32366449117718155

#### MLP

In [57]:
D1_mlp = MLPClassifier(hidden_layer_sizes=(18,), activation='tanh', solver='lbfgs', alpha=0.00005)

D1_mlp.fit(X_train_D1, y_train_original)

MLPClassifier(activation='tanh', alpha=5e-05, hidden_layer_sizes=(18,),
              solver='lbfgs')

In [58]:
D1_mlp_score = D1_mlp.score(X_test_D1, y_test_original)
D1_mlp_score

0.5492753623188406

In [59]:
D1_mlp_score_D2_data = D1_mlp.score(X_test_D2_on_D1, y_test)
D1_mlp_score_D2_data

0.3135122069132221

### Training and Testing on New Metitarski Data (D2)

#### SVM

In [60]:
D2_svm = svm.SVC(C=316, kernel='rbf', gamma=0.08, tol=0.0316)

D2_svm.fit(X_train_D2, y_train)

SVC(C=316, gamma=0.08, tol=0.0316)

In [61]:
D2_svm_score = D2_svm.score(X_test_D2, y_test)
D2_svm_score

0.5644186608653614

In [62]:
D2_svm_score_D1_data = D2_svm.score(X_test_D1_on_D2, y_test_original)
D2_svm_score_D1_data

0.6275362318840579

#### K-NN

In [63]:
D2_knn = KNeighborsClassifier(weights='distance', algorithm='ball_tree')

D2_knn.fit(X_train_D2, y_train)

KNeighborsClassifier(algorithm='ball_tree', weights='distance')

In [64]:
D2_knn_score = D2_knn.score(X_test_D2, y_test)
D2_knn_score

0.5484650713077109

In [65]:
D2_knn_score_D1_data = D2_knn.score(X_test_D1_on_D2, y_test_original)
D2_knn_score_D1_data

0.7

#### Decision Trees

In [66]:
D2_dt = DecisionTreeClassifier(max_depth=17)

D2_dt.fit(X_train_D2, y_train)

DecisionTreeClassifier(max_depth=17)

In [67]:
D2_dt_score = D2_dt.score(X_test_D2, y_test)
D2_dt_score

0.544839255499154

In [68]:
D2_knn_score_D1_data = D2_dt.score(X_test_D1_on_D2, y_test_original)
D2_knn_score_D1_data

0.6623188405797101

#### RF

In [69]:
D2_rf = RandomForestClassifier()
D2_rf.fit(X_train_D2, y_train)

RandomForestClassifier()

In [70]:
D2_rf_score = D2_rf.score(X_test_D2, y_test)
D2_rf_score

0.5419386028523084

In [71]:
D2_rf_score_D1_data = D2_rf.score(X_test_D1_on_D2, y_test_original)
D2_rf_score_D1_data

0.6927536231884058

#### MLP

In [72]:
D2_mlp = MLPClassifier(hidden_layer_sizes=(18,), activation='tanh', solver='lbfgs', alpha=0.00005)

D2_mlp.fit(X_train_D2, y_train)

MLPClassifier(activation='tanh', alpha=5e-05, hidden_layer_sizes=(18,),
              solver='lbfgs')

In [73]:
D2_mlp_score = D2_mlp.score(X_test_D2, y_test)
D2_mlp_score

0.4960116026105874

In [74]:
D2_mlp_score_D1_data = D2_mlp.score(X_test_D1_on_D2, y_test_original)
D2_mlp_score_D1_data

0.49420289855072463

## &#128218; References

1. SVC, see [HERE](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html).
2. K-NN, see [HERE](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)
3. Decision Tree, see [HERE](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier)
4. MLP, see [HERE](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier)
5. RF, see [HERE](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier)