<a href="https://colab.research.google.com/github/SRI-CSL/CoProver/blob/main/src/notebooks/220629_metitarski/coprover_metitarski.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CoProver 

## &#10024; `MetiTarski`- problem

**Copyright 2022 SRI International.**

## &#9776; Import `needed` libraries

In [None]:
import os
import sys
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from matplotlib import rc
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics

In [None]:
try:
    from google.colab import data_table
    data_table.disable_dataframe_formatter()
    
    from google.colab import output
    output.enable_custom_widget_manager()
except Exception:
    print("Launched notebook locally")

In [None]:
from typing import List, Any, Dict

In [None]:
# install gdown library for .csv files download
try:
    import gdown
except ImportError:
    !pip install gdown

## &#9997; Set `needed` configuration

In [None]:
warnings.filterwarnings('ignore')

In [None]:
# origins of the .csv data files used
# True: originating within the signal-public GitHub repository
# False: need to be downloaded from a provided URL, especially useful if running on Colab
IS_LOCAL_FILE = False

In [None]:
# dictionary of files for this notebook to work
# the dictionary is composed of (filename, Google ID) key-value pairs
DATASET_DICT = {
    'metitarski_dataset_v1.csv': '1jOqdNyfTr_8rPtYCZ-4gNifOJWOCtjPj',
    'metitarski_dataset_v2.csv': '1RXJI-1ZKnKs6RhmPlGmM111XHzcfyCDy'
    }

## &#9881; Define `needed` functions

In [None]:
def path_exists(input_path: str) -> bool:
    return os.path.exists(input_path)

In [None]:
def check_file_status(input_path: str):
    if path_exists(input_path=input_path):
        print(f"- File {input_path.split('/')[-1]} exists locally at {input_path}!")
    else:
        if IS_LOCAL_FILE:
            print("- Something went wrong with the download. Please try again!")
        else:
            print(f"- IS_LOCAL_FILE is set to {IS_LOCAL_FILE}. The file is accessed via a public GitHub link!")

In [None]:
def download_dataset_from_google_drive(google_file_id: str, output_file_name: str, quiet_download: bool) -> str:
    file_path = f'./{output_file_name}'

    if not os.path.exists(file_path):
        gdown.download(id=google_file_id, output=output_file_name, quiet=quiet_download)
    else:
        print(f"{output_file_name} already exists!")
        
    return file_path

In [None]:
def get_dataset(dataset_name: str, is_local_file: bool) -> str:
    file_path = f'./{dataset_name}'

    if is_local_file:
        file_path = f'https://raw.githubusercontent.com/SRI-CSL/CoProver/main/data/{dataset_name}'
    else:
        if dataset_name in DATASET_DICT:
            file_path = download_dataset_from_google_drive(google_file_id=DATASET_DICT[dataset_name], output_file_name=dataset_name, quiet_download=False)
        else:
            print(f"{dataset_name} is not present in dataset dictionary! Please ensure the file name is correct!")
            return

    return file_path

## &#9749; Download datasets

In [None]:
# metitarski_dataset_v2.csv
path_metitarski = get_dataset(dataset_name='metitarski_dataset_v2.csv', is_local_file=IS_LOCAL_FILE)

Downloading...
From: https://drive.google.com/uc?id=1RXJI-1ZKnKs6RhmPlGmM111XHzcfyCDy
To: /content/metitarski_dataset_v2.csv
100%|██████████| 6.25M/6.25M [00:00<00:00, 66.8MB/s]


In [None]:
check_file_status(input_path=path_metitarski)

- File metitarski_dataset_v2.csv exists locally at ./metitarski_dataset_v2.csv!


In [None]:
# metitarski_dataset_v1.csv
path_metitarski_original = get_dataset(dataset_name='metitarski_dataset_v1.csv', is_local_file=IS_LOCAL_FILE)

Downloading...
From: https://drive.google.com/uc?id=1jOqdNyfTr_8rPtYCZ-4gNifOJWOCtjPj
To: /content/metitarski_dataset_v1.csv
100%|██████████| 953k/953k [00:00<00:00, 77.2MB/s]


In [None]:
check_file_status(input_path=path_metitarski_original)

- File metitarski_dataset_v1.csv exists locally at ./metitarski_dataset_v1.csv!


## &#128722; Load data

In [None]:
df_metitarski = pd.read_csv(path_metitarski, sep='\t')

In [None]:
df_metitarski.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41369 entries, 0 to 41368
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      41369 non-null  int64  
 1   file_id         41369 non-null  int64  
 2   input_file      41369 non-null  object 
 3   label_file      41369 non-null  object 
 4   nr_polynomials  41369 non-null  int64  
 5   max_x1          41369 non-null  int64  
 6   max_x2          41369 non-null  int64  
 7   max_x3          41369 non-null  int64  
 8   prop_x1         41369 non-null  float64
 9   prop_x2         41369 non-null  float64
 10  prop_x3         41369 non-null  float64
 11  prop_mon_x1     41369 non-null  float64
 12  prop_mon_x2     41369 non-null  float64
 13  prop_mon_x3     41369 non-null  float64
 14  label           41369 non-null  int64  
dtypes: float64(6), int64(7), object(2)
memory usage: 4.7+ MB


In [None]:
df_metitarski.head()

Unnamed: 0.1,Unnamed: 0,file_id,input_file,label_file,nr_polynomials,max_x1,max_x2,max_x3,prop_x1,prop_x2,prop_x3,prop_mon_x1,prop_mon_x2,prop_mon_x3,label
0,0,415,poly415-perm0.txt.ml,comp_times415-perm0.txt,10,2,2,1,0.5,0.5,0.5,0.333333,0.333333,0.333333,4
1,1,2230,poly2230-perm2.txt.ml,comp_times2230-perm2.txt,6,1,2,2,0.333333,0.666667,0.5,0.269231,0.384615,0.346154,0
2,2,6506,poly6506-perm3.txt.ml,comp_times6506-perm3.txt,6,16,1,1,0.5,0.333333,0.5,0.666667,0.133333,0.2,5
3,3,3998,poly3998-perm5.txt.ml,comp_times3998-perm5.txt,9,3,3,3,0.555556,0.555556,0.555556,0.333333,0.333333,0.333333,1
4,4,3730,poly3730-perm2.txt.ml,comp_times3730-perm2.txt,14,1,9,3,0.214286,0.785714,0.142857,0.1875,0.6875,0.125,3


In [None]:
df_metitarski.label.unique()

array([4, 0, 5, 1, 3, 2])

In [None]:
df_metitarski_original = pd.read_csv(path_metitarski_original, sep='\t')

In [None]:
df_metitarski_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6895 entries, 0 to 6894
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      6895 non-null   int64  
 1   file_id         6895 non-null   int64  
 2   input_file      6895 non-null   object 
 3   label_file      6895 non-null   object 
 4   nr_polynomials  6895 non-null   int64  
 5   max_x1          6895 non-null   int64  
 6   max_x2          6895 non-null   int64  
 7   max_x3          6895 non-null   int64  
 8   prop_x1         6895 non-null   float64
 9   prop_x2         6895 non-null   float64
 10  prop_x3         6895 non-null   float64
 11  prop_mon_x1     6895 non-null   float64
 12  prop_mon_x2     6895 non-null   float64
 13  prop_mon_x3     6895 non-null   float64
 14  label           6895 non-null   int64  
dtypes: float64(6), int64(7), object(2)
memory usage: 808.1+ KB


In [None]:
df_metitarski_original.head()

Unnamed: 0.1,Unnamed: 0,file_id,input_file,label_file,nr_polynomials,max_x1,max_x2,max_x3,prop_x1,prop_x2,prop_x3,prop_mon_x1,prop_mon_x2,prop_mon_x3,label
0,0,3940,poly3940.txt.ml,comp_times3940.txt,4,1,1,1,0.5,0.5,0.25,0.4,0.4,0.2,0
1,1,5554,poly5554.txt.ml,comp_times5554.txt,12,10,9,1,0.666667,0.666667,0.25,0.418605,0.511628,0.069767,4
2,2,4063,poly4063.txt.ml,comp_times4063.txt,9,1,1,1,0.444444,0.444444,0.555556,0.307692,0.307692,0.384615,5
3,3,4732,poly4732.txt.ml,comp_times4732.txt,7,4,2,1,0.428571,0.285714,0.428571,0.5,0.2,0.3,2
4,4,5205,poly5205.txt.ml,comp_times5205.txt,6,12,6,1,0.5,0.333333,0.5,0.44,0.44,0.12,5


In [None]:
df_metitarski_original.label.unique()

array([0, 4, 5, 2, 1, 3])

## &#129504; MetiTarski RTF

In [None]:
FEATURE_COLUMNS = ['nr_polynomials', 'max_x1', 'max_x2', 'max_x3', 'prop_x1', 'prop_x2', 'prop_x3', 'prop_mon_x1', 'prop_mon_x2', 'prop_mon_x3']

FEATURE_COLUMNS

['nr_polynomials',
 'max_x1',
 'max_x2',
 'max_x3',
 'prop_x1',
 'prop_x2',
 'prop_x3',
 'prop_mon_x1',
 'prop_mon_x2',
 'prop_mon_x3']

In [None]:
def rescale_data(df: pd.DataFrame) -> pd.DataFrame:
    """Rescale all features using MinMaxScaler() to same scale, between 0 and 1."""
    
    scaler = StandardScaler()
    scaler = scaler.fit(df)

    df_scaled = pd.DataFrame(scaler.transform(df), index=df.index, columns=df.columns)

    return df_scaled

In [None]:
df_features = df_metitarski[FEATURE_COLUMNS].copy().reset_index()

In [None]:
df_features.head()

Unnamed: 0,index,nr_polynomials,max_x1,max_x2,max_x3,prop_x1,prop_x2,prop_x3,prop_mon_x1,prop_mon_x2,prop_mon_x3
0,0,10,2,2,1,0.5,0.5,0.5,0.333333,0.333333,0.333333
1,1,6,1,2,2,0.333333,0.666667,0.5,0.269231,0.384615,0.346154
2,2,6,16,1,1,0.5,0.333333,0.5,0.666667,0.133333,0.2
3,3,9,3,3,3,0.555556,0.555556,0.555556,0.333333,0.333333,0.333333
4,4,14,1,9,3,0.214286,0.785714,0.142857,0.1875,0.6875,0.125


In [None]:
df_features.drop(['index'], axis=1, inplace=True)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_features, df_metitarski.label, test_size=0.1)

In [None]:
X_train = rescale_data(X_train)

In [None]:
X_train.head()

Unnamed: 0,nr_polynomials,max_x1,max_x2,max_x3,prop_x1,prop_x2,prop_x3,prop_mon_x1,prop_mon_x2,prop_mon_x3
6841,-1.43414,-0.485038,-0.485103,-0.483329,1.076331,1.083189,1.082436,-0.002511,0.002023,0.000489
29783,0.665416,-0.485038,1.151779,0.680362,-1.19223,0.670078,1.289016,-1.618603,1.12731,0.491841
6032,-0.234394,-0.485038,-0.251263,-0.483329,-0.39676,-0.392208,0.492209,-0.56463,0.002023,0.562035
2204,1.265289,0.215165,-0.017422,-0.250591,0.560749,1.599578,-0.466909,0.141009,0.145677,-0.286258
32826,-0.53433,-0.485038,-0.485103,-0.483329,-0.985997,1.083189,-0.983358,-0.56463,1.12731,-0.561057


In [None]:
X_test = rescale_data(X_test)

In [None]:
X_test.head()

Unnamed: 0,nr_polynomials,max_x1,max_x2,max_x3,prop_x1,prop_x2,prop_x3,prop_mon_x1,prop_mon_x2,prop_mon_x3
39260,1.564509,-0.473736,-0.473567,-0.488729,1.291472,-0.669567,0.759017,0.467489,-0.735087,0.276039
13994,-0.528659,-0.473736,1.094646,-0.488729,0.088895,0.041286,-0.977243,-0.382291,1.387281,-1.025423
38448,-0.229635,-0.473736,-0.473567,-0.488729,0.535566,0.481338,-0.390973,0.224695,0.1831,-0.41297
11926,-1.425731,-0.473736,0.198524,0.211365,1.131129,-0.985501,-0.977243,-0.534038,0.258362,0.276039
23564,-1.425731,-0.246138,0.198524,-0.488729,-0.953339,-0.985501,1.0747,-0.86788,0.423936,0.444464


### Original Metitarski Data

In [None]:
df_features_original = df_metitarski_original[FEATURE_COLUMNS].copy().reset_index()

In [None]:
df_features_original.drop(['index'], axis=1, inplace=True)

In [None]:
X_train_original, X_test_original, y_train_original, y_test_original = train_test_split(df_features_original, df_metitarski_original.label, test_size=0.1)

In [None]:
X_train_original = rescale_data(X_train_original)

In [None]:
X_test_original = rescale_data(X_test_original)

In [None]:
### SVM on Original Metitarski Data
clf_original = svm.SVC(C=316, kernel='rbf', gamma=0.08, tol=0.0316)

#Train the model using the training sets
clf_original.fit(X_train_original, y_train_original)

#Predict the response for test dataset
y_pred_original = clf_original.predict(X_test_original)

In [None]:
print("Accuracy on original test set:",metrics.accuracy_score(y_test_original, y_pred_original))

Accuracy on original test set: 0.5840579710144927


In [None]:
y_pred_train_original = clf_original.predict(X_train_original)

In [None]:
print("Accuracy on original train set:",metrics.accuracy_score(y_train_original, y_pred_train_original))

Accuracy on original train set: 0.6643029814665592


In [None]:
y_pred_new = clf_original.predict(X_test)

In [None]:
print("Accuracy on new test set:",metrics.accuracy_score(y_test, y_pred_new))

Accuracy on new test set: 0.20425429054870678


In [None]:
y_pred_train_new = clf_original.predict(X_train)

In [None]:
print("Accuracy on new train set:",metrics.accuracy_score(y_train, y_pred_train_new))

Accuracy on new train set: 0.20820799312419425


### Support Vector Classifier (SVC)

In [None]:
#Create a svm Classifier
# kernel: radial basis function
clf = svm.SVC(C=316, kernel='rbf', gamma=0.08, tol=0.0316)

#Train the model using the training sets
clf.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [None]:
y_pred

array([2, 4, 4, ..., 2, 2, 5])

In [None]:
y_test.value_counts()

0    928
1    733
2    687
3    654
5    570
4    565
Name: label, dtype: int64

In [None]:
y_train.value_counts()

0    7577
1    6664
3    6061
2    6044
4    5623
5    5263
Name: label, dtype: int64

In [None]:
y_test[0:5]

39260    2
13994    1
38448    0
11926    0
23564    5
Name: label, dtype: int64

In [None]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.5450809765530578


### K-Nearest Neighbours

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn_clf = KNeighborsClassifier(weights='distance', algorithm='ball_tree')

In [None]:
knn_clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='ball_tree', weights='distance')

In [None]:
y_pred = knn_clf.predict(X_test)

In [None]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.5523326081701716


### Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dtc_clf = DecisionTreeClassifier(max_depth=17)

In [None]:
dtc_clf.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=17)

In [None]:
y_pred = dtc_clf.predict(X_test)

In [None]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.5242929659173314


In [None]:
score = dtc_clf.score(X_test, y_test)
score

0.5242929659173314

### MLP

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
mlp_clf = MLPClassifier(hidden_layer_sizes=(18,), activation='tanh', solver='lbfgs', alpha=0.00005)

In [None]:
mlp_clf.fit(X_train, y_train)

MLPClassifier(activation='tanh', alpha=5e-05, hidden_layer_sizes=(18,),
              solver='lbfgs')

In [None]:
y_pred = mlp_clf.predict(X_test)

In [None]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.4757070340826686


In [None]:
score = mlp_clf.score(X_test, y_test)
score

0.4757070340826686

### Random Forest (RF)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf_clf = RandomForestClassifier()

In [None]:
rf_clf.fit(X_train, y_train)

RandomForestClassifier()

In [None]:
y_pred = rf_clf.predict(X_test)

In [None]:
score = rf_clf.score(X_test, y_test)
score

0.5363790186125211

## &#128218; References

1. SVC, see [HERE](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html).
2. K-NN, see [HERE](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)
3. Decision Tree, see [HERE](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier)
4. MLP, see [HERE](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier)
5. RF, see [HERE](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier)