In [1]:
# utilities
import pandas as pd
import jsonlines
import json
import pickle
from tqdm import tqdm

# model training
#for binary classification
from sklearn.neighbors import KNeighborsClassifier 
#for hyperparams tuning
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
#for the confusion matrix
from sklearn import metrics

#visualization 
import seaborn as sns
import matplotlib.pyplot as plt

Firstly convert the jsonl files into dataframe and save it into .csv file

In [None]:
#DEV SET OK
df = pd.read_json(r'emb_dev.jsonl')
export_csv = df.to_csv(r'emb_dev.csv', index = None, header=True)

In [None]:
# TEST SET OK
df1 = pd.read_json(r'emb_test.jsonl')
export_csv = df1.to_csv(r'emb_test.csv', index = None, header=True)

In [None]:
#TRAIN SET OK
df2 = pd.read_json(r'emb_train.jsonl')
export_csv = df2.to_csv(r'emb_train.csv', index = None, header=True)

In [None]:
# EXPORTING THE .CSV OF THE TRAIN SET
from google.colab import files
files.download("emb_train.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**TEST OF EVERYTHIN WITH SMALL DATA**

In [2]:
#load the data
dev_df = pd.read_csv('emb_dev.csv')#,nrows=100)

In [3]:
test_df = pd.read_csv('emb_test.csv')#,nrows=100)

In [4]:
train_df = pd.read_csv('emb_train.csv')#,nrows=100)

In [6]:
''' Method to modify the dataset taking only the usefull fields 'output' and 'claim_embedding' and convert the output's values into binary values for the classification
  Input: dataframe to modify
  Output: modified dataframe
'''
def modify_df(df):  
    #delete the useless columns
    del df['id']
    del df['input']

    dict_output = {}
    list_values = []
    #extract the items from the output column
    output = df['output']
    for item in output:
        #to convert the string of dict into dict
        dict_output= eval(item)
        # change values to represent labels as 0 ("REFUTES") and 1 ("SUPPORTS") and add them to a list
        for i in dict_output:
            if (dict_output[i] == 'REFUTES'):
                list_values.append(0)
            else:
                list_values.append(1)

    #create a new column of the df with the list of zeros and ones 
    df['labels'] = list_values
    #delete the output column
    del df['output']
    return df 

In [7]:
# obtain the dataset processed 
dev = modify_df(dev_df)

In [15]:
dev.head()

Unnamed: 0,claim_embedding,labels
0,"[0.0312394015491, -0.19459494948387102, -0.207...",1
1,"[0.0312394015491, -0.19459494948387102, -0.207...",0
2,"[0.0312394015491, -0.19459494948387102, -0.207...",0
3,"[0.0312394015491, -0.19459494948387102, -0.207...",1
4,"[0.0312394015491, -0.19459494948387102, -0.207...",1


In [8]:
# obtain the dataset processed 
train = modify_df(train_df)

In [17]:
train.head()

Unnamed: 0,claim_embedding,labels
0,"[-0.23010256886482203, 0.021919053047895, -0.0...",1
1,"[-0.23010256886482203, 0.021919053047895, -0.0...",1
2,"[-0.23010256886482203, 0.021919053047895, -0.0...",1
3,"[-0.23010256886482203, 0.021919053047895, -0.0...",0
4,"[-0.23010256886482203, 0.021919053047895, -0.0...",1


#### For training the classification model, firstly we split the datasets:
- train_set divided in:
    - x_train (embeddings vectors)  
    - y_train (label) 
- dev_set divided in:
    - x_dev (embeddings vectors), 
    - y_dev (label)
- test_set becames x_test (embeddings vectors)

In [9]:
x_train_emb = train['claim_embedding']
#convert string to list 
x_train = [n.strip('][').split(', ') for n in x_train_emb]

y_train = train.labels

In [10]:
x_dev_emb = dev['claim_embedding']
#convert string to list 
x_dev = [n.strip('][').split(', ') for n in x_dev_emb]

y_dev = dev.labels

In [11]:
test_set = test_df.claim_embedding
#convert string to list 
x_test = [n.strip('][').split(', ') for n in test_set]

### Tuning on the dev set
Function for tuning the hyperparameters. It returns the best params to use for the KNN

In [21]:
''' Method for tuning the hyperparameters
  Input: splitted dev set into x and y for the fit of the RandomizedSearchCV
  Output: best params to use for the KNN
'''
def tuning(x_dev,y_dev):
    # define the parameter values that should be searched
    k_range = list(range(1,30,4))
    weight_options = ['uniform', 'distance'] # distance: more weight to more similar values
    algo_options = ['auto', 'ball_tree', 'kd_tree', 'brute']
    distance_options = [1,2,3] # different types of distances (manhattan, euclidean, minkowksi)
    
    # save the "parameter grid"
    param_grid = dict(n_neighbors=k_range, weights=weight_options, algorithm =algo_options,  p=distance_options)
    print('Params grid: ',param_grid) #need this for the report

    #define the classification model chosen
    model = KNeighborsClassifier()
    rand = RandomizedSearchCV(model, param_grid, cv=5, scoring='accuracy', n_iter=10, random_state=5, n_jobs=-1)
    rand.fit(x_dev, y_dev)
    rand.cv_results_
    
    # examine the best model
    print('Rand. Best Score: ', rand.best_score_)
    #save the optimize parameters
    best_param = rand.best_params_
    #return the tuning params for the model
    return best_param 

#### Now using the best parameter obtained by the tuning with the RandomizedSearchCV, we can train the train_set with the KNN

In [22]:
params = tuning(x_dev,y_dev) #dict of best parameters for the classifier

Params grid:  {'n_neighbors': [1, 5, 9, 13, 17, 21, 25, 29], 'weights': ['uniform', 'distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 'p': [1, 2, 3]}


  return f(*args, **kwargs)


Rand. Best Score:  0.5028724627633527


In [23]:
params

{'weights': 'uniform', 'p': 1, 'n_neighbors': 29, 'algorithm': 'kd_tree'}

TRAIN THE MODEL AND SAVE IT

In [25]:
''' Method for training the model using the KNeighborsClassifier() as binary classifier. After the training, the model is saved into a pickle file
  Input: dictionary of the tuned parameters, train set splitted in feature and target (x_train, y_train)
'''
def classifier(params, x_train, y_train):
    #train the model using the optimized params obtained in the tuning
    knn = KNeighborsClassifier(n_neighbors=params['n_neighbors'], weights=params['weights'], algorithm= params['algorithm'], p=params['p'])
    
    #fit the model
    knn.fit(x_train, y_train)
    
    # save the model to disk
    filename = 'KNN.sav'
    pickle.dump(knn, open(filename, 'wb'))

In [26]:
classifier(params, x_train, y_train)

  return f(*args, **kwargs)


MAKE CLASS PREDICTION ON  THE SAVED MODEL 

In [None]:
# load the trained model from disk
knn = pickle.load(open('KNN.sav', 'rb'))

# make class predictions for the dev set, we need this to evaluate the model
y_pred_class = knn.predict(x_dev)

  return f(*args, **kwargs)


EVALUATE THE MODEL

In [None]:
''' Function for the evaluation of the model. Using the metrics function from the library sklearn, here we compute the accuracy_score, the confusion_matrix and the precision and recall of the targets 'SUPPORTS', 'REFUTES'.
  Input: y_dev, predicted class
  Output: accuracy score,confusion matrix,precision score,recall score
'''
def evaluation(y_dev,y_pred_class):    
    # compute the accuracy 
    accuracy = metrics.accuracy_score(y_dev, y_pred_class)
    
    #build the confusion matrix and plot it
    confusion = metrics.confusion_matrix(y_dev, y_pred_class)
                #[row, column]
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    
    # visualize Confusion Matrix
    sns.heatmap(confusion,annot=True,fmt="d") 
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()
    
    # compute the precision and the recall on the label and print them
    target_names = ['SUPPORTS', 'REFUTES']
    print(metrics.classification_report(y_dev, y_pred_class, target_names=target_names))
    
    return accuracy,confusion

In [None]:
results = evaluation(y_dev,y_pred_class)

In [33]:
print('Accuracy value: ' ,results[0])
conf_matrix = results[1]

Accuracy value:  0.44


In [34]:
conf_matrix

array([[ 0, 56],
       [ 0, 44]])

**For the chosen classifier, get predictions for the official test set associated to the best hyperparameter configuration.**

In [35]:
# get predictions for the official test set 
pred_test = knn.predict(x_test)
#convert the output into a list to save the result into a dict in order to create the jsonl file
pred = pred_test.tolist()
dic = {}
dic['prediction'] = pred

  X = check_array(X, accept_sparse='csr')
  X = check_array(X, accept_sparse='csr')


In [43]:
pred = pred_test.tolist()

In [60]:
pred_conv = []
for p in pred:
    if p==1:
    p = 'SUPPORTS'
    else:
    p = 'REFUTES'
    pred_conv.append(p)

In [69]:
my_dict={}
for p in pred_conv:
    my_dict['answer']=p

In [70]:
my_dict

{'answer': 'SUPPORTS'}

In [51]:
with open('test_pred_1.json', 'w') as fp:
    json.dump(dictionary, fp)

**Put the predictions in a file named “test_set_pred_1.jsonl”**

In [None]:
# save the prediction into a file
with jsonlines.open('test_set_pred_1.jsonl', mode = 'w') as writer:
    writer.write(dic)