In [11]:
from scipy.stats import pearsonr
import os
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
import nltk, string
import re
import gensim
from gensim import corpora
from nltk.corpus import stopwords
from nltk.stem.porter import *
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics.pairwise import cosine_similarity as cs
from sklearn.metrics.pairwise import manhattan_distances as md
from sklearn.metrics.pairwise import euclidean_distances as ed
from sklearn.metrics import jaccard_similarity_score as jsc
from sklearn.neighbors import DistanceMetric
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
import numpy as np
from nltk.metrics import jaccard_distance
from sklearn.neural_network import MLPRegressor
from nltk.corpus import wordnet as wn
from nltk.tag import PerceptronTagger

import csv
import warnings
from autocorrect import spell
warnings.filterwarnings('ignore')

## Statement
- Use data set and description of task Semantic Textual Similarity in SemEval 2012.
- Implement some approaches to detect paraphrase using sentence similarity metrics.
    + Explore some lexical dimensions. (Only word)
    + Explore the syntactic dimension alone. (Word respect to sentence)
    + Explore the combination of both previous.
- Add new components at your choice (optional).
- Compare and comment the results achieved by these approaches among them and among the official results.
- Send files to raco in IHLT STS Project before the oral presentation:
    + Jupyter notebook: sts-[Student1]-[Student2].ipynb
    + Slides: sts-[Student1]-[Student2].pdf


In [8]:
train_path = '../data/train/'
test_path = '../data/test-gold/'

def load_and_concat(data_path):
    files = os.listdir(data_path)
    all_data = pd.DataFrame(columns=['sentence0','sentence1'])
    all_labels = pd.DataFrame(columns=['labels'])
    for file in files: 
        path = data_path + file
        if 'input' in file:
            print(path)
            fd = pd.read_csv(path, sep='\t', lineterminator='\n', names=['sentence0','sentence1'], header=None, quoting=csv.QUOTE_NONE)
            all_data = all_data.append(fd)
            fd = pd.read_csv(path.replace('input','gs'), sep='\t', lineterminator='\n', names=['labels'], header=None, quoting=csv.QUOTE_NONE)
            all_labels = all_labels.append(fd,ignore_index=True)
    return all_data.reset_index(drop=True), all_labels.reset_index(drop=True)

train_df, train_gs = load_and_concat(train_path)
test_df, test_gs = load_and_concat(test_path)

train_df.shape, train_gs.shape,test_df.shape, test_gs.shape

../data/train/STS.input.MSRpar.txt
../data/train/STS.input.MSRvid.txt
../data/train/STS.input.SMTeuroparl.txt
../data/test-gold/STS.input.MSRpar.txt
../data/test-gold/STS.input.MSRvid.txt
../data/test-gold/STS.input.SMTeuroparl.txt
../data/test-gold/STS.input.surprise.SMTnews.txt
../data/test-gold/STS.input.surprise.OnWN.txt


((2234, 2), (2234, 1), (3108, 2), (3108, 1))

In [173]:
tagger = PerceptronTagger()
tagged = tagger.tag('The quick brown fox  do jumps over the lazy Michael Jackson and Barak Obama in Spain'.split())

In [275]:
train_df.head()

Unnamed: 0,sentence0,sentence1
0,But other sources close to the sale said Viven...,But other sources close to the sale said Viven...
1,Micron has declared its first quarterly profit...,Micron's numbers also marked the first quarter...
2,The fines are part of failed Republican effort...,"Perry said he backs the Senate's efforts, incl..."
3,"The American Anglican Council, which represent...","The American Anglican Council, which represent..."
4,The tech-loaded Nasdaq composite rose 20.96 po...,The technology-laced Nasdaq Composite Index <....


In [276]:
def sentence_lenght(s):
    return len(s.split())

def count_symbols(s):
    count = lambda l1,l2: sum([1 for x in l1 if x in l2])
    return count(s,set(string.punctuation))

def count_shared_words(s0,s1):
    list3 = list(set(s0.lower().split())&set(s1.lower().split()))
    return len(list3)

def count_digits(s):
    numbers = sum(c.isdigit() for c in s)
    return numbers

def _get_word_synonyms(word):
    word_synonyms = []
    for synset in wordnet.synsets(word):
        for lemma in synset.lemma_names():
            word_synonyms.append(lemma)
    return word_synonyms

def synonim_words(a,b):
    return len(set(_get_word_synonyms(a))&set(_get_word_synonyms(b))) > 0

def count_synonims(s0,s1):
    sinonim = 0
    for a in s0.split():
        for b in s1.split():
            sinonim += synonim_words(a,b)
    return sinonim

def count_common_propper_nouns(s0,s1):
    tagger = PerceptronTagger()
    s0_tags = tagger.tag(s0.split())
    s1_tags = tagger.tag(s1.split())
    NNP_s0 = [values[0] for values in s0_tags if values[1] =='NNP']
    NNP_s1 = [values[0] for values in s1_tags if values[1] =='NNP']
    return len(set(NNP_s0)&set(NNP_s1))

def count_nouns(s0):
    tagger = PerceptronTagger()
    s0_tags = tagger.tag(s0.split())
    NN_s0 = [values[0] for values in s0_tags if values[1] =='NN']
    return len(NN_s0)

def count_verbs(s0):
    tagger = PerceptronTagger()
    s0_tags = tagger.tag(s0.split())
    V_s0 = [values[0] for values in s0_tags if values[1] =='VBP']
    return len(V_s0)

def feature_extractor(dataset):
    for column in dataset.columns:
        dataset[column] = dataset[column].apply(auto_spell)
    features = pd.DataFrame(columns=['sentence_0_lengh','sentence_1_lengh',
                                    'number_of_nouns_s0', 'number_of_nouns_s1',
                                    'number_of_verbs_s0', 'number_of_verbs_s1',
                                    'number_of_symbols_s0','number_of_symbols_s1',
                                   'number_of_digits_s0','number_of_digits_1',
                                   'quantity_of_synonims','quantity_of_shared_words', 
                                    'proper_nouns_shared'])
    for index, row in dataset.iterrows():
        s0 = row['sentence0']
        s1 = row['sentence1']
        features.loc[index,'proper_nouns_shared'] = count_common_propper_nouns(s0,s1)
        features.loc[index,'quantity_of_shared_words'] = count_shared_words(s0,s1)
        features.loc[index,'quantity_of_synonims'] = count_synonims(s0,s1)
        features.loc[index,'sentence_0_lengh'] = sentence_lenght(s0)
        features.loc[index,'sentence_1_lengh'] = sentence_lenght(s1)
        features.loc[index,'number_of_nouns_s0'] = count_nouns(s0)
        features.loc[index,'number_of_nouns_s1'] = count_nouns(s1)
        features.loc[index,'number_of_verbs_s0'] = count_verbs(s0)
        features.loc[index,'number_of_verbs_s1'] = count_verbs(s1)
        features.loc[index,'number_of_symbols_s0'] = count_symbols(s0)
        features.loc[index,'number_of_symbols_s1'] = count_symbols(s1)
        features.loc[index,'number_of_digits_s0'] = count_digits(s0)
        features.loc[index,'number_of_digits_1'] = count_digits(s1)
    return features    

In [277]:
train_features = feature_extractor(train_df)

In [278]:
train_features.shape

(2234, 3)

In [279]:
test_features = feature_extractor(test_df)

In [280]:
test_features.shape

(3108, 3)

In [281]:
rfr = RandomForestRegressor(n_jobs=-1,n_estimators=1000)
train_and_test_model(rfr,train_features,test_features)

def print_feature_importance(rfr,train):
    importances=rfr.feature_importances_ ## get the feature importance
    # print("Original ",np.argsort(importances))
    indices = np.argsort(importances)[::-1]
    # print (" importances ",importances)
    # print (" indices ",indices)
    feat_labels = train.columns
    for f in range(train.shape[1]):
        print("%2d) %-*s %f" % (f+1,30,feat_labels[indices[f]],
                                        importances[indices[f]]))
print_feature_importance(rfr,train_features)

train pearson:  0.7605462187289163
test pearson:  0.34148886186361094
 1) quantity_of_shared_words       0.792807
 2) quantity_of_synonims           0.148349
 3) proper_nouns_shared            0.058844


In [222]:
train_df.index

RangeIndex(start=0, stop=2234, step=1)

In [225]:
train_features = train_features.reset_index(drop=True)

In [229]:
pd.concat([train_df,train_features],axis=1).head()

Unnamed: 0,sentence0,sentence1,sentence_0_lengh,sentence_1_lengh,number_of_nouns_s0,number_of_nouns_s1,number_of_verbs_s0,number_of_verbs_s1,number_of_symbols_s0,number_of_symbols_s1,number_of_digits_s0,number_of_digits_1,quantity_of_synonims,quantity_of_shared_words,proper_nouns_shared
0,But other sources close to the sale said Viven...,But other sources close to the sale said Viven...,28.0,23.0,2.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,13.0,16.0,1.0
1,Micron has declared its first quarterly profit...,microns numbers also marked the first quarterl...,10.0,15.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,6.0,0.0
2,The fines are part of failed Republican effort...,Perry said he backs the senates efforts includ...,16.0,16.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,7.0,6.0,0.0
3,The American Anglican Council which represents...,The American Anglican Council which represents...,18.0,26.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,17.0,2.0
4,The tech-loaded Nasdaq composite rose 20.96 po...,The technology-laced Nasdaq Composite Index <....,17.0,14.0,2.0,1.0,0.0,0.0,4.0,9.0,10.0,12.0,6.0,5.0,1.0


In [27]:
train_df.head()

Unnamed: 0,sentence0,sentence1
0,But other sources close to the sale said Viven...,But other sources close to the sale said Viven...
1,Micron has declared its first quarterly profit...,Micron's numbers also marked the first quarter...
2,The fines are part of failed Republican effort...,"Perry said he backs the Senate's efforts, incl..."
3,"The American Anglican Council, which represent...","The American Anglican Council, which represent..."
4,The tech-loaded Nasdaq composite rose 20.96 po...,The technology-laced Nasdaq Composite Index <....


In [26]:
def is_noun(tag):
    return tag in ['NN', 'NNS', 'NNP', 'NNPS']


def is_verb(tag):
    return tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']


def is_adverb(tag):
    return tag in ['RB', 'RBR', 'RBS']


def is_adjective(tag):
    return tag in ['JJ', 'JJR', 'JJS']


def penn_to_wn(tag):
    if is_adjective(tag):
        return wn.ADJ
    elif is_noun(tag):
        return wn.NOUN
    elif is_adverb(tag):
        return wn.ADV
    elif is_verb(tag):
        return wn.VERB
    return wn.NOUN

def lemmatize_text(text):
    tagger = PerceptronTagger()
    lemmatizer = WordNetLemmatizer()
    w_tokenizer = WhitespaceTokenizer()
    s_tokenized = w_tokenizer.tokenize(text)
    s_tagged = tagger.tag(s_tokenized)
    return [lemmatizer.lemmatize(w[0],penn_to_wn(w[1])) for w in s_tagged]

def auto_spell(text):
    corrected_text = ' '.join(spell(word) for word in text.split())
    return corrected_text

def preprocessing(data, return_array = False):
    # todo: better handling of na
    data = data.fillna('')
    for column in data.columns:
        print(column)
        # remove the digits and puntuation
        data[column] = data[column].str.replace('\d+', '')
        # convert to lowercase
        data[column] = data[column].str.replace('\W+', ' ')
        # replace continuous white spaces by a single one
        data[column] = data[column].str.replace('\s+', ' ')
        # words to lower
        data[column] = data[column].str.lower()
        # spell corrector 
        data[column] = data[column].apply(auto_spell)
        # lematize
        data[column] = data[column].apply(lemmatize_text)
        if not return_array:
            data[column] = data[column].str.join(' ')
    return data

In [27]:
text = 'playing videogames with friends at home was nice'
lemmatize_text(text)

['play', 'videogames', 'with', 'friend', 'at', 'home', 'be', 'nice']

In [64]:
train_df = preprocessing(train_df)
train_df.head()

sentence0
sentence1


Unnamed: 0,sentence0,sentence1
0,but other source close to the sale said vivend...,but other source close to the sale said vivend...
1,micron ha declared it first quarterly profit f...,micron s number also marked the first quarterl...
2,the fine are part of failed republican effort ...,perry said he back the senate s effort includi...
3,the american anglican council which represents...,the american anglican council which represents...
4,the tech loaded nasdaq composite rose point to...,the technology laced nasdaq composite index Ix...


In [65]:
test_df = preprocessing(test_df)
test_df.head()

sentence0
sentence1


Unnamed: 0,sentence0,sentence1
0,the problem likely will mean corrective change...,he said the problem need to be corrected befor...
1,the technology laced nasdaq composite index Ix...,the broad standard poor s index six inched up ...
2,it s a huge black eye said publisher arthur oc...,it s a huge black eye arthur sulzberger the ne...
3,sec chairman william Donaldson said there is a...,i think there s a building confidence that the...
4,vivendi share closed percent at euro in paris ...,in new york vivendi share were percent down at


### Lexical 

In [11]:
def lexical_simmilarity(df):
    guess = pd.DataFrame()
    for i in df.index:
        guess.loc[i,'labels'] = 1 - jaccard_distance(set(df.loc[i,'sentence0']), set(df.loc[i,'sentence1']))
    return guess

guess_lex = lexical_simmilarity(train_df)
guess_lex.head()

Unnamed: 0,labels
0,0.533333
1,0.388889
2,0.333333
3,0.607143
4,0.227273


## TfidVectorizer

In [66]:


stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]


def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))


tfv = TfidfVectorizer(max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')
# Fitting TF-IDF to both training and test sets (semi-supervised learning)
tfv.fit(list(train_df['sentence1']) + list(train_df['sentence0']) )

def return_simil(a,b):
    simil = tfv.transform([a,b])
    return ((simil * simil.T).A)[0,1]

def calculate_all_sims(df):
    results = []
    for i in df.values:
        results.append(return_simil(i[0], i[1]))
    return results


all_sims = calculate_all_sims(train_df)

print('train pearson: ', pearsonr(all_sims, train_gs['labels'])[0])

test_sims = calculate_all_sims(test_df)
print('test pearson:', pearsonr(test_sims, test_gs['labels'])[0])

train pearson:  0.5002897027790068
test pearson: 0.5438793732392494


## Neural Networks

In [70]:
def test_model(model,xtrain,xtest):
    train_predicted =  model.predict(xtrain)
    test_predicted =   model.predict(xtest)
    print('train pearson: ', pearsonr(train_predicted, train_gs['labels'])[0])
    print('test pearson: ', pearsonr(test_predicted, test_gs['labels'])[0])

def train_and_test_model(model, train,test):
    model.fit(train,train_gs)
    test_model(model,train,test)

In [71]:
merged_sentences = train_df['sentence0'] + train_df['sentence1']
merged_test = test_df['sentence0'] + test_df['sentence1']

vectorizer = TfidfVectorizer(max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')
merged_train = vectorizer.fit_transform(merged_sentences)
merged_test = vectorizer.transform(merged_test)

In [72]:
model_nn = MLPRegressor(hidden_layer_sizes=(2,),validation_fraction=0.3, alpha=0.3,warm_start=False)
train_and_test_model(model_nn,merged_train,merged_test)

train pearson:  0.9596686530934233
test pearson:  0.4275063427960064


In [74]:
parameters = {'alpha': 10.0 ** -np.arange(0, 5), 'max_iter':[200],
              'hidden_layer_sizes':np.arange(1, 5),'solver': ['lbfgs','adam']}
nn_cv = GridSearchCV(MLPRegressor(), parameters, n_jobs=-1)
nn_cv.fit(merged_train,train_gs.values.ravel())
train_and_test_model(nn_cv,merged_train,merged_test)

train pearson:  0.9954971083991321
test pearson:  0.5296553224941408


## Random Forest

In [83]:
rfr = RandomForestRegressor(n_jobs=-1,n_estimators=500)
train_and_test_model(rfr,merged_train,merged_test)

train pearson:  0.9678412965317825
test pearson:  0.5672059537739355


## SVM

In [77]:
svr = SVR()
train_and_test_model(svr,merged_train,merged_test)

train pearson:  0.670542730484757
test pearson:  0.5375810030024217


# Bag of words method

In [242]:
words = re.compile(r"\w+",re.I)
stemmer = PorterStemmer()

def tokenize_sentences(df):
    sentence_0_tokenized = []
    sentence_1_tokenized = []

    for q in df.sentence0.tolist():
        sentence_0_tokenized.append([stemmer.stem(i.lower()) for i in words.findall(q) if i not in stopword])

    for q in df.sentence1.tolist():
        sentence_1_tokenized.append([stemmer.stem(i.lower()) for i in words.findall(q) if i not in stopword])

    df["sentence_0_tok"] = sentence_0_tokenized
    df["sentence_1_tok"] = sentence_1_tokenized
    
    return df

def train_dictionary(df):
    
    sentences_tokenized = df.sentence_0_tok.tolist() + df.sentence_1_tok.tolist()
    
    dictionary = corpora.Dictionary(sentences_tokenized)
    dictionary.filter_extremes(no_below=5, no_above=0.8)
    dictionary.compactify()
    
    return dictionary
    
tokenized_train = tokenize_sentences(train_df)
dictionary = train_dictionary(tokenized_train)
print ("No of words in the dictionary = %s" %len(dictionary))

tokenized_test = tokenize_sentences(test_df)

def get_vectors(df, dictionary):
    
    sentence0_vec = [dictionary.doc2bow(text) for text in df.sentence_0_tok.tolist()]
    sentence1_vec = [dictionary.doc2bow(text) for text in df.sentence_1_tok.tolist()]
    
    sentence0_csc = gensim.matutils.corpus2csc(sentence0_vec, num_terms=len(dictionary.token2id))
    sentence1_csc = gensim.matutils.corpus2csc(sentence1_vec, num_terms=len(dictionary.token2id))
    
    return sentence0_csc.transpose(),sentence1_csc.transpose()


q1_csc, q2_csc = get_vectors(tokenized_train, dictionary)
q1_csc_test, q2_csc_test = get_vectors(tokenized_test, dictionary)

print (q1_csc.shape)
print (q2_csc.shape)

No of words in the dictionary = 1795
(2234, 1795)
(2234, 1795)


In [243]:
train_bog = np.concatenate((q1_csc.todense(), q2_csc.todense()), axis=1)
test_bog = np.concatenate((q1_csc_test.todense(), q2_csc_test.todense()), axis=1)

In [244]:
train_and_test_model(rfr,train_bog,test_bog)

train pearson:  0.9688838957727722
test pearson:  0.5877396176512876


In [268]:
train_bog_extended = pd.concat([pd.DataFrame(train_bog),train_features[['quantity_of_shared_words']]],axis=1)
test_bog_extended = pd.concat([pd.DataFrame(test_bog),test_features[['quantity_of_shared_words']]],axis=1)

In [269]:
rfr = RandomForestRegressor(n_jobs=-1,n_estimators=1000)
train_and_test_model(rfr,train_bog_extended,test_bog_extended)

train pearson:  0.974709557469246
test pearson:  0.5262242718871012


In [270]:
importances=rfr.feature_importances_ ## get the feature importance
# print("Original ",np.argsort(importances))
indices = np.argsort(importances)[::-1]
# print (" importances ",importances)
# print (" indices ",indices)
feat_labels = train_bog_extended.columns
for f in range(train_bog_extended.shape[1]):
    print("%2d) %-*s %f" % (f+1,30,feat_labels[indices[f]],
                                    importances[indices[f]]))


 1) quantity_of_shared_words       0.460581
 2) quantity_of_synonims           0.049911
 3) 256                            0.019747
 4) 49                             0.015885
 5) 90                             0.009220
 6) 163                            0.007472
 7) 2051                           0.007360
 8) 1885                           0.007120
 9) 143                            0.005915
10) 2246                           0.004802
11) 2676                           0.004709
12) 25                             0.004325
13) 2297                           0.004222
14) 1938                           0.004133
15) 451                            0.004076
16) 3117                           0.003774
17) 1322                           0.003608
18) 1820                           0.003496
19) 1330                           0.003198
20) 1844                           0.003181
21) 1631                           0.003059
22) 2587                           0.002918
23) 8                           

3006) 1734                           0.000002
3007) 855                            0.000002
3008) 331                            0.000002
3009) 1110                           0.000002
3010) 3440                           0.000002
3011) 1264                           0.000002
3012) 2113                           0.000002
3013) 573                            0.000002
3014) 2771                           0.000002
3015) 1565                           0.000002
3016) 2057                           0.000002
3017) 1496                           0.000002
3018) 1213                           0.000002
3019) 1575                           0.000002
3020) 2917                           0.000002
3021) 1001                           0.000002
3022) 3217                           0.000002
3023) 1311                           0.000002
3024) 2679                           0.000002
3025) 1668                           0.000002
3026) 3532                           0.000002
3027) 1902                        

## Try using all the distances

In [29]:
minkowski_dis = DistanceMetric.get_metric('minkowski')
mms_scale_man = MinMaxScaler()
mms_scale_euc = MinMaxScaler()
mms_scale_mink = MinMaxScaler()

def get_similarity_values(q1_csc, q2_csc):
    cosine_sim = []
    manhattan_dis = []
    eucledian_dis = []
    jaccard_dis = []
    minkowsk_dis = []
    
    for i,j in zip(q1_csc, q2_csc):
        sim = cs(i,j)
        cosine_sim.append(sim[0][0])
        sim = md(i,j)
        manhattan_dis.append(sim[0][0])
        sim = ed(i,j)
        eucledian_dis.append(sim[0][0])
        i_ = i.toarray()
        j_ = j.toarray()
        try:
            sim = jsc(i_,j_)
            jaccard_dis.append(sim)
        except:
            jaccard_dis.append(0)
            
        sim = minkowski_dis.pairwise(i_,j_)
        minkowsk_dis.append(sim[0][0])
    
    return cosine_sim, manhattan_dis, eucledian_dis, jaccard_dis, minkowsk_dis    


# cosine_sim = get_cosine_similarity(q1_csc, q2_csc)
cosine_sim, manhattan_dis, eucledian_dis, jaccard_dis, minkowsk_dis = get_similarity_values(q1_csc, q2_csc)

In [32]:
from sklearn.metrics import log_loss

def calculate_logloss(y_true, y_pred):
    loss_cal = log_loss(y_true, y_pred)
    return loss_cal

y_pred_cos, y_pred_man, y_pred_euc, y_pred_jac, y_pred_mink = get_similarity_values(q1_csc_test, q2_csc_test)
predictions = [y_pred_cos, y_pred_man, y_pred_euc, y_pred_jac, y_pred_mink]
for test_predicted in predictions:
    print('test pearson: ', pearsonr(test_predicted, test_gs['labels'])[0])



In [37]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

X_train = pd.DataFrame({"cos" : cosine_sim, "man" : manhattan_dis, "euc" : eucledian_dis, "jac" : jaccard_dis, "min" : minkowsk_dis})

X_test = pd.DataFrame({"cos" : y_pred_cos, "man" : y_pred_man, "euc" : y_pred_euc, "jac" : y_pred_jac, "min" : y_pred_mink})

rfr = RandomForestRegressor()
rfr.fit(X_train,train_gs.values.ravel())

svr = SVR()
svr.fit(X_train,train_gs.values.ravel())

y_rfr_predicted = rfr.predict(X_test)
y_svr_predicted = svr.predict(X_test)

print('test pearson: ', pearsonr(y_rfr_predicted, test_gs['labels'])[0])
print('test pearson: ', pearsonr(y_svr_predicted, test_gs['labels'])[0])


test pearson:  0.5243535486642252
test pearson:  0.5157266341490466
