# Benchmarking a TFIDF Model
> Sample code that builds TF-IDF features from sequence fragments, then trains and evaluates a random-forest regressor on them

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

# Load the mRFP dataset; the path is relative to the current working directory.
df_update = pd.read_csv("dataset_mRFP.csv")
# Values of the 'Sequence' column as a plain Python list.
sequence = df_update['Sequence'].values.tolist()
# Keep the preview as the cell's last expression: Jupyter only renders the
# value of the final expression, so a mid-cell `head()` is silently discarded.
df_update.head()

In [None]:
from utils.fragmentation import KmerFragmenter

## Creating Fragments
> Use helper functions provided in `utils`

In [None]:
import sys

# Put the parent directory on the import path (presumably where the `utils`
# package lives — confirm against the repository layout). The membership check
# keeps the cell idempotent: re-running it does not pile up duplicate entries.
if ".." not in sys.path:
    sys.path.append("..")

In [None]:
import numpy as np
from utils.fragmentation import KmerFragmenter

# Pull the model inputs ('Sequence') and the regression targets ('Value')
# out of the dataset as plain Python lists, in that order.
sequences, y = (
    df_update[column].values.tolist() for column in ('Sequence', 'Value')
)

# Split every sequence into fragments with the project helper.
# NOTE(review): the positional 3 and 1 look like fragment length and step
# size — confirm against KmerFragmenter.split_words.
fragmenter = KmerFragmenter()
fragments = fragmenter.split_words(sequences, 3, 1)

In [None]:
import os
# Show the current working directory; the relative paths used in this notebook
# ("dataset_mRFP.csv", "..") are resolved against it.
os.getcwd()

## TFIDF
> Uses `sklearn` code in the background

In [None]:
from utils.vectorizer import Vectorizer

# Build the TF-IDF representation of the fragments with the project helper.
# `create_tfidf_stack` returns three values; only `vector_stack_tfidf` is used
# further down in this notebook, as the feature matrix for tuning and
# benchmarking.
# NOTE(review): `mod` and `docs_tfidf` are presumably the fitted vectorizer and
# the transformed documents — confirm against utils.vectorizer.
vector_space_embedder = Vectorizer()
(
    mod,
    docs_tfidf,
    vector_stack_tfidf,
) = vector_space_embedder.create_tfidf_stack(fragments)

## Tuning setup
> Perform a grid search to find the best random-forest hyperparameter settings

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

def tune_forest(y, vector_stack, *, param_grid=None, cv=5, n_jobs=-1,
                random_state=30):
    """Grid-search random-forest hyperparameters and return the best setting.

    Args:
        y: Regression targets, passed to ``GridSearchCV.fit`` as ``y``.
        vector_stack: Feature matrix, passed to ``GridSearchCV.fit`` as ``X``.
        param_grid: Optional mapping of ``RandomForestRegressor`` parameter
            names to lists of candidate values. When ``None`` the default
            grid defined below is searched.
        cv: Cross-validation setting forwarded to ``GridSearchCV``.
        n_jobs: Parallelism setting forwarded to ``GridSearchCV``.
        random_state: Seed of the forest used during the search.

    Returns:
        The ``best_params_`` attribute of the fitted search. It only holds
        the searched parameters, so ``random_state`` is not part of it unless
        it is listed in ``param_grid``.
    """
    if param_grid is None:
        # NOTE(review): scikit-learn documents the "poisson" criterion as
        # requiring non-negative targets — confirm `y` satisfies that,
        # otherwise those candidates cannot be fitted.
        param_grid = {
            "n_estimators": [100, 150],
            "criterion": ["squared_error", "absolute_error", "poisson"],
            "max_features": [5, 10],
            "max_depth": [5, 10, 20],
            "min_samples_split": [2, 4, 10],
            "bootstrap": [True, False],
        }

    model = RandomForestRegressor(random_state=random_state)
    # No `scoring` is given, so candidates are ranked by the estimator's own
    # `score` method.
    grid_search = GridSearchCV(model, param_grid, n_jobs=n_jobs, cv=cv)
    grid_search.fit(vector_stack, y)
    return grid_search.best_params_

In [None]:
# Run the hyperparameter search on the TF-IDF features; the result is the
# best parameter setting found by `tune_forest`.
tune_tfidf = tune_forest(y=y, vector_stack=vector_stack_tfidf)

In [None]:
# Display the selected hyperparameters.
tune_tfidf

## Benchmark system

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn import preprocessing
from scipy import stats

def train_test_acc(X_train, y_train, model, X_test=None, y_test=None):
    """Fit ``model`` on the training data and score it on the test data.

    Args:
        X_train: Training features passed to ``model.fit``.
        y_train: Training targets passed to ``model.fit``.
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in
            place.
        X_test: Test features. When ``None``, the notebook-level ``X_test``
            variable is used (the behaviour existing callers rely on).
        y_test: Test targets. When ``None``, the notebook-level ``y_test``
            variable is used.

    Returns:
        tuple: ``(acc, spr)`` where ``acc`` is the mean squared error of the
        predictions (lower is better, despite the name) and ``spr`` is the
        Spearman rank correlation between ``y_test`` and the predictions.

    Raises:
        NameError: If a test set is neither passed in nor defined globally.
    """
    if X_test is None or y_test is None:
        # Backward-compatible fallback: older callers define X_test / y_test
        # as global variables instead of passing them in.
        scope = globals()
        try:
            X_test = scope["X_test"] if X_test is None else X_test
            y_test = scope["y_test"] if y_test is None else y_test
        except KeyError as missing:
            raise NameError(
                f"name {missing.args[0]!r} is not defined"
            ) from None

    model.fit(X_train, y_train)
    pred_values = model.predict(X_test)

    # spearmanr returns (correlation, p-value); only the correlation is kept.
    spr = stats.spearmanr(y_test, pred_values)[0]
    acc = mean_squared_error(y_test, pred_values)
    return acc, spr

## Experiment
> Run multiple experiments with different seeds

In [None]:
# Forest configured with the tuned hyperparameters.
# NOTE(review): no random_state is passed here, so the forest itself is not
# seeded and results are not exactly reproducible — confirm that is intended.
model_tfidf = RandomForestRegressor(**tune_tfidf)

splt = 0.25  # fraction of the samples held out for testing in every run
k = 100      # number of repeated runs
row_list = []
for run in tqdm(range(k)):
    # Seed the split with the run index so every run evaluates a different
    # train/test partition. Seeding with the constant `k` would produce the
    # identical split in all runs.
    # X_test / y_test are deliberately notebook-level names: train_test_acc
    # may read them from the global scope.
    X_train, X_test, y_train, y_test = train_test_split(
        vector_stack_tfidf, y, test_size=splt, random_state=run
    )
    acc, spr = train_test_acc(X_train, y_train, model=model_tfidf)
    new_row = {'name': 'rna_tfidf', 'acc': acc, 'spr': spr, 'k': run}
    row_list.append(new_row)

# One row per run. The explicit column list keeps the schema stable even when
# no run was recorded.
df_marks = pd.DataFrame(row_list, columns=['name', 'acc', 'spr', 'k'])
# Wide view of the 'acc' values (mean squared error): one row per run, one
# column per model name.
df = df_marks.pivot(index='k', columns='name', values='acc')
df.head()

In [None]:
# Wide view of the per-run Spearman correlation ('spr'): one row per run,
# one column per model name. Despite the `df_acc` name, this is not an
# accuracy.
df_acc = df_marks.pivot(values='spr', columns='name', index='k')
df_acc.head()

## Plot the data

In [None]:
import matplotlib.pyplot as plt
# Render figures at 300 dpi from here on (the rcParams change is global).
plt.rcParams["figure.dpi"] = 300
# Kernel density estimate of the per-run Spearman correlations; the y-axis is
# fixed to the range 0-60.
df_acc.plot.kde(ylim=(0, 60), figsize=(2.5, 1.5))

In [None]:
# Summary statistics of the per-run Spearman correlations.
df_acc.describe()

In [None]:
# Wide view of the per-run 'acc' values, which hold the mean squared error:
# one row per run, one column per model name.
df_loss = df_marks.pivot(values='acc', columns='name', index='k')
# Kernel density estimate of the errors; the trailing semicolon suppresses the
# textual repr of the returned axes in the notebook output.
df_loss.plot.kde(ylim=(0, 200), subplots=True, figsize=(5, 3));

In [None]:
# Summary statistics of the per-run mean squared error.
df_loss.describe()