# Snippet Classification with Random Forest Classifier

In [2]:
import os
import pickle

import numpy as np
import pandas as pd
from tqdm import tqdm

In [3]:
from sklearn.svm import SVC
from sklearn import linear_model
from sklearn import set_config
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, accuracy_score

In [4]:
# Ask scikit-learn to render estimators/pipelines as diagrams in notebook output.
set_config(display='diagram')

## Load snippet dataset

In [5]:
# Read the snippet feature table from CSV; the bare name on the last line displays it.
snippet_feat_OG = pd.read_csv('Features/all_snippet_feat.csv')
snippet_feat_OG

Unnamed: 0.1,Unnamed: 0,snippet,clicked,n_token,n_sent,to_NoPhr_C,as_NoPhr_C,at_NoPhr_C,ra_NoVeP_C,ra_NoSuP_C,...,as_FTree_C,at_FTree_C,TokSenM_S,TokSenS_S,TokSenL_S,as_Token_C,as_Sylla_C,at_Sylla_C,as_Chara_C,at_Chara_C
0,0,Use incoming webhooks to post messages to Matt...,1,28,6,10.0,1.666667,0.357143,1.111111,10.00,...,5.333333,1.142857,168.0,12.961481,1.859739,4.666667,8.000000,1.714286,36.000000,7.714286
1,1,"Apr 22, 2015 ... Today, Hello is rolling out f...",0,20,3,11.0,3.666667,0.550000,2.200000,0.00,...,8.333333,1.250000,60.0,7.745967,2.726833,6.666667,9.333333,1.400000,45.000000,6.750000
2,2,HEC tokens are sent in the headers of incoming...,0,38,5,14.0,2.800000,0.368421,1.555556,0.00,...,8.600000,1.131579,190.0,13.784049,2.260159,7.600000,12.200000,1.605263,56.400000,7.421053
3,3,Most college scholarships target incoming fres...,0,27,3,8.0,2.666667,0.296296,1.000000,0.00,...,11.000000,1.222222,81.0,9.000000,3.000000,9.000000,18.000000,2.000000,70.000000,7.777778
4,4,"It's up to you, whatever you find easier is fi...",0,24,5,9.0,1.800000,0.375000,0.818182,2.25,...,5.400000,1.125000,120.0,10.954451,1.974636,4.800000,7.200000,1.500000,41.600000,8.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10287,10287,Department Offices that Support STEM Open ED F...,1,20,4,12.0,3.000000,0.600000,4.000000,6.00,...,5.000000,1.000000,80.0,8.944272,2.160964,5.000000,10.000000,2.000000,32.000000,6.400000
10288,10288,STEM Kitchen & Garden is a farm-to-table resta...,0,20,2,9.0,4.500000,0.450000,3.000000,9.00,...,11.500000,1.150000,40.0,6.324555,4.321928,10.000000,16.500000,1.650000,63.000000,6.300000
10289,10289,Kids Definition of stem 1 : to develop as a c...,0,18,3,9.0,3.000000,0.500000,1.800000,0.00,...,8.000000,1.333333,54.0,7.348469,2.630930,6.000000,9.333333,1.555556,43.666667,7.277778
10290,10290,Stem Holdings has positioned itself as a leade...,0,18,2,10.0,5.000000,0.555556,2.000000,0.00,...,11.500000,1.277778,36.0,6.000000,4.169925,9.000000,16.500000,1.833333,67.000000,7.444444


We work on a copy of the dataset so that the original version stays intact: we drop the columns we don't need and rename the `clicked` column to `class`.

In [13]:
# Build the modelling frame without touching `snippet_feat_OG`: `drop` and
# `rename` each return a new DataFrame, so the loaded frame stays as it was.
all_snippet_feat = (
    snippet_feat_OG
    .drop(columns=['Unnamed: 0', 'snippet'])
    .rename(columns={'clicked': 'class'})
)


In [14]:
# Display the cleaned frame to confirm the drop and the rename took effect.
all_snippet_feat

Unnamed: 0,class,n_token,n_sent,to_NoPhr_C,as_NoPhr_C,at_NoPhr_C,ra_NoVeP_C,ra_NoSuP_C,ra_NoPrP_C,ra_NoAjP_C,...,as_FTree_C,at_FTree_C,TokSenM_S,TokSenS_S,TokSenL_S,as_Token_C,as_Sylla_C,at_Sylla_C,as_Chara_C,at_Chara_C
0,1,28,6,10.0,1.666667,0.357143,1.111111,10.00,5.00,5.0,...,5.333333,1.142857,168.0,12.961481,1.859739,4.666667,8.000000,1.714286,36.000000,7.714286
1,0,20,3,11.0,3.666667,0.550000,2.200000,0.00,2.75,0.0,...,8.333333,1.250000,60.0,7.745967,2.726833,6.666667,9.333333,1.400000,45.000000,6.750000
2,0,38,5,14.0,2.800000,0.368421,1.555556,0.00,2.00,0.0,...,8.600000,1.131579,190.0,13.784049,2.260159,7.600000,12.200000,1.605263,56.400000,7.421053
3,0,27,3,8.0,2.666667,0.296296,1.000000,0.00,4.00,8.0,...,11.000000,1.222222,81.0,9.000000,3.000000,9.000000,18.000000,2.000000,70.000000,7.777778
4,0,24,5,9.0,1.800000,0.375000,0.818182,2.25,1.80,4.5,...,5.400000,1.125000,120.0,10.954451,1.974636,4.800000,7.200000,1.500000,41.600000,8.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10287,1,20,4,12.0,3.000000,0.600000,4.000000,6.00,12.00,0.0,...,5.000000,1.000000,80.0,8.944272,2.160964,5.000000,10.000000,2.000000,32.000000,6.400000
10288,0,20,2,9.0,4.500000,0.450000,3.000000,9.00,3.00,9.0,...,11.500000,1.150000,40.0,6.324555,4.321928,10.000000,16.500000,1.650000,63.000000,6.300000
10289,0,18,3,9.0,3.000000,0.500000,1.800000,0.00,1.50,0.0,...,8.000000,1.333333,54.0,7.348469,2.630930,6.000000,9.333333,1.555556,43.666667,7.277778
10290,0,18,2,10.0,5.000000,0.555556,2.000000,0.00,2.50,0.0,...,11.500000,1.277778,36.0,6.000000,4.169925,9.000000,16.500000,1.833333,67.000000,7.444444


## Split train, validate and test

In [15]:
# Separate the target column (`class`) from the feature columns.
y = all_snippet_feat['class']
X = all_snippet_feat.drop(columns='class')

# Hold out 20% of the rows as the test set, stratified on the label and seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [16]:
# Shapes of the training features/labels after holding out the test set.
X_train.shape, y_train.shape

((8233, 157), (8233,))

In [17]:
# Carve a validation set (20%, stratified, seeded) out of the training portion.
# NOTE: this rebinds `X_train`/`y_train`, so re-running this cell on its own
# splits the already-reduced training set again; re-run the test-split cell first.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=42,
)


In [18]:
# Shapes of the training features/labels after carving out the validation set.
X_train.shape, y_train.shape

((6586, 157), (6586,))

In [19]:
# Shapes of the validation features/labels.
X_val.shape, y_val.shape

((1647, 157), (1647,))

## Initiate the model - RF

In [20]:
# Create a pipeline
%time
rfc_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(random_state=42, n_jobs=-1))
])

rfc_pipeline.fit(X_train, y_train)

CPU times: user 5 µs, sys: 2 µs, total: 7 µs
Wall time: 17.2 µs


In [21]:
# Fit the model
%time
pred_rfc = rfc_pipeline.predict(X_test)
cm_rf = confusion_matrix(y_true=y_test, y_pred=pred_rfc, labels= rfc_pipeline.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_rf, display_labels=rfc_pipeline.classes_)

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 17.2 µs


In [22]:
# Unpack the 2x2 confusion matrix in row-major order:
# [0,0] -> TN, [0,1] -> FP, [1,0] -> FN, [1,1] -> TP.
TN, FP, FN, TP = cm_rf.ravel()

# Rates over the actual positives ...
TPR = TP / (TP + FN)
FNR = FN / (TP + FN)
# ... and over the actual negatives.
TNR = TN / (TN + FP)
FPR = FP / (FP + TN)

In [23]:
# Show the four rates computed from the baseline confusion matrix.
TPR, TNR, FPR, FNR

(0.7607223476297968,
 0.8934356351236147,
 0.10656436487638533,
 0.23927765237020315)

In [24]:
# Per-class precision/recall/F1 for the baseline on the test set, to 3 decimals.
# `print` is needed here because the report is a plain multi-line string.
print(classification_report(y_true=y_test, y_pred=pred_rfc,
                            labels= rfc_pipeline.classes_, digits=3))

              precision    recall  f1-score   support

           0      0.832     0.893     0.861      1173
           1      0.844     0.761     0.800       886

    accuracy                          0.836      2059
   macro avg      0.838     0.827     0.831      2059
weighted avg      0.837     0.836     0.835      2059



## Different feature sets

In [25]:
# Column names that make up the "syntax" feature set.
syntax = ['to_NoPhr_C',
'as_NoPhr_C',
'at_NoPhr_C',
'ra_NoVeP_C',
'ra_NoSuP_C',
'ra_NoPrP_C',
'ra_NoAjP_C',
'ra_NoAvP_C',
'to_VePhr_C',
'as_VePhr_C',
'at_VePhr_C',
'ra_VeNoP_C',
'ra_VeSuP_C',
'ra_VePrP_C',
'ra_VeAjP_C',
'ra_VeAvP_C',
'to_SuPhr_C',
'as_SuPhr_C',
'at_SuPhr_C',
'ra_SuNoP_C',
'ra_SuVeP_C',
'ra_SuPrP_C',
'ra_SuAjP_C',
'ra_SuAvP_C',
'to_PrPhr_C',
'as_PrPhr_C',
'at_PrPhr_C',
'ra_PrNoP_C',
'ra_PrVeP_C',
'ra_PrSuP_C',
'ra_PrAjP_C',
'ra_PrAvP_C',
'to_AjPhr_C',
'as_AjPhr_C',
'at_AjPhr_C',
'ra_AjNoP_C',
'ra_AjVeP_C',
'ra_AjSuP_C',
'ra_AjPrP_C',
'ra_AjAvP_C',
'to_AvPhr_C',
'as_AvPhr_C',
'at_AvPhr_C',
'ra_AvNoP_C',
'ra_AvVeP_C',
'ra_AvSuP_C',
'ra_AvPrP_C',
'ra_AvAjP_C',
'to_NoTag_C',
'as_NoTag_C',
'at_NoTag_C',
'ra_NoAjT_C',
'ra_NoVeT_C',
'ra_NoAvT_C',
'ra_NoSuT_C',
'ra_NoCoT_C',
'to_VeTag_C',
'as_VeTag_C',
'at_VeTag_C',
'ra_VeAjT_C',
'ra_VeNoT_C',
'ra_VeAvT_C',
'ra_VeSuT_C',
'ra_VeCoT_C',
'to_AjTag_C',
'as_AjTag_C',
'at_AjTag_C',
'ra_AjNoT_C',
'ra_AjVeT_C',
'ra_AjAvT_C',
'ra_AjSuT_C',
'ra_AjCoT_C',
'to_AvTag_C',
'as_AvTag_C',
'at_AvTag_C',
'ra_AvAjT_C',
'ra_AvNoT_C',
'ra_AvVeT_C',
'ra_AvSuT_C',
'ra_AvCoT_C',
'to_SuTag_C',
'as_SuTag_C',
'at_SuTag_C',
'ra_SuAjT_C',
'ra_SuNoT_C',
'ra_SuVeT_C',
'ra_SuAvT_C',
'ra_SuCoT_C',
'to_CoTag_C',
'as_CoTag_C',
'at_CoTag_C',
'ra_CoAjT_C',
'ra_CoNoT_C',
'ra_CoVeT_C',
'ra_CoAvT_C',
'ra_CoSuT_C',
'to_ContW_C',
'as_ContW_C',
'at_ContW_C',
'to_FuncW_C',
'as_FuncW_C',
'at_FuncW_C',
'ra_CoFuW_C',
'to_TreeH_C',
'as_TreeH_C',
'at_TreeH_C',
'to_FTree_C',
'as_FTree_C',
'at_FTree_C']

# Sanity check: number of names, and whether they are all distinct.
len(syntax), len(syntax) == len(set(syntax))

(109, True)

In [26]:
# Column names that make up the "lex_sem" feature set.
lex_sem = ['SimpNoV_S',
'SquaNoV_S',
'CorrNoV_S',
'SimpVeV_S',
'SquaVeV_S',
'CorrVeV_S',
'SimpAjV_S',
'SquaAjV_S',
'CorrAjV_S',
'SimpAvV_S',
'SquaAvV_S',
'CorrAvV_S',
'SimpTTR_S',
'CorrTTR_S',
'BiLoTTR_S',
'UberTTR_S',
'MTLDTTR_S',
'to_AAKuW_C',
'as_AAKuW_C',
'at_AAKuW_C',
'to_AAKuL_C',
'as_AAKuL_C',
'at_AAKuL_C',
'to_AABiL_C',
'as_AABiL_C',
'at_AABiL_C',
'to_AABrL_C',
'as_AABrL_C',
'at_AABrL_C',
'to_AACoL_C',
'as_AACoL_C',
'at_AACoL_C']

# Sanity check: number of names, and whether they are all distinct.
len(lex_sem), len(lex_sem) == len(set(lex_sem))

(32, True)

In [27]:
# Column names that make up the "shallow_trad" feature set.
shallow_trad = ['TokSenM_S',
'TokSenS_S',
'TokSenL_S',
'as_Token_C',
'as_Sylla_C',
'at_Sylla_C',
'as_Chara_C',
'at_Chara_C',     
'SmogInd_S',
'ColeLia_S',
'Gunning_S',
'AutoRea_S',
'FleschG_S',
'LinseaW_S']

# Sanity check: number of names, and whether they are all distinct.
len(shallow_trad), len(shallow_trad) == len(set(shallow_trad))

(14, True)

In [28]:
# Column names that make up the "preprocess" feature set (token and sentence counts).
preprocess = ['n_token',
              'n_sent']

# Sanity check: number of names, and whether they are all distinct.
len(preprocess), len(preprocess) == len(set(preprocess))

(2, True)

In [32]:
# Every feature column the model sees, taken from the training frame.
all_snip_cols = list(X_train.columns)
# Sanity check: number of columns, and whether the names are all distinct.
len(all_snip_cols), len(all_snip_cols) == len(set(all_snip_cols))

(157, True)

In [44]:
# Confirm if the addition of all feat sets match the cols size
ASF = preprocess + shallow_trad + lex_sem + syntax # All Snippet Fteatures
len(ASF), len(ASF) == len(set(ASF))

(157, True)

**Note**: The hyperparameters were identified previously, so we now train and test the model with those parameters.

In the following block of code, we fit the model with the best hyperparameters identified.

In [34]:
def train_and_evaluate_model(cols_of_interest, name):
    """Fit the tuned random forest on a subset of features and score it on the test split.

    The function reads the notebook-level variables ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``; they must exist before it is called.

    Side effects: prints the number of features the forest was fitted on and
    pickles three arrays into ``snip_pred_results/`` (created if missing):
    the test predictions, a 0/1 mask of true positives and a 0/1 mask of true
    negatives. Each file name ends with ``name``.

    Parameters
    ----------
    cols_of_interest : list of str
        Column names selected from the input frame before scaling.
    name : str
        Label for this run; used in the printed message, in the pickle file
        names and in the ``Type`` column of the result.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``Type``, ``Accuracy``, ``TPR``, ``TNR``,
        ``FPR`` and ``FNR``, all computed on the test split.
    """
    model_pipe = Pipeline(steps=[
        # Keep only the requested columns; the frame passed in must contain them.
        ('select_variable', FunctionTransformer(lambda df: df[cols_of_interest])),
        ('scale', StandardScaler()),
        ('model', RandomForestClassifier(random_state=42,
                                         n_jobs=-1,
                                         n_estimators=200,
                                         criterion='entropy',
                                         max_depth=None,
                                         min_samples_split=2,
                                         max_features='log2'))
    ])

    model_pipe.fit(X_train, y_train)

    n_feat = model_pipe.named_steps['model'].n_features_in_
    print(f'Number of Feature for {name}: {n_feat}')

    # Predict on the held-out test split.
    y_test_pred = model_pipe.predict(X_test)

    # Per-sample 0/1 indicators of correct positive / correct negative predictions.
    # The whole boolean conjunction is parenthesised before the cast; previously
    # `.astype(int)` bound only to the right-hand operand of `&`.
    y_true_arr = np.asarray(y_test)
    y_pred_arr = np.asarray(y_test_pred)
    selectTP = ((y_true_arr == 1) & (y_pred_arr == 1)).astype(int)
    selectTN = ((y_true_arr == 0) & (y_pred_arr == 0)).astype(int)

    # Save the predicted values to be used for statistical significance.
    # `with` closes each file handle even if pickling fails.
    os.makedirs("snip_pred_results", exist_ok=True)
    artefacts = (
        ("snip_pred_results/snip_pred_", y_test_pred),
        ("snip_pred_results/snip_TP_", selectTP),
        ("snip_pred_results/snip_TN_", selectTN),
    )
    for path_prefix, payload in artefacts:
        with open(path_prefix + str(name) + ".p", "wb") as handle:
            pickle.dump(payload, handle)

    cm = confusion_matrix(y_true=y_test, y_pred=y_test_pred, labels=model_pipe.classes_)
    acc = accuracy_score(y_true=y_test, y_pred=y_test_pred)

    # Confusion-matrix cells, using the same index mapping as the baseline cell.
    FP = cm[0, 1]
    FN = cm[1, 0]
    TP = cm[1, 1]
    TN = cm[0, 0]

    TPR = TP / (TP + FN)
    TNR = TN / (TN + FP)
    FPR = FP / (FP + TN)
    FNR = FN / (TP + FN)

    columns = ['Type', 'Accuracy', 'TPR', 'TNR', 'FPR', 'FNR']
    entries = [[name, acc, TPR, TNR, FPR, FNR]]
    scores = pd.DataFrame(data=entries, columns=columns)

    return scores

## Individual feature set

In [38]:
# all_snip_cols, preprocess, shallow_trad, lex_sem, syntax

In [36]:
# Train and evaluate once per feature set, in the order listed: all features
# first, then each group on its own.
(
    all_snip_res,
    preprocess_snip_res,
    shallow_trad_snip_res,
    lex_sem_snip_res,
    syntax_snip_res,
) = [
    train_and_evaluate_model(feature_cols, run_label)
    for feature_cols, run_label in (
        (all_snip_cols, 'all_snip_feat'),
        (preprocess, 'snip_preprocess'),
        (shallow_trad, 'snip_shallow_trad'),
        (lex_sem, 'snip_lex_sem'),
        (syntax, 'snip_syntax'),
    )
]

Number of Feature for all_snip_feat: 157
Number of Feature for snip_preprocess: 2
Number of Feature for snip_shallow_trad: 14
Number of Feature for snip_lex_sem: 32
Number of Feature for snip_syntax: 109


In [37]:
# Stack the one-row score frames into a single comparison table.
results_indiv = pd.concat(
    [
        all_snip_res,
        preprocess_snip_res,
        shallow_trad_snip_res,
        lex_sem_snip_res,
        syntax_snip_res,
    ],
    ignore_index=True,
)
results_indiv


Unnamed: 0,Type,Accuracy,TPR,TNR,FPR,FNR
0,all_snip_feat,0.839728,0.760722,0.899403,0.100597,0.239278
1,snip_preprocess,0.627975,0.563205,0.676897,0.323103,0.436795
2,snip_shallow_trad,0.816901,0.767494,0.85422,0.14578,0.232506
3,snip_lex_sem,0.838757,0.765237,0.894288,0.105712,0.234763
4,snip_syntax,0.838757,0.758465,0.899403,0.100597,0.241535


## Remove one feature set

In [39]:
# Leave-one-group-out column lists: every column of `all_snip_cols` except
# those in the named group, keeping the original column order.
all_snip_feat_minus_syntax = [col for col in all_snip_cols if col not in syntax]
all_snip_feat_minus_lex_sem = [col for col in all_snip_cols if col not in lex_sem]
all_snip_feat_minus_shallow_trad = [col for col in all_snip_cols if col not in shallow_trad]
all_snip_feat_minus_preprocess = [col for col in all_snip_cols if col not in preprocess]

In [40]:
# Number of columns left after removing each group (syntax, lex_sem, shallow_trad, preprocess).
len(all_snip_feat_minus_syntax), len(all_snip_feat_minus_lex_sem), len(all_snip_feat_minus_shallow_trad), len(all_snip_feat_minus_preprocess)


(48, 125, 143, 155)

In [41]:
# all_snip_cols, preprocess, shallow_trad, lex_sem, syntax

In [42]:
# Train and evaluate once with all features, then once per removed group.
# The first run repeats the 'all_snip_feat' call made for `all_snip_res`.
(
    all_snip_feat_res,
    all_snip_feat_minus_syntax_res,
    all_snip_feat_minus_lex_sem_res,
    all_snip_feat_minus_shallow_trad_res,
    all_snip_feat_minus_preprocess_res,
) = [
    train_and_evaluate_model(feature_cols, run_label)
    for feature_cols, run_label in (
        (all_snip_cols, 'all_snip_feat'),
        (all_snip_feat_minus_syntax, 'all_snip_feat_minus_syntax'),
        (all_snip_feat_minus_lex_sem, 'all_snip_feat_minus_lex_sem'),
        (all_snip_feat_minus_shallow_trad, 'all_snip_feat_minus_shallow_trad'),
        (all_snip_feat_minus_preprocess, 'all_snip_feat_minus_preprocess'),
    )
]


Number of Feature for all_snip_feat: 157
Number of Feature for all_snip_feat_minus_syntax: 48
Number of Feature for all_snip_feat_minus_lex_sem: 125
Number of Feature for all_snip_feat_minus_shallow_trad: 143
Number of Feature for all_snip_feat_minus_preprocess: 155


In [43]:
# Stack the ablation score frames (all features first) into one comparison table.
Results_rem = pd.concat(
    [
        all_snip_feat_res,
        all_snip_feat_minus_syntax_res,
        all_snip_feat_minus_lex_sem_res,
        all_snip_feat_minus_shallow_trad_res,
        all_snip_feat_minus_preprocess_res,
    ],
    ignore_index=True,
)
Results_rem

Unnamed: 0,Type,Accuracy,TPR,TNR,FPR,FNR
0,all_snip_feat,0.839728,0.760722,0.899403,0.100597,0.239278
1,all_snip_feat_minus_syntax,0.834386,0.761851,0.889173,0.110827,0.238149
2,all_snip_feat_minus_lex_sem,0.841185,0.767494,0.896846,0.103154,0.232506
3,all_snip_feat_minus_shallow_trad,0.842156,0.76298,0.901961,0.098039,0.23702
4,all_snip_feat_minus_preprocess,0.838271,0.759594,0.897698,0.102302,0.240406
