# Title Classification with Random Forest Classifier

In [1]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm

In [2]:
from sklearn.svm import SVC
from sklearn import linear_model
from sklearn import set_config
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, accuracy_score

In [3]:
set_config(display='diagram')

## Load title dataset

In [5]:
title_feat_OG = pd.read_csv('Features/all_title_feat.csv')
title_feat_OG

Unnamed: 0.1,Unnamed: 0,title,clicked,n_token,n_sent,to_NoPhr_C,as_NoPhr_C,at_NoPhr_C,ra_NoVeP_C,ra_NoSuP_C,...,as_FTree_C,at_FTree_C,TokSenM_S,TokSenS_S,TokSenL_S,as_Token_C,as_Sylla_C,at_Sylla_C,as_Chara_C,at_Chara_C
0,0,Incoming Webhooks — Mattermost 5.11 documentation,1,28,6,1.0,0.500000,0.200000,0.00,0.0,...,2.000000,0.800000,10.0,3.162278,2.321928,2.500000,6.500000,2.600000,22.000000,8.800000
1,1,“Hello” Is Facebook's New Android-Only Social ...,0,20,3,3.0,1.500000,0.333333,3.00,0.0,...,5.000000,1.111111,18.0,4.242641,3.169925,4.500000,7.500000,1.666667,27.000000,6.000000
2,2,Walkthrough | HTTP Event Collector,0,38,5,1.0,0.500000,0.200000,0.00,0.0,...,2.000000,0.800000,10.0,3.162278,2.321928,2.500000,3.500000,1.400000,15.000000,6.000000
3,3,College Scholarships - Scholarships.com,0,27,3,0.0,0.000000,0.000000,0.00,0.0,...,0.000000,0.000000,6.0,2.449490,1.584963,1.500000,2.500000,1.666667,18.000000,12.000000
4,4,How to start sending email — Mailgun API docum...,0,24,5,1.0,0.500000,0.125000,0.25,1.0,...,4.000000,1.000000,16.0,4.000000,3.000000,4.000000,7.000000,1.750000,23.000000,5.750000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10287,10287,"Science, Technology, Engineering, and Math, in...",1,20,4,5.0,2.500000,0.625000,0.00,0.0,...,3.500000,0.875000,16.0,4.000000,3.000000,4.000000,8.000000,2.000000,29.500000,7.375000
10288,10288,Stem Kitchen SF,0,20,2,0.0,0.000000,0.000000,0.00,0.0,...,0.000000,0.000000,6.0,2.449490,1.584963,1.500000,1.500000,1.000000,6.500000,4.333333
10289,10289,Stem Definition & Meaning - Merriam-Webster,0,18,3,3.0,1.500000,0.500000,1.50,0.0,...,2.500000,0.833333,12.0,3.464102,2.584963,3.000000,5.500000,1.833333,19.000000,6.333333
10290,10290,Driven By Stem,0,18,2,0.0,0.000000,0.000000,0.00,0.0,...,0.000000,0.000000,6.0,2.449490,1.584963,1.500000,1.000000,0.666667,6.000000,4.000000


We work on a copy of the dataset so that the original version stays intact, drop the columns we don't need, and rename the `clicked` column to `class`.

In [9]:
# Build the modelling frame from a copy so `title_feat_OG` is left untouched:
# drop 'Unnamed: 0' (presumably a saved index column) and the raw title text,
# then rename the label column from `clicked` to `class`.
all_title_feat = (
    title_feat_OG
    .copy()
    .drop(columns=['Unnamed: 0', 'title'])
    .rename(columns={'clicked': 'class'})
)


In [10]:
all_title_feat

Unnamed: 0,class,n_token,n_sent,to_NoPhr_C,as_NoPhr_C,at_NoPhr_C,ra_NoVeP_C,ra_NoSuP_C,ra_NoPrP_C,ra_NoAjP_C,...,as_FTree_C,at_FTree_C,TokSenM_S,TokSenS_S,TokSenL_S,as_Token_C,as_Sylla_C,at_Sylla_C,as_Chara_C,at_Chara_C
0,1,28,6,1.0,0.500000,0.200000,0.00,0.0,0.0,0.0,...,2.000000,0.800000,10.0,3.162278,2.321928,2.500000,6.500000,2.600000,22.000000,8.800000
1,0,20,3,3.0,1.500000,0.333333,3.00,0.0,0.0,0.0,...,5.000000,1.111111,18.0,4.242641,3.169925,4.500000,7.500000,1.666667,27.000000,6.000000
2,0,38,5,1.0,0.500000,0.200000,0.00,0.0,0.0,0.0,...,2.000000,0.800000,10.0,3.162278,2.321928,2.500000,3.500000,1.400000,15.000000,6.000000
3,0,27,3,0.0,0.000000,0.000000,0.00,0.0,0.0,0.0,...,0.000000,0.000000,6.0,2.449490,1.584963,1.500000,2.500000,1.666667,18.000000,12.000000
4,0,24,5,1.0,0.500000,0.125000,0.25,1.0,0.0,0.0,...,4.000000,1.000000,16.0,4.000000,3.000000,4.000000,7.000000,1.750000,23.000000,5.750000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10287,1,20,4,5.0,2.500000,0.625000,0.00,0.0,5.0,0.0,...,3.500000,0.875000,16.0,4.000000,3.000000,4.000000,8.000000,2.000000,29.500000,7.375000
10288,0,20,2,0.0,0.000000,0.000000,0.00,0.0,0.0,0.0,...,0.000000,0.000000,6.0,2.449490,1.584963,1.500000,1.500000,1.000000,6.500000,4.333333
10289,0,18,3,3.0,1.500000,0.500000,1.50,0.0,0.0,0.0,...,2.500000,0.833333,12.0,3.464102,2.584963,3.000000,5.500000,1.833333,19.000000,6.333333
10290,0,18,2,0.0,0.000000,0.000000,0.00,0.0,0.0,0.0,...,0.000000,0.000000,6.0,2.449490,1.584963,1.500000,1.000000,0.666667,6.000000,4.000000


## Different feature sets

In [28]:
syntax = ['to_NoPhr_C',
'as_NoPhr_C',
'at_NoPhr_C',
'ra_NoVeP_C',
'ra_NoSuP_C',
'ra_NoPrP_C',
'ra_NoAjP_C',
'ra_NoAvP_C',
'to_VePhr_C',
'as_VePhr_C',
'at_VePhr_C',
'ra_VeNoP_C',
'ra_VeSuP_C',
'ra_VePrP_C',
'ra_VeAjP_C',
'ra_VeAvP_C',
'to_SuPhr_C',
'as_SuPhr_C',
'at_SuPhr_C',
'ra_SuNoP_C',
'ra_SuVeP_C',
'ra_SuPrP_C',
'ra_SuAjP_C',
'ra_SuAvP_C',
'to_PrPhr_C',
'as_PrPhr_C',
'at_PrPhr_C',
'ra_PrNoP_C',
'ra_PrVeP_C',
'ra_PrSuP_C',
'ra_PrAjP_C',
'ra_PrAvP_C',
'to_AjPhr_C',
'as_AjPhr_C',
'at_AjPhr_C',
'ra_AjNoP_C',
'ra_AjVeP_C',
'ra_AjSuP_C',
'ra_AjPrP_C',
'ra_AjAvP_C',
'to_AvPhr_C',
'as_AvPhr_C',
'at_AvPhr_C',
'ra_AvNoP_C',
'ra_AvVeP_C',
'ra_AvSuP_C',
'ra_AvPrP_C',
'ra_AvAjP_C',
'to_NoTag_C',
'as_NoTag_C',
'at_NoTag_C',
'ra_NoAjT_C',
'ra_NoVeT_C',
'ra_NoAvT_C',
'ra_NoSuT_C',
'ra_NoCoT_C',
'to_VeTag_C',
'as_VeTag_C',
'at_VeTag_C',
'ra_VeAjT_C',
'ra_VeNoT_C',
'ra_VeAvT_C',
'ra_VeSuT_C',
'ra_VeCoT_C',
'to_AjTag_C',
'as_AjTag_C',
'at_AjTag_C',
'ra_AjNoT_C',
'ra_AjVeT_C',
'ra_AjAvT_C',
'ra_AjSuT_C',
'ra_AjCoT_C',
'to_AvTag_C',
'as_AvTag_C',
'at_AvTag_C',
'ra_AvAjT_C',
'ra_AvNoT_C',
'ra_AvVeT_C',
'ra_AvSuT_C',
'ra_AvCoT_C',
'to_SuTag_C',
'as_SuTag_C',
'at_SuTag_C',
'ra_SuAjT_C',
'ra_SuNoT_C',
'ra_SuVeT_C',
'ra_SuAvT_C',
'ra_SuCoT_C',
'to_CoTag_C',
'as_CoTag_C',
'at_CoTag_C',
'ra_CoAjT_C',
'ra_CoNoT_C',
'ra_CoVeT_C',
'ra_CoAvT_C',
'ra_CoSuT_C',
'to_ContW_C',
'as_ContW_C',
'at_ContW_C',
'to_FuncW_C',
'as_FuncW_C',
'at_FuncW_C',
'ra_CoFuW_C',
'to_TreeH_C',
'as_TreeH_C',
'at_TreeH_C',
'to_FTree_C',
'as_FTree_C',
'at_FTree_C']

len(syntax), len(syntax) == len(set(syntax))

(109, True)

In [29]:
lex_sem = ['SimpNoV_S',
'SquaNoV_S',
'CorrNoV_S',
'SimpVeV_S',
'SquaVeV_S',
'CorrVeV_S',
'SimpAjV_S',
'SquaAjV_S',
'CorrAjV_S',
'SimpAvV_S',
'SquaAvV_S',
'CorrAvV_S',
'SimpTTR_S',
'CorrTTR_S',
'BiLoTTR_S',
'UberTTR_S',
'MTLDTTR_S',
'to_AAKuW_C',
'as_AAKuW_C',
'at_AAKuW_C',
'to_AAKuL_C',
'as_AAKuL_C',
'at_AAKuL_C',
'to_AABiL_C',
'as_AABiL_C',
'at_AABiL_C',
'to_AABrL_C',
'as_AABrL_C',
'at_AABrL_C',
'to_AACoL_C',
'as_AACoL_C',
'at_AACoL_C']

len(lex_sem), len(lex_sem) == len(set(lex_sem))

(32, True)

In [30]:
shallow_trad = ['TokSenM_S',
'TokSenS_S',
'TokSenL_S',
'as_Token_C',
'as_Sylla_C',
'at_Sylla_C',
'as_Chara_C',
'at_Chara_C',     
'SmogInd_S',
'ColeLia_S',
'Gunning_S',
'AutoRea_S',
'FleschG_S',
'LinseaW_S']

len(shallow_trad), len(shallow_trad) == len(set(shallow_trad))

(14, True)

In [31]:
preprocess = ['n_token',
              'n_sent']

len(preprocess), len(preprocess) == len(set(preprocess))

(2, True)

In [32]:
# Feature columns are every column except the label. They are taken from
# `all_title_feat` rather than `X_train`, because `X_train` is only created
# further down in the notebook and would be undefined here on a fresh
# top-to-bottom run.
all_tit_cols = [col for col in all_title_feat.columns if col != 'class']
len(all_tit_cols), len(all_tit_cols) == len(set(all_tit_cols))

(157, True)

In [34]:
# Confirm that the four feature sets together cover exactly the feature columns
ATF = preprocess + shallow_trad + lex_sem + syntax  # All Title Features
# Equal counts alone would not catch a misspelled or misplaced feature name,
# so also require the names themselves to match; on failure the assertion
# message lists the names that differ.
assert set(ATF) == set(all_tit_cols), set(ATF) ^ set(all_tit_cols)
len(ATF), len(ATF) == len(set(ATF))

(157, True)

## Split train, validate and test

In [12]:
# Separate the label from the features, then hold out 20% of the rows as a
# test set, stratified on the label and with a fixed seed for reproducibility.
y = all_title_feat['class']
X = all_title_feat.drop('class', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [13]:
X_train.shape, y_train.shape

((8233, 157), (8233,))

In [14]:
# Split the training set into training and validation sets (20% validation,
# stratified on the label, fixed seed).
# NOTE(review): this rebinds X_train / y_train to the smaller training portion,
# so running this cell a second time splits the already-reduced set again.
# Re-run the train/test split cell first if this cell has to be repeated.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)


In [15]:
X_train.shape, y_train.shape

((6586, 157), (6586,))

**TODO:** What should we do with the validation sample? Use it to tune the hyperparameters?

In [16]:
X_val.shape, y_val.shape

((1647, 157), (1647,))

## Initiate the model - RF

In [19]:
# Create a pipeline
%time
rfc_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(random_state=42, n_jobs=-1))
])

rfc_pipeline.fit(X_train, y_train)

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 7.87 µs


In [21]:
# Fit the model
%time
pred_rfc = rfc_pipeline.predict(X_test)
cm_rf = confusion_matrix(y_true=y_test, y_pred=pred_rfc, labels= rfc_pipeline.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_rf, display_labels=rfc_pipeline.classes_)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 14.8 µs


In [22]:
# Unpack the 2x2 confusion matrix; flattened row by row it reads
# [0,0]=TN, [0,1]=FP, [1,0]=FN, [1,1]=TP.
TN, FP, FN, TP = cm_rf.ravel()

# Rates over the actual positives (TP + FN) ...
TPR = TP / (TP + FN)
FNR = FN / (TP + FN)
# ... and over the actual negatives (TN + FP).
TNR = TN / (TN + FP)
FPR = FP / (FP + TN)

In [23]:
TPR, TNR, FPR, FNR

(0.7573363431151241,
 0.8635976129582268,
 0.13640238704177324,
 0.24266365688487584)

In [24]:
print(classification_report(y_true=y_test, y_pred=pred_rfc,
                            labels= rfc_pipeline.classes_, digits=3))

              precision    recall  f1-score   support

           0      0.825     0.864     0.844      1173
           1      0.807     0.757     0.782       886

    accuracy                          0.818      2059
   macro avg      0.816     0.810     0.813      2059
weighted avg      0.817     0.818     0.817      2059



**Note**: The hyperparameters were identified previously, so we now train and test the model with those parameters.

In the following block of code, we fit the model with the best hyperparameters identified.

In [27]:
def train_and_evaluate_model(cols_of_interest, name):
    """Train a random forest on a subset of features and score it on the test set.

    The function reads the notebook-level splits ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``; they must exist before it is called.

    Parameters
    ----------
    cols_of_interest : list of str
        Names of the feature columns the model is allowed to use.
    name : str
        Label for this run. It is printed, stored in the ``Type`` column of the
        result, and used in the names of the pickle files.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``Type``, ``Accuracy``, ``TPR``, ``TNR``,
        ``FPR`` and ``FNR``.

    Side effects
    ------------
    Writes three pickle files into the existing ``tit_pred_results/``
    directory: the test predictions and 0/1 indicator arrays marking the
    true-positive and true-negative test rows.
    """
    # Column selection is the first pipeline step, so the full feature frames
    # can be passed to fit/predict unchanged.
    model_pipe = Pipeline(steps=[
        ('select_variable', FunctionTransformer(lambda df: df[cols_of_interest])),
        ('scale', StandardScaler()),
        ('model', RandomForestClassifier(random_state=42,
                                         n_jobs=-1,
                                         n_estimators=200,
                                         criterion='entropy',
                                         max_depth=None,
                                         min_samples_split=2,
                                         max_features='log2'))
    ])

    model_pipe.fit(X_train, y_train)

    n_feat = model_pipe.named_steps['model'].n_features_in_
    print(f'Number of Feature for {name}: {n_feat}')

    # Predictions on the held-out test set
    y_test_pred = model_pipe.predict(X_test)

    # Per-row 0/1 indicators of correctly predicted positives and negatives.
    # The whole conjunction is parenthesised before the cast so the intent
    # does not depend on operator precedence and type promotion.
    y_true_arr = np.asarray(y_test)
    y_pred_arr = np.asarray(y_test_pred)
    selectTP = ((y_true_arr == 1) & (y_pred_arr == 1)).astype(int)
    selectTN = ((y_true_arr == 0) & (y_pred_arr == 0)).astype(int)

    # Save the predicted values to be used for statistical significance.
    # `with` closes each file handle even if pickling fails.
    artifacts = {'pred': y_test_pred, 'TP': selectTP, 'TN': selectTN}
    for tag, values in artifacts.items():
        with open("tit_pred_results/tit_" + tag + "_" + str(name) + ".p", "wb") as fh:
            pickle.dump(values, fh)

    cm = confusion_matrix(y_true=y_test, y_pred=y_test_pred, labels=model_pipe.classes_)
    acc = accuracy_score(y_true=y_test, y_pred=y_test_pred)

    # Flattened row by row, the 2x2 matrix reads [0,0]=TN, [0,1]=FP, [1,0]=FN, [1,1]=TP.
    TN, FP, FN, TP = cm.ravel()

    TPR = TP / (TP + FN)
    TNR = TN / (TN + FP)
    FPR = FP / (FP + TN)
    FNR = FN / (TP + FN)

    columns = ['Type', 'Accuracy', 'TPR', 'TNR', 'FPR', 'FNR']
    entries = [[name, acc, TPR, TNR, FPR, FNR]]
    scores = pd.DataFrame(data=entries, columns=columns)

    return scores

## Individual feature set

In [None]:
# all_tit_cols, preprocess, shallow_trad, lex_sem, syntax

In [44]:
all_tit_res = train_and_evaluate_model(all_tit_cols, 'all_tit_feat')
preprocess_tit_res = train_and_evaluate_model(preprocess, "tit_preprocess")
shallow_trad_tit_res = train_and_evaluate_model(shallow_trad, "tit_shallow_trad")
lex_sem_tit_res = train_and_evaluate_model(lex_sem, "tit_lex_sem")
syntax_tit_res = train_and_evaluate_model(syntax, "tit_syntax")

Number of Feature for all_tit_feat: 157
Number of Feature for tit_preprocess: 2
Number of Feature for tit_shallow_trad: 14
Number of Feature for tit_lex_sem: 32
Number of Feature for tit_syntax: 109


In [45]:
results_indv = pd.concat([all_tit_res, preprocess_tit_res, shallow_trad_tit_res, lex_sem_tit_res, syntax_tit_res], ignore_index=True)
results_indv

Unnamed: 0,Type,Accuracy,TPR,TNR,FPR,FNR
0,all_tit_feat,0.822729,0.758465,0.87127,0.12873,0.241535
1,tit_preprocess,0.627975,0.563205,0.676897,0.323103,0.436795
2,tit_shallow_trad,0.791161,0.755079,0.818414,0.181586,0.244921
3,tit_lex_sem,0.805245,0.742664,0.852515,0.147485,0.257336
4,tit_syntax,0.769305,0.69526,0.825234,0.174766,0.30474


## Remove one feature set

In [37]:
# Leave-one-set-out column lists: every feature column except those of one
# feature set, keeping the column order of `all_tit_cols`.
all_tit_feat_minus_syntax = list(filter(lambda col: col not in syntax, all_tit_cols))
all_tit_feat_minus_lex_sem = list(filter(lambda col: col not in lex_sem, all_tit_cols))
all_tit_feat_minus_shallow_trad = list(filter(lambda col: col not in shallow_trad, all_tit_cols))
all_tit_feat_minus_preprocess = list(filter(lambda col: col not in preprocess, all_tit_cols))

In [39]:
len(all_tit_feat_minus_syntax), len(all_tit_feat_minus_lex_sem), len(all_tit_feat_minus_shallow_trad), len(all_tit_feat_minus_preprocess)


(48, 125, 143, 155)

In [40]:
# all_tit_cols, preprocess, shallow_trad, lex_sem, syntax

In [48]:
all_tit_feat_res = train_and_evaluate_model(all_tit_cols, 'all_tit_feat') 
all_tit_feat_minus_syntax_res = train_and_evaluate_model(all_tit_feat_minus_syntax, 'all_tit_feat_minus_syntax')
all_tit_feat_minus_lex_sem_res = train_and_evaluate_model(all_tit_feat_minus_lex_sem, 'all_tit_feat_minus_lex_sem')
all_tit_feat_minus_shallow_trad_res = train_and_evaluate_model(all_tit_feat_minus_shallow_trad, 'all_tit_feat_minus_shallow_trad')
all_tit_feat_minus_preprocess_res = train_and_evaluate_model(all_tit_feat_minus_preprocess, 'all_tit_feat_minus_preprocess')


Number of Feature for all_tit_feat: 157
Number of Feature for all_tit_feat_minus_syntax: 48
Number of Feature for all_tit_feat_minus_lex_sem: 125
Number of Feature for all_tit_feat_minus_shallow_trad: 143
Number of Feature for all_tit_feat_minus_preprocess: 155


In [49]:
Results_rem = pd.concat([all_tit_feat_res,
                       all_tit_feat_minus_syntax_res, 
                       all_tit_feat_minus_lex_sem_res, 
                       all_tit_feat_minus_shallow_trad_res,
                       all_tit_feat_minus_preprocess_res], 
                      ignore_index = True)
Results_rem

Unnamed: 0,Type,Accuracy,TPR,TNR,FPR,FNR
0,all_tit_feat,0.822729,0.758465,0.87127,0.12873,0.241535
1,all_tit_feat_minus_syntax,0.825644,0.760722,0.87468,0.12532,0.239278
2,all_tit_feat_minus_lex_sem,0.818844,0.766366,0.858483,0.141517,0.233634
3,all_tit_feat_minus_shallow_trad,0.823215,0.759594,0.87127,0.12873,0.240406
4,all_tit_feat_minus_preprocess,0.815444,0.744921,0.868713,0.131287,0.255079
