# Nested 5-Fold Cross Validation For Random Forest On Both Textual+Contextual Features

In [1]:
import numpy as np
import pandas as pd
import xlrd as xl
from pandas import ExcelWriter
from pandas import ExcelFile
import pprint
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
import re
import pickle
from operator import itemgetter
import time, datetime
from functools import partial, update_wrapper
from openpyxl import load_workbook
from copy import deepcopy

from sklearn.externals.joblib import Parallel, delayed
from sklearn.pipeline import FeatureUnion, _fit_transform_one, _transform_one
from scipy import sparse

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN 

from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from imblearn.pipeline import Pipeline as Imb_Pipeline

from sklearn.preprocessing import FunctionTransformer

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_validate
from sklearn.metrics import precision_recall_fscore_support, classification_report, accuracy_score, make_scorer, confusion_matrix

# Shared pretty-printer used to display Python objects in later cells.
pp = pprint.PrettyPrinter(indent=4)

## Ignore warnings
import warnings


def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accepts any arguments and does nothing."""
    pass


# Silence warnings for the whole session in two ways: replace ``warnings.warn``
# with the no-op above and install a blanket 'ignore' filter.
# NOTE: this hides every warning raised through the ``warnings`` module,
# including deprecation notices from the libraries used below.
warnings.warn = warn
# ``np.warnings`` was only an alias of the standard-library ``warnings`` module
# and is not available in newer NumPy releases, so the stdlib module is called
# directly; the effect is the same.
warnings.filterwarnings('ignore')

#### Use spaCy parser for word tokenization of a sentence:

In [2]:
import spacy
# Load the spaCy model registered under the name 'en'.
# NOTE(review): `eng` is not referenced anywhere else in the visible part of
# this notebook — confirm whether this load is still needed.
eng = spacy.load('en')
from spacy.lang.en import English
# English language object; tokenize() calls it on each raw text line to obtain
# the tokens it then filters and lemmatizes.
parser = English()

#### Define stopwords as punctuation + common contractions:

In [3]:
from string import punctuation
from nltk.corpus import stopwords

# Tokens that tokenize() drops: every character of string.punctuation plus a
# few extra tokens (contraction suffixes, '-' and '...'). NLTK's English
# stop-word list (stopwords.words('english')) is currently not included.
stop_words = list(punctuation)
stop_words.extend(["'s", "'m", "n't", "'re", "-", "'ll", '...'])

#### Code to lemmatize and tokenize:

In [4]:
def get_lemma(item):
    """Return the lemma of ``item`` as produced by NLTK's ``WordNetLemmatizer``.

    ``lemmatize`` is called with its default arguments, and a new lemmatizer
    object is created on every call.
    """
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(item)

def tokenize(line):
    """Split ``line`` into a list of normalised tokens.

    The text is run through the module-level spaCy ``parser``. For each token,
    in order:

    * whitespace-only tokens are skipped;
    * URL-like tokens are replaced by the placeholder ``'URL'``;
    * tokens starting with ``@`` are replaced by ``'SCREEN_NAME'``;
    * any other token is dropped if its string form is in ``stop_words``,
      otherwise its lower-cased form is lemmatized with ``get_lemma``.

    Returns:
        list: the kept tokens, in their original order.
    """
    kept = []
    for tok in parser(line):
        text = tok.orth_
        if text.isspace():
            continue
        if tok.like_url:
            kept.append('URL')
            continue
        if text.startswith('@'):
            kept.append('SCREEN_NAME')
            continue
        # The stop-word test uses str(token), not the lower-cased form.
        if str(tok) in stop_words:
            continue
        kept.append(get_lemma(tok.lower_))
    return kept

#### Read from the pickled file:

In [5]:
# Load the pre-built corpus table from disk.
# NOTE: unpickling can execute arbitrary code — only load pickle files that
# come from a trusted source.
all_data = pd.read_pickle('../../data/text_conv_data.pkl')

# Report the number of rows and the number of columns minus one.
# NOTE(review): the next cell drops two columns ('Code' and 'Document') when
# building the feature matrix, so this "Feature Size" may be one too high — confirm.
corpus_size = len(all_data)
column_count = len(all_data.columns)
print("Size of corpus: " + str(corpus_size))
print("Feature Size: " + str(column_count - 1))

Size of corpus: 4330
Feature Size: 23


In [6]:
# Feature matrix: every column except 'Code' (used as the target below) and 'Document'.
X = all_data.drop(['Code', 'Document'], axis=1)
# Target selected with double brackets, i.e. as a one-column frame rather than a Series.
y = all_data[['Code']]

# Distinct class labels, sorted.
labels = sorted(set(y['Code']))

print("Number of unique labels: " + str(len(labels)))

pp.pprint(labels)

Number of unique labels: 13
[   'Action on Issue',
    'Bug Reproduction',
    'Contribution and Commitment',
    'Expected Behaviour',
    'Investigation and Exploration',
    'Social Conversation',
    'Motivation',
    'Observed Bug Behaviour',
    'Potential New Issues and Requests',
    'Solution Discussion',
    'Task Progress',
    'Solution Usage',
    'Workarounds']


# Nested Cross-Validation on Random Forest:

In [7]:
# Splitter for the inner loop: passed as `cv` to GridSearchCV.
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Splitter for the outer loop: passed as `cv` to cross_validate.
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Every feature column except the raw text column; these are the columns that
# are passed through without tf-idf vectorization.
non_transform_features = [name for name in X.columns.values if name != 'Text Content']

## Define pandas-compatible feature unificator

`PandasFeatureUnion` is a pandas-compatible version of scikit-learn's `FeatureUnion`.

It is needed because we have two types of features: textual and conversational.

We first vectorize the textual content with tf-idf and then append the conversational features to the resulting vector using the feature union.

In [9]:
class PandasFeatureUnion(FeatureUnion):
    """``FeatureUnion`` variant whose transformer outputs may be pandas objects.

    Every transformer receives the same input. If at least one output is a
    SciPy sparse matrix, all outputs are stacked horizontally into a single
    CSR matrix; otherwise they are concatenated column-wise with
    ``pandas.concat``.
    """

    def fit_transform(self, X, y=None, **fit_params):
        """Run ``_fit_transform_one`` for each transformer and combine the outputs.

        Extra keyword arguments are forwarded to ``_fit_transform_one``.
        Returns a zero array of shape ``(X.shape[0], 0)`` when there are no
        results.
        """
        self._validate_transformers()
        fit_jobs = (
            delayed(_fit_transform_one)(
                transformer=step,
                X=X,
                y=y,
                weight=step_weight,
                **fit_params)
            for _, step, step_weight in self._iter())
        fitted = Parallel(n_jobs=self.n_jobs)(fit_jobs)
        if not fitted:
            return np.zeros((X.shape[0], 0))
        outputs, fitted_steps = zip(*fitted)
        # Store the fitted transformers back on the union.
        self._update_transformer_list(fitted_steps)
        return self._stack_outputs(outputs)

    def merge_dataframes_by_column(self, Xs):
        """Concatenate the pandas objects in ``Xs`` along the column axis."""
        return pd.concat(Xs, axis="columns", copy=False)

    def transform(self, X):
        """Run ``_transform_one`` for each transformer and combine the outputs.

        Returns a zero array of shape ``(X.shape[0], 0)`` when there are no
        results.
        """
        transform_jobs = (
            delayed(_transform_one)(
                transformer=step,
                X=X,
                y=None,
                weight=step_weight)
            for _, step, step_weight in self._iter())
        outputs = Parallel(n_jobs=self.n_jobs)(transform_jobs)
        if not outputs:
            return np.zeros((X.shape[0], 0))
        return self._stack_outputs(outputs)

    def _stack_outputs(self, outputs):
        """Combine per-transformer outputs: sparse hstack if any is sparse, else pandas concat."""
        for block in outputs:
            if sparse.issparse(block):
                return sparse.hstack(outputs).tocsr()
        return self.merge_dataframes_by_column(outputs)

In [10]:
class PandasTransform(TransformerMixin, BaseEstimator):
    """Stateless transformer that selects columns by label and casts them to float.

    Parameters
    ----------
    columns
        Column labels handed to ``.loc`` when selecting from the input.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` restricted to ``self.columns`` with values cast to ``float``.

        ``y`` and ``copy`` are accepted but not used.
        """
        selected = X.loc[:, self.columns]
        return selected.astype(float)

## Define Pipelines:

1. Pipeline1: tfidf + class_weight=balanced
2. Pipeline2: tfidf + SMOTE

In [11]:
class ItemSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that slices columns out of a two-axis indexable.

    NOTE(review): not referenced by any pipeline in the visible part of this
    notebook — confirm whether it is still needed.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, x, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, data_array):
        """Return ``data_array[:, self.columns]``."""
        column_index = self.columns
        return data_array[:, column_index]

### Simply select the conversational features, and tfidf-vectorize the textual content
# Conversational branch: pass the non-text columns through, cast to float.
ling_features = PandasTransform(non_transform_features)
# Textual branch: pull out the 'Text Content' column, then tf-idf vectorize it
# using the custom tokenize() function.
tfidf_text_features = Pipeline([('extract_field',FunctionTransformer(lambda x: x['Text Content'],validate=False)),('vect',TfidfVectorizer(tokenizer=tokenize))])

# Combine both branches side by side into one feature matrix.
# Note: the same feature_union1 object is used as a step in both pipelines below.
feature_union1 = PandasFeatureUnion([
    ('selector',ling_features),
    ('text_vectorizer',tfidf_text_features)
])

### Combine the two types of feature vectors
# Pipeline1: features + random forest with class_weight='balanced'.
pipeline1 = Pipeline([
    ('features',feature_union1),
    ('clf', RandomForestClassifier(class_weight='balanced')),
])
# Pipeline2: features + SMOTE step + random forest with default class weights
# (built with imblearn's Pipeline).
pipeline2 = Imb_Pipeline([
    ('features', feature_union1),
    ('smote', SMOTE()),
    ('clf', RandomForestClassifier())
])

### Hyperparameters to search

# Keys follow the `<step>__<param>` naming of the pipeline steps defined above.
parameters = {
    'features__text_vectorizer__vect__ngram_range': ((1,1),(1,2)),  # unigrams only, or unigrams + bigrams
    'clf__n_estimators': (10, 50, 100),
    'clf__min_samples_split': (2, 5, 10),
}

In [12]:
### Define and create the scoring functions
def score_func(y_true, y_pred, score_index, i):
    """Return a single per-class metric value.

    Calls ``precision_recall_fscore_support(y_true, y_pred)``, picks the array
    at position ``score_index`` of the returned tuple (this notebook maps
    0/1/2/3 to precision/recall/fscore/support) and returns its ``i``-th entry.
    """
    per_class_metrics = precision_recall_fscore_support(y_true, y_pred)
    metric_values = per_class_metrics[score_index]
    return metric_values[i]

def avg_score(y_true, y_pred, score_index):
    """Return one metric averaged over classes with ``average='weighted'``.

    ``score_index`` selects the entry of the tuple returned by
    ``precision_recall_fscore_support`` (this notebook uses 0, 1 and 2 for
    precision, recall and fscore).
    """
    weighted_metrics = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    return weighted_metrics[score_index]

def sum_support(y_true, y_pred):
    """Return the number of scored samples, i.e. ``len(y_true)``.

    ``y_pred`` is part of the ``(y_true, y_pred)`` metric signature but is not
    used.
    """
    n_samples = len(y_true)
    return n_samples

### Create partials for each of the metrics returned
# Maps each metric name to score_func bound to that metric's position in the
# tuple returned by precision_recall_fscore_support.
score_funcs = {v: partial(score_func, score_index=k) for k, v in {0:'precision',1:'recall',2:'fscore',3:'support'}.items()}
# One partial per metric. update_wrapper copies score_func's metadata
# (__name__, __doc__, ...) onto each partial object.
prec_score = partial(score_func, score_index=0)
update_wrapper(prec_score,score_func)
rec_score = partial(score_func, score_index=1)
update_wrapper(rec_score,score_func)
f_score = partial(score_func, score_index=2)
update_wrapper(f_score,score_func)
support_score = partial(score_func, score_index=3)
update_wrapper(support_score,score_func)

### Create a callable scoring function for each of the metrics for each classification label
# Keys have the form 'label<N>_<metric>'; N is forwarded to score_func as `i`,
# the position of the class in the per-class metric arrays. The number of
# classes is taken from `labels` instead of being hard-coded.
scorer = {}
for label_id in range(len(labels)):
    key_prefix = 'label'+str(label_id)
    scorer[key_prefix+'_precision'] = make_scorer(prec_score, i=label_id)
    scorer[key_prefix+'_recall'] = make_scorer(rec_score, i=label_id)
    scorer[key_prefix+'_fscore'] = make_scorer(f_score, i=label_id)
    scorer[key_prefix+'_support'] = make_scorer(support_score, i=label_id)

### Create a callable scoring function for avg/total of the metrics across classification labels
scorer['avg_precision'] = make_scorer(avg_score,score_index=0)
scorer['avg_recall'] = make_scorer(avg_score,score_index=1)
scorer['avg_fscore'] = make_scorer(avg_score,score_index=2)
scorer['total_support'] = make_scorer(sum_support)

### Perform Nested cross-validation on Pipeline1
# Inner loop: GridSearchCV selects hyper-parameters by weighted F1 on inner_cv.
# Outer loop: cross_validate evaluates the whole search on outer_cv with every
# entry of `scorer`.
# return_train_score=True is passed explicitly because the reporting cells read
# the 'train_*' entries of the result, which newer scikit-learn releases do
# not compute by default.
start = time.time()
clf1 = GridSearchCV(pipeline1, parameters, cv=inner_cv, scoring='f1_weighted')
clf1_results = cross_validate(clf1, X=X, y=y, cv=outer_cv, scoring=scorer, return_train_score=True)
print("Completed Pipeline1 scenario in "+ str(datetime.timedelta(seconds=(time.time()-start))))

### Perform Nested cross-validation on Pipeline2
# Same procedure for the SMOTE pipeline.
start = time.time()
clf2 = GridSearchCV(pipeline2, parameters, cv=inner_cv, scoring='f1_weighted')
clf2_results = cross_validate(clf2, X=X, y=y, cv=outer_cv, scoring=scorer, return_train_score=True)
print("Completed Pipeline2 scenario in "+ str(datetime.timedelta(seconds=(time.time()-start))))

Completed Pipeline1 scenario in 1:49:25.520462
Completed Pipeline2 scenario in 4:36:24.055539


# Display and Save Training and Testing Results for each Fold:

## Pipeline1 Results:

In [21]:
# Build one training report and one test report per outer fold from
# clf1_results, write them to sheet 'RBC' of the results workbook, keep them
# in result_dict, and print rounded copies.
train_report = pd.DataFrame(columns=['Precision', 'Recall', 'F1-score', 'Support'])
test_report = pd.DataFrame(columns=['Precision', 'Recall', 'F1-score', 'Support'])

result_dict = {}

# writer = pd.ExcelWriter('../../results/5FOLD.xlsx')
# Open the existing workbook and attach it (and its sheets) to the writer.
# NOTE(review): presumably this keeps sheets already present in the file;
# load_workbook requires the file to exist already — confirm.
book = load_workbook('../../results/5FOLD.xlsx')
writer = pd.ExcelWriter('../../results/5FOLD.xlsx', engine='openpyxl')
writer.book = book
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

# Row offset on the sheet at which the next table is written.
datalength = 0

# NOTE(review): 5 folds and 13 labels are hard-coded here; they must match
# the outer CV's n_splits and the number of entries in `labels`.
for i in range(0,5):
    # One row per class label: precision, recall, fscore, support for fold i.
    for label_id in range(0,13):
        train_report.loc[labels[label_id], :] = [clf1_results['train_label'+str(label_id)+'_precision'][i],clf1_results['train_label'+str(label_id)+'_recall'][i],clf1_results['train_label'+str(label_id)+'_fscore'][i],clf1_results['train_label'+str(label_id)+'_support'][i]]
        test_report.loc[labels[label_id], :] = [clf1_results['test_label'+str(label_id)+'_precision'][i],clf1_results['test_label'+str(label_id)+'_recall'][i],clf1_results['test_label'+str(label_id)+'_fscore'][i],clf1_results['test_label'+str(label_id)+'_support'][i]]

    # Summary row: weighted-average metrics and total sample count for fold i.
    train_report.loc['Avg/Total', :] = [clf1_results['train_avg_precision'][i],clf1_results['train_avg_recall'][i],clf1_results['train_avg_fscore'][i],clf1_results['train_total_support'][i]]
    test_report.loc['Avg/Total', :] = [clf1_results['test_avg_precision'][i],clf1_results['test_avg_recall'][i],clf1_results['test_avg_fscore'][i],clf1_results['test_total_support'][i]]

    # Write a fold header, then the train and test tables, advancing the row
    # offset by each table's length plus 2 after every write.
    fold_index = pd.DataFrame(data=[{'Fold':'Fold '+str(i)}])
    fold_index.to_excel(writer,'RBC',startrow=datalength, index=False)
    datalength+=(len(fold_index)+2)
    train_report.to_excel(writer,'RBC',startrow=datalength)
    datalength+=(len(train_report)+2)
    test_report.to_excel(writer,'RBC',startrow=datalength)
    datalength+=(len(test_report)+2)

    # Store this fold's frames before the names are rebound below.
    result_dict['RBC_train_'+str(i)] = train_report
    result_dict['RBC_test_'+str(i)] = test_report

    # Rebind to float copies rounded to 2 decimals; these are what is printed
    # and what the next iteration fills in.
    train_report = train_report.astype(float).round(2)
    test_report = test_report.astype(float).round(2)

    print("\n------------------------- FOLD "+str(i)+": -------------------------")
    print("\nTraining Results:")
    print(train_report)
    print("\nTest Results:")
    print(test_report)

# Persist everything written above to the workbook file.
writer.save()


------------------------- FOLD 0: -------------------------

Training Results:
                                   Precision  Recall  F1-score  Support
Action on Issue                         0.96    1.00      0.98      48.0
Bug Reproduction                        0.99    0.99      0.99     196.0
Contribution and Commitment             1.00    1.00      1.00      66.0
Expected Behaviour                      1.00    1.00      1.00      99.0
Investigation and Exploration           0.98    0.98      0.98     301.0
Social Conversation                     0.98    0.99      0.98     638.0
Motivation                              0.97    1.00      0.98     230.0
Observed Bug Behaviour                  0.90    1.00      0.95     104.0
Potential New Issues and Requests       0.99    1.00      1.00     184.0
Solution Discussion                     1.00    0.97      0.98    1128.0
Task Progress                           0.98    1.00      0.99     100.0
Solution Usage                          0.98 

## Pipeline2 Results:

In [22]:
# Build one training report and one test report per outer fold from
# clf2_results, write them to sheet 'RBS' of the results workbook, keep them
# in result_dict, and print rounded copies.
train_report = pd.DataFrame(columns=['Precision', 'Recall', 'F1-score', 'Support'])
test_report = pd.DataFrame(columns=['Precision', 'Recall', 'F1-score', 'Support'])

# NOTE(review): result_dict is re-created here, so any entries stored in it by
# an earlier cell (e.g. under 'RBC_*' keys) are discarded — confirm that is intended.
result_dict = {}

# writer = pd.ExcelWriter('../../results/5FOLD.xlsx')
# Open the existing workbook and attach it (and its sheets) to the writer.
# NOTE(review): presumably this keeps sheets already present in the file;
# load_workbook requires the file to exist already — confirm.
book = load_workbook('../../results/5FOLD.xlsx')
writer = pd.ExcelWriter('../../results/5FOLD.xlsx', engine='openpyxl')
writer.book = book
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

# Row offset on the sheet at which the next table is written.
datalength = 0

# NOTE(review): 5 folds and 13 labels are hard-coded here; they must match
# the outer CV's n_splits and the number of entries in `labels`.
for i in range(0,5):
    # One row per class label: precision, recall, fscore, support for fold i.
    for label_id in range(0,13):
        train_report.loc[labels[label_id], :] = [clf2_results['train_label'+str(label_id)+'_precision'][i],clf2_results['train_label'+str(label_id)+'_recall'][i],clf2_results['train_label'+str(label_id)+'_fscore'][i],clf2_results['train_label'+str(label_id)+'_support'][i]]
        test_report.loc[labels[label_id], :] = [clf2_results['test_label'+str(label_id)+'_precision'][i],clf2_results['test_label'+str(label_id)+'_recall'][i],clf2_results['test_label'+str(label_id)+'_fscore'][i],clf2_results['test_label'+str(label_id)+'_support'][i]]

    # Summary row: weighted-average metrics and total sample count for fold i.
    train_report.loc['Avg/Total', :] = [clf2_results['train_avg_precision'][i],clf2_results['train_avg_recall'][i],clf2_results['train_avg_fscore'][i],clf2_results['train_total_support'][i]]
    test_report.loc['Avg/Total', :] = [clf2_results['test_avg_precision'][i],clf2_results['test_avg_recall'][i],clf2_results['test_avg_fscore'][i],clf2_results['test_total_support'][i]]

    # Write a fold header, then the train and test tables, advancing the row
    # offset by each table's length plus 2 after every write.
    fold_index = pd.DataFrame(data=[{'Fold':'Fold '+str(i)}])
    fold_index.to_excel(writer,'RBS',startrow=datalength, index=False)
    datalength+=(len(fold_index)+2)
    train_report.to_excel(writer,'RBS',startrow=datalength)
    datalength+=(len(train_report)+2)
    test_report.to_excel(writer,'RBS',startrow=datalength)
    datalength+=(len(test_report)+2)

    # Store this fold's frames before the names are rebound below.
    result_dict['RBS_train_'+str(i)] = train_report
    result_dict['RBS_test_'+str(i)] = test_report

    # Rebind to float copies rounded to 2 decimals; these are what is printed
    # and what the next iteration fills in.
    train_report = train_report.astype(float).round(2)
    test_report = test_report.astype(float).round(2)

    print("\n------------------------- FOLD "+str(i)+": -------------------------")
    print("\nTraining Results:")
    print(train_report)
    print("\nTest Results:")
    print(test_report)

# Persist everything written above to the workbook file.
writer.save()


------------------------- FOLD 0: -------------------------

Training Results:
                                   Precision  Recall  F1-score  Support
Action on Issue                         1.00    1.00      1.00      48.0
Bug Reproduction                        1.00    1.00      1.00     196.0
Contribution and Commitment             1.00    1.00      1.00      66.0
Expected Behaviour                      1.00    1.00      1.00      99.0
Investigation and Exploration           1.00    1.00      1.00     301.0
Social Conversation                     1.00    1.00      1.00     638.0
Motivation                              1.00    1.00      1.00     230.0
Observed Bug Behaviour                  1.00    1.00      1.00     104.0
Potential New Issues and Requests       1.00    1.00      1.00     184.0
Solution Discussion                     1.00    1.00      1.00    1128.0
Task Progress                           0.99    0.99      0.99     100.0
Solution Usage                          1.00 