# Import Libraries

In [20]:
import numpy as np
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier,RidgeClassifier,PassiveAggressiveClassifier
import pickle
from sklearn.metrics import precision_recall_fscore_support,accuracy_score, classification_report
from sklearn.calibration import CalibratedClassifierCV,_CalibratedClassifier
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from time import time
import logging
from sklearn.decomposition import TruncatedSVD
import os
import psutil
import resource


# Define Constants

In [55]:
data_filepath = "it_dec18_mod.csv"

label_freq_threshold = 10

# requirement for new data

cols = ['Resolution','Subject','body','OwnerTeam']
training_cols = ['Resolution','Subject','body']
runtime_cols = ['Subject','body']
target_col = ['OwnerTeam']

#internal declaration

train_col_name = "train_text"
val_col_name = "val_text"

# Data Preprocessing

In [56]:
def remove_and_dedup_punctuation_numbers(text): 
    result = re.sub('<.*?>', '', text.replace("(AutoClosed)","").replace("\n","").replace("\r","")\
                    .replace("&nbsp;","").replace("***",""))
    result = result.replace('[\w\d\.-]+@[\w\d\.-]+',' ').replace(
    '\d+', ' ').replace('_+', ' ').replace('+', ' ').replace(
    '.', ' ').replace('^https?:\/\/.*[\r\n]*', '').replace('[^\w]',' ').lower()
    result = re.sub (r'([^a-zA-Z\s.?!])', '', result)
    result = re.sub(r"([" + re.escape(string.punctuation) + r"])\1+", r"\1", result)
    result = re.sub( '\s+', ' ', result ).strip()
    return result


def drop_nan(df,training_cols,target_col):
    print("method drop_nan started:\n")
    print("number of records before processing: ",df.__len__())
    df1 = df[training_cols].dropna(how="all")
    indices_1 = df1.index
    df2 = df[target_col].dropna(how="all")
    indices_2 = df2.index
    indices_final = list(set.intersection(set(indices_1),set(indices_2)))
    return_df = df.iloc[indices_final][training_cols+target_col]
    print("number of records after processing: ",return_df.__len__())
    print("NaN value Present? ",return_df[target_col+training_cols].isnull().values.any())
    return_df.fillna("", inplace=True)
    print("NaN value Present? ",return_df[target_col+training_cols].isnull().values.any())
    print("\nmethod drop_nan ended.\n\n")
    return return_df

def create_and_clean_train_test_target_column(df):
    print("method create_and_clean_train_test_target_column started:\n")
    df[train_col_name] = df[training_cols].apply(lambda x: " ".join(x), axis=1)
    df[train_col_name] = df[train_col_name].map(remove_and_dedup_punctuation_numbers)
    df[val_col_name] = df[runtime_cols].apply(lambda x: " ".join(x), axis=1)
    df[val_col_name] = df[val_col_name].map(remove_and_dedup_punctuation_numbers)
    print("\nmethod create_and_clean_train_test_target_column ended.\n\n")
    return df[[train_col_name,val_col_name]+target_col]

# Modeling

In [57]:
def get_tfidf_parameters():
    param =  {
        'tfidf__lowercase':(True,),
        'tfidf__stop_words':('english',),
        'tfidf__max_df': (0.90,0.85),
        'tfidf__min_df': (2,5),
        'tfidf__sublinear_tf': (True,),
        'tfidf__ngram_range': ((1,3),)
    }
    return param

def get_SGDClassifier_parameters():
    param = {
        'clf__alpha': (0.0001,),
        'clf__penalty': ('elasticnet',),
        'clf__loss': ('hinge',),
        'clf__max_iter': (20,)
    }
    return param

def get_RidgeClassifier_parameters():
    param = {
        'clf__tol': (1e-2,),
        'clf__solver': ('sag',)
    }
    return param


def get_LinearSVC_parameters():
    param = {
        'clf__penalty': ('l2',),
        'clf__loss': ('hinge',),
        'clf__tol': (1e-4,),
        'clf__max_iter': (10,),
        'clf__multi_class': ('ovr',),
        'clf__class_weight': ('balanced',)
    }
    return param


def get_RandomForestClassifier_parameters():
    param = {
        'clf__n_estimators': (90,),
        'clf__max_depth': (11,),
        'clf__min_samples_leaf': (8,)
    }
    return param

def get_LogisticRegression_parameters():
    param = {
        'clf__multi_class': ('ovr',),
    }
    return param

def get_PassiveAggressiveClassifier_parameters():
    param = {
        'clf__max_iter': (50,),
    }
    return param

def print_model_stat(grid_search,pipeline,parameters,clfname):
    print("Performing grid search for ",clfname," ...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)    
    print()
    print("Best score: %0.3f" % grid_search.best_score_)
    print()
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
        
def get_clf_and_parameter(clfname):
    if clfname == "SGDClassifier":
        clf = SGDClassifier()
        clf_param = get_SGDClassifier_parameters()
    elif clfname == "RidgeClassifier":
        clf = RidgeClassifier()
        clf_param = get_RidgeClassifier_parameters()
    elif clfname == "LinearSVC":
        clf = LinearSVC()
        clf_param = get_LinearSVC_parameters()
    elif clfname == "RandomForestClassifier":
        clf = RandomForestClassifier()
        clf_param = get_RandomForestClassifier_parameters()
    elif clfname == "LogisticRegression":
        clf = LogisticRegression()
        clf_param = get_LogisticRegression_parameters()
    elif clfname == "PassiveAggressiveClassifier":
        clf = PassiveAggressiveClassifier()
        clf_param = get_PassiveAggressiveClassifier_parameters()
    else:
        raise BaseException("Unknown Classifier.")
        
    return clf,clf_param
        
def train_gridsearch_classifier_pipeline(train_df,clfname,val_col_name,target_col,score = 'accuracy'):
    
    clf,clf_param = get_clf_and_parameter(clfname)
        
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', clf)])
    
    tfidf_param = get_tfidf_parameters()
    
    parameters = {**tfidf_param, **clf_param}
    
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1,scoring=score)
    X = train_df[val_col_name].as_matrix()
    Y = train_df[target_col].as_matrix().ravel()
    grid_search.fit(X, Y)
    
    print_model_stat(grid_search,pipeline,parameters,clfname)
    
    #https://github.com/scikit-learn/scikit-learn/issues/8710
    calibrated_pipeline = _CalibratedClassifier(base_estimator=grid_search.best_estimator_, method='isotonic')
    return calibrated_pipeline



class TeamPredictionModel(object):
    def __init__(self,best_model,best_acc,best_classifier_name):
        self.best_model = best_model
        self.best_cv_acc = best_acc
        self.best_classifier_name = best_classifier_name
    
    @classmethod
    def train(cls,df,target_col,train_col_name,val_col_name):
        target = df[target_col].as_matrix()
        train_df, test_df = train_test_split(df,test_size=0.2,stratify=target,random_state=0)
        
        classifier_names = [
                            "LogisticRegression",
                            "RandomForestClassifier",
                            "LinearSVC",
                            "SGDClassifier",
                            "RidgeClassifier",
                            "PassiveAggressiveClassifier"
                           ]
        classifiers = list()
        
        for classifier_name in classifier_names:
            classifier = train_gridsearch_classifier_pipeline(train_df,classifier_name,
                                                              val_col_name,target_col)
            classifiers.append(classifier)
                
        best_model = None
        best_cv_acc = -100
        best_classifier_name = None
        
        X = train_df[train_col_name].as_matrix()
        Y = train_df[target_col].as_matrix().ravel()
        
        for i,clf in enumerate(classifiers):
            
            clf.fit(X,Y)
            label_encoder = clf.label_encoder_
            Y_prob = clf.predict_proba(test_df[val_col_name])
            Y_index = np.argmax(Y_prob,axis=1)
            Y_index = Y_index.ravel()
            # https://github.com/scikit-learn/scikit-learn/issues/10449
            Y_pred = label_encoder.inverse_transform(Y_index)
            Y_test = test_df[target_col]
            cv_acc = accuracy_score(Y_test, Y_pred)
            if cv_acc > best_cv_acc:
                best_cv_acc = cv_acc
                best_model = clf
                best_classifier_name = classifier_names[i]
            print("Classifier Name : ",classifier_names[i])
            print("Validation Accuracy : ",cv_acc)
            
        X = df[train_col_name].as_matrix()
        Y = df[target_col].as_matrix().ravel()
            
        best_model.fit(X,Y)
        
        print("Best classifier Name : ",best_classifier_name)
        print("Best validation Accuracy : ",best_cv_acc)
            
        return TeamPredictionModel(best_model,best_cv_acc,best_classifier_name)


    
    def save(self,filepath):
        model_dict = {"model":self.best_model,
                      "validation_accuracy":self.best_cv_acc,
                      "model_name":self.best_classifier_name
                     }
        with open(filepath,"wb") as fp:
            pickle.dump(model_dict,fp,pickle.HIGHEST_PROTOCOL)
        

In [58]:
def preprocess_data(data_filepath):
    df = pd.read_csv(data_filepath,low_memory=False)
    truncated_df_null_removed = drop_nan(df,training_cols,target_col)
    df_train_test_target = create_and_clean_train_test_target_column(truncated_df_null_removed)
    df_cleaned = df_train_test_target.groupby(target_col).filter(lambda x: len(x) > label_freq_threshold)
    return df_cleaned
    
def train_and_save_model(df_cleaned,target_col,train_col_name,val_col_name):
    team_pred_model = TeamPredictionModel.train(df_cleaned,target_col,train_col_name,val_col_name)
    team_pred_model.save("./team_model.pkl")

In [59]:
def main():
    df_cleaned = preprocess_data(data_filepath)
    train_and_save_model(df_cleaned,target_col,train_col_name,val_col_name)
    

In [60]:
%%time

if __name__ == "__main__":
    main()

method drop_nan started:

number of records before processing:  31932
number of records after processing:  31739
NaN value Present?  True
NaN value Present?  False

method drop_nan ended.


method create_and_clean_train_test_target_column started:


method create_and_clean_train_test_target_column ended.


Performing grid search for  LogisticRegression  ...
pipeline: ['tfidf', 'clf']
parameters:
{'clf__multi_class': ('ovr',),
 'tfidf__lowercase': (True,),
 'tfidf__max_df': (0.9, 0.85),
 'tfidf__min_df': (2, 5),
 'tfidf__ngram_range': ((1, 3),),
 'tfidf__stop_words': ('english',),
 'tfidf__sublinear_tf': (True,)}

Best score: 0.731

Best parameters set:
	clf__multi_class: 'ovr'
	tfidf__lowercase: True
	tfidf__max_df: 0.9
	tfidf__min_df: 5
	tfidf__ngram_range: (1, 3)
	tfidf__stop_words: 'english'
	tfidf__sublinear_tf: True
Performing grid search for  RandomForestClassifier  ...
pipeline: ['tfidf', 'clf']
parameters:
{'clf__max_depth': (11,),
 'clf__min_samples_leaf': (8,),
 'clf__n_estim

  if diff:


Classifier Name :  LogisticRegression
Validation Accuracy :  0.7661620939766635


  if diff:


Classifier Name :  RandomForestClassifier
Validation Accuracy :  0.6980447808262378


  if diff:


Classifier Name :  LinearSVC
Validation Accuracy :  0.7625354777672658


  if diff:


Classifier Name :  SGDClassifier
Validation Accuracy :  0.7737306843267108


  if diff:


Classifier Name :  RidgeClassifier
Validation Accuracy :  0.7732576474298328


  if diff:


Classifier Name :  PassiveAggressiveClassifier
Validation Accuracy :  0.7385682749921161
Best classifier Name :  SGDClassifier
Best validation Accuracy :  0.7737306843267108
CPU times: user 1min 28s, sys: 2.97 s, total: 1min 31s
Wall time: 4min 48s


In [61]:
model_dict = joblib.load("team_model.pkl")

In [62]:
clf = model_dict["model"]
df_cleaned = preprocess_data(data_filepath)
target = df_cleaned[target_col].as_matrix()
train_df, test_df = train_test_split(df_cleaned,test_size=0.2,stratify=target,random_state=0)
label_encoder = clf.label_encoder_
Y_prob = clf.predict_proba(test_df[val_col_name])
Y_index = np.argmax(Y_prob,axis=1)
Y_index = Y_index.ravel()
# https://github.com/scikit-learn/scikit-learn/issues/10449
Y_pred = label_encoder.inverse_transform(Y_index)
Y_test = test_df[target_col]

method drop_nan started:

number of records before processing:  31932
number of records after processing:  31739
NaN value Present?  True
NaN value Present?  False

method drop_nan ended.


method create_and_clean_train_test_target_column started:


method create_and_clean_train_test_target_column ended.




  if diff:


In [63]:
accuracy_score(Y_test, Y_pred)

0.771838536739199

In [64]:
classification = classification_report(Y_test, Y_pred)
print(classification)

                                 precision    recall  f1-score   support

        Application Development       0.73      0.77      0.75       639
          Business Applications       0.77      0.73      0.75       438
          Business Intelligence       0.65      0.37      0.47        35
               Business Support       0.15      0.07      0.09        45
                         CS-Ops       0.90      0.66      0.76        29
                    DevOps/100%       0.72      0.61      0.66       133
        Facilities / Operations       0.00      0.00      0.00        10
                             IT       0.00      0.00      0.00        24
         Infrastructure Support       0.62      0.50      0.55      1010
Order Management: SKU Approvals       1.00      0.17      0.29         6
                    SAP-WorkDay       0.57      0.71      0.63       122
              SalesOps Americas       0.91      0.54      0.67        54
                       Security       0.96      0.

  'precision', 'predicted', average, warn_for)


In [65]:
conf = pd.crosstab(Y_test.OwnerTeam, Y_pred)

In [53]:
conf

col_0,Application Development,Business Applications,Business Intelligence,Business Support,CS-Ops,DevOps/100%,Facilities / Operations,IT,Infrastructure Support,Order Management: SKU Approvals,SAP-WorkDay,SalesOps Americas,Security,Service Desk,Shavlik,Web Development,Web Marketing
OwnerTeam,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Application Development,483,34,0,2,0,0,0,0,30,0,9,0,0,80,0,1,0
Business Applications,32,329,0,4,1,0,0,0,2,0,19,3,0,48,0,0,0
Business Intelligence,3,2,13,0,0,0,0,0,4,0,7,0,0,6,0,0,0
Business Support,1,12,0,6,0,0,0,0,6,0,0,0,0,20,0,0,0
CS-Ops,0,4,1,0,20,0,0,0,0,0,0,0,0,4,0,0,0
DevOps/100%,1,0,0,0,0,79,0,0,31,0,0,0,1,21,0,0,0
Facilities / Operations,0,0,0,0,0,0,1,0,1,0,0,0,0,8,0,0,0
IT,1,0,0,0,0,2,0,0,1,0,0,0,0,20,0,0,0
Infrastructure Support,10,4,0,0,0,2,0,0,575,0,3,1,4,410,1,0,0
Order Management: SKU Approvals,5,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [54]:
train_df[train_df["OwnerTeam"] == "IT"]

Unnamed: 0,train_text,val_text,OwnerTeam
1389,assigned a resharper license for bartosz sent ...,request for resharper license id like to reque...,IT
9104,barbie discovered the issue automatically clos...,support line south africa our partner in sa is...,IT
1201,closed at customer request automatically close...,folder corrupted by osiris virus we have a fil...,IT
7368,completed automatically closed network cable l...,network cable lan looks like i have a dodgy ne...,IT
7080,hey andyif the customer leaves a voicemail the...,fw voicemail from message id i received this e...,IT
4291,your key card has been disabled when you are b...,disable building access card i lost my access ...,IT
4511,changed the email address to it incidents in q...,password manager template update could you ple...,IT
6860,hi oliviaalans details have been added to the ...,galway office mailing list can you please add ...,IT
8289,new door access card created automatically clo...,new swipe can i please request a new swipe min...,IT
15204,crash dump sugested graphics drivers updated i...,bsod on dock since friday john keeps getting a...,IT
