# Import Libraries

In [31]:
import numpy as np
import pandas as pd
import re
import string 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier,RidgeClassifier,PassiveAggressiveClassifier
import pickle
from sklearn.metrics import precision_recall_fscore_support,accuracy_score, classification_report
from sklearn.calibration import CalibratedClassifierCV,_CalibratedClassifier
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from time import time
import logging
from sklearn.decomposition import TruncatedSVD



# Define Constants

In [91]:
data_filepath = "it_dec18_mod.csv"
label_freq_threshold = 10

# requirement for new data

cols = ['Resolution','Subject','body','ActualService']
training_cols = ['Resolution','Subject','body']
runtime_cols = ['Subject','body']
target_col = ['ActualService']

#internal declaration

train_col_name = "train_text"
val_col_name = "val_text"

# Data Preprocessing

In [92]:
def remove_and_dedup_punctuation_numbers(text): 
    result = re.sub('<.*?>', '', text.replace("(AutoClosed)","").replace("\n","").replace("\r","")\
                    .replace("&nbsp;","").replace("***",""))
    result = result.replace('[\w\d\.-]+@[\w\d\.-]+',' ').replace(
    '\d+', ' ').replace('_+', ' ').replace('+', ' ').replace(
    '.', ' ').replace('^https?:\/\/.*[\r\n]*', '').replace('[^\w]',' ').lower()
    result = re.sub (r'([^a-zA-Z\s.?!])', '', result)
    result = re.sub(r"([" + re.escape(string.punctuation) + r"])\1+", r"\1", result)
    result = re.sub( '\s+', ' ', result ).strip()
    return result


def drop_nan(df,training_cols,target_col):
    print("method drop_nan started:\n")
    print("number of records before processing: ",df.__len__())
    df1 = df[training_cols].dropna(how="all")
    indices_1 = df1.index
    df2 = df[target_col].dropna(how="all")
    indices_2 = df2.index
    indices_final = list(set.intersection(set(indices_1),set(indices_2)))
    return_df = df.iloc[indices_final][training_cols+target_col]
    print("number of records after processing: ",return_df.__len__())
    print("NaN value Present? ",return_df[target_col+training_cols].isnull().values.any())
    return_df.fillna("", inplace=True)
    print("NaN value Present? ",return_df[target_col+training_cols].isnull().values.any())
    print("\nmethod drop_nan ended.\n\n")
    return return_df

def create_and_clean_train_test_target_column(df):
    print("method create_and_clean_train_test_target_column started:\n")
    df[train_col_name] = df[training_cols].apply(lambda x: " ".join(x), axis=1)
    df[train_col_name] = df[train_col_name].map(remove_and_dedup_punctuation_numbers)
    df[val_col_name] = df[runtime_cols].apply(lambda x: " ".join(x), axis=1)
    df[val_col_name] = df[val_col_name].map(remove_and_dedup_punctuation_numbers)
    print("\nmethod create_and_clean_train_test_target_column ended.\n\n")
    return df[[train_col_name,val_col_name]+target_col]

# Modeling

In [93]:

def encode_decision(y):
    encoder= LabelEncoder()
    y = y.ravel()
    encoder.fit(y)
    y_encoded=encoder.transform(y)
    return y_encoded,encoder


def get_tfidf_parameters():
    param =  {
        'tfidf__lowercase':(True,),
        'tfidf__stop_words':('english',),
        'tfidf__max_df': (0.90,0.85),
        'tfidf__min_df': (2,5),
        'tfidf__sublinear_tf': (True,),
        'tfidf__ngram_range': ((1,3),)
    }
    return param

def get_SGDClassifier_parameters():
    param = {
        'clf__alpha': (0.0001,),
        'clf__penalty': ('elasticnet',),
        'clf__loss': ('hinge',),
        'clf__max_iter': (20,)
    }
    return param

def get_RidgeClassifier_parameters():
    param = {
        'clf__tol': (1e-2,),
        'clf__solver': ('sag',)
    }
    return param


def get_LinearSVC_parameters():
    param = {
        'clf__penalty': ('l2',),
        'clf__loss': ('hinge',),
        'clf__tol': (1e-4,),
        'clf__max_iter': (10,),
        'clf__multi_class': ('ovr',),
        'clf__class_weight': ('balanced',)
    }
    return param


def get_RandomForestClassifier_parameters():
    param = {
        'clf__n_estimators': (90,),
        'clf__max_depth': (11,),
        'clf__min_samples_leaf': (8,)
    }
    return param

def get_LogisticRegression_parameters():
    param = {
        'clf__multi_class': ('ovr',),
    }
    return param

def get_PassiveAggressiveClassifier_parameters():
    param = {
        'clf__max_iter': (50,),
    }
    return param

def print_model_stat(grid_search,pipeline,parameters,clfname):
    print("Performing grid search for ",clfname," ...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)    
    print()
    print("Best score: %0.3f" % grid_search.best_score_)
    print()
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
        
def get_clf_and_parameter(clfname):
    if clfname == "SGDClassifier":
        clf = SGDClassifier()
        clf_param = get_SGDClassifier_parameters()
    elif clfname == "RidgeClassifier":
        clf = RidgeClassifier()
        clf_param = get_RidgeClassifier_parameters()
    elif clfname == "LinearSVC":
        clf = LinearSVC()
        clf_param = get_LinearSVC_parameters()
    elif clfname == "RandomForestClassifier":
        clf = RandomForestClassifier()
        clf_param = get_RandomForestClassifier_parameters()
    elif clfname == "LogisticRegression":
        clf = LogisticRegression()
        clf_param = get_LogisticRegression_parameters()
    elif clfname == "PassiveAggressiveClassifier":
        clf = PassiveAggressiveClassifier()
        clf_param = get_PassiveAggressiveClassifier_parameters()
    else:
        raise BaseException("Unknown Classifier.")
        
    return clf,clf_param
        
def train_gridsearch_classifier_pipeline(train_df,clfname,val_col_name,target_col,score = 'accuracy'):
    
    clf,clf_param = get_clf_and_parameter(clfname)
        
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', clf)])
    
    tfidf_param = get_tfidf_parameters()
    
    parameters = {**tfidf_param, **clf_param}
    
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1,scoring=score)
    X = train_df[val_col_name].as_matrix()
    Y = train_df[target_col].as_matrix().ravel()
    grid_search.fit(X, Y)
    
    print_model_stat(grid_search,pipeline,parameters,clfname)
    
    #https://github.com/scikit-learn/scikit-learn/issues/8710
    calibrated_pipeline = _CalibratedClassifier(base_estimator=grid_search.best_estimator_, method='isotonic')
    return calibrated_pipeline



class ServicePredictionModel(object):
    def __init__(self,best_model,best_acc,best_classifier_name):
        self.best_model = best_model
        self.best_cv_acc = best_acc
        self.best_classifier_name = best_classifier_name
    
    @classmethod
    def train(cls,df,target_col,train_col_name,val_col_name):
        target = df[target_col].as_matrix()
        train_df, test_df = train_test_split(df,test_size=0.2,stratify=target,random_state=0)
        
        classifier_names = [
                            "LogisticRegression",
                            "RandomForestClassifier",
                            "LinearSVC",
                            "SGDClassifier",
                            "RidgeClassifier",
                            "PassiveAggressiveClassifier"
                           ]
        classifiers = list()
        
        for classifier_name in classifier_names:
            classifier = train_gridsearch_classifier_pipeline(train_df,classifier_name,
                                                              val_col_name,target_col)
            classifiers.append(classifier)
                
        best_model = None
        best_cv_acc = -100
        best_classifier_name = None
        
        X = train_df[train_col_name].as_matrix()
        Y = train_df[target_col].as_matrix().ravel()
        
        for i,clf in enumerate(classifiers):
            
            clf.fit(X,Y)
            label_encoder = clf.label_encoder_
            Y_prob = clf.predict_proba(test_df[val_col_name])
            Y_index = np.argmax(Y_prob,axis=1)
            Y_index = Y_index.ravel()
            # https://github.com/scikit-learn/scikit-learn/issues/10449
            Y_pred = label_encoder.inverse_transform(Y_index)
            Y_test = test_df[target_col]
            cv_acc = accuracy_score(Y_test, Y_pred)
            if cv_acc > best_cv_acc:
                best_cv_acc = cv_acc
                best_model = clf
                best_classifier_name = classifier_names[i]
            print("Classifier Name : ",classifier_names[i])
            print("Validation Accuracy : ",cv_acc)
            
        X = df[train_col_name].as_matrix()
        Y = df[target_col].as_matrix().ravel()
            
        best_model.fit(X,Y)
        
        print("Best classifier Name : ",best_classifier_name)
        print("Best validation Accuracy : ",best_cv_acc)
            
        return ServicePredictionModel(best_model,best_cv_acc,best_classifier_name)


    
    def save(self,filepath):
        model_dict = {"model":self.best_model,
                      "validation_accuracy":self.best_cv_acc,
                      "model_name":self.best_classifier_name
                     }
        with open(filepath,"wb") as fp:
            pickle.dump(model_dict,fp,pickle.HIGHEST_PROTOCOL)
        

In [94]:
def preprocess_data(data_filepath):
    df = pd.read_csv(data_filepath,low_memory=False)
    truncated_df_null_removed = drop_nan(df,training_cols,target_col)
    df_train_test_target = create_and_clean_train_test_target_column(truncated_df_null_removed)
    df_cleaned = df_train_test_target.groupby(target_col).filter(lambda x: len(x) > label_freq_threshold)
    return df_cleaned
    
def train_and_save_model(df_cleaned,target_col,train_col_name,val_col_name):
    service_pred_model = ServicePredictionModel.train(df_cleaned,target_col,train_col_name,val_col_name)
    service_pred_model.save("./service_model.pkl")

In [95]:
def main():
    df_cleaned = preprocess_data(data_filepath)
    train_and_save_model(df_cleaned,target_col,train_col_name,val_col_name)

In [96]:
%%time

if __name__ == "__main__":
    main()

method drop_nan started:

number of records before processing:  31932
number of records after processing:  31739
NaN value Present?  True
NaN value Present?  False

method drop_nan ended.


method create_and_clean_train_test_target_column started:


method create_and_clean_train_test_target_column ended.


Performing grid search for  LogisticRegression  ...
pipeline: ['tfidf', 'clf']
parameters:
{'clf__multi_class': ('ovr',),
 'tfidf__lowercase': (True,),
 'tfidf__max_df': (0.9, 0.85),
 'tfidf__min_df': (2, 5),
 'tfidf__ngram_range': ((1, 3),),
 'tfidf__stop_words': ('english',),
 'tfidf__sublinear_tf': (True,)}

Best score: 0.684

Best parameters set:
	clf__multi_class: 'ovr'
	tfidf__lowercase: True
	tfidf__max_df: 0.9
	tfidf__min_df: 5
	tfidf__ngram_range: (1, 3)
	tfidf__stop_words: 'english'
	tfidf__sublinear_tf: True
Performing grid search for  RandomForestClassifier  ...
pipeline: ['tfidf', 'clf']
parameters:
{'clf__max_depth': (11,),
 'clf__min_samples_leaf': (8,),
 'clf__n_estim

  if diff:


Classifier Name :  LogisticRegression
Validation Accuracy :  0.6853808547547705


  if diff:


Classifier Name :  RandomForestClassifier
Validation Accuracy :  0.6760763286547863


  if diff:


Classifier Name :  LinearSVC
Validation Accuracy :  0.673079955842927


  if diff:


Classifier Name :  SGDClassifier
Validation Accuracy :  0.6869578930768018


  if diff:


Classifier Name :  RidgeClassifier
Validation Accuracy :  0.6830152972717237


  if diff:


Classifier Name :  PassiveAggressiveClassifier
Validation Accuracy :  0.642170004731115
Best classifier Name :  SGDClassifier
Best validation Accuracy :  0.6869578930768018
CPU times: user 1min 40s, sys: 3.02 s, total: 1min 43s
Wall time: 5min 28s


In [97]:
model_dict = joblib.load("service_model.pkl")

In [98]:
clf = model_dict["model"]
df_cleaned = preprocess_data(data_filepath)
target = df_cleaned[target_col].as_matrix()
train_df, test_df = train_test_split(df_cleaned,test_size=0.2,stratify=target,random_state=0)
label_encoder = clf.label_encoder_
Y_prob = clf.predict_proba(test_df[val_col_name])
Y_index = np.argmax(Y_prob,axis=1)
Y_index = Y_index.ravel()
# https://github.com/scikit-learn/scikit-learn/issues/10449
Y_pred = label_encoder.inverse_transform(Y_index)
Y_test = test_df[target_col]


method drop_nan started:

number of records before processing:  31932
number of records after processing:  31739
NaN value Present?  True
NaN value Present?  False

method drop_nan ended.


method create_and_clean_train_test_target_column started:


method create_and_clean_train_test_target_column ended.




  if diff:


In [99]:
accuracy_score(Y_test, Y_pred)

0.6931083425327236

In [100]:
conf = pd.crosstab(Y_test.ActualService, Y_pred)
classification = classification_report(Y_test, Y_pred)

  'precision', 'predicted', average, warn_for)


In [101]:
print(classification)

                                 precision    recall  f1-score   support

Application Development Service       0.59      0.61      0.60       585
             Audio Conferencing       0.00      0.00      0.00         2
                      BitLocker       0.36      0.24      0.29        17
     Business Analytics Service       0.45      0.35      0.39        37
        Communications Services       0.15      0.06      0.08        72
                   Data Service       0.00      0.00      0.00        16
                Desktop Service       0.50      0.02      0.04       171
               Domain Migration       0.73      0.63      0.68        35
                  Email Service       0.40      0.06      0.11       124
        Employee Administration       0.63      0.42      0.50       100
 Enterprise Application Service       0.00      0.00      0.00        70
          Facilities Management       0.50      0.17      0.25        12
              Financial Service       0.00      0.

In [102]:
conf

col_0,Application Development Service,BitLocker,Business Analytics Service,Communications Services,Desktop Service,Domain Migration,Email Service,Employee Administration,Enterprise Application Service,Facilities Management,...,Mobile Communication,Network Service,OKTA,OSUI,Printing Service,Sales and Marketing,Server Administration,Service Desk,Web Conferencing,WebEx
ActualService,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Application Development Service,355,0,7,1,0,2,1,1,0,0,...,0,1,0,1,0,2,5,205,0,0
Audio Conferencing,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
BitLocker,0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,13,0,0
Business Analytics Service,14,0,13,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,10,0,0
Communications Services,3,0,0,4,0,0,0,0,0,0,...,0,1,0,0,0,0,1,63,0,0
Data Service,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,11,0,0
Desktop Service,3,0,0,1,4,0,1,0,0,0,...,0,0,0,0,0,0,1,161,0,0
Domain Migration,0,0,0,0,0,22,0,0,0,0,...,0,0,0,0,0,0,3,10,0,0
Email Service,4,0,0,0,0,0,8,0,0,0,...,0,0,0,0,0,0,1,110,0,0
Employee Administration,14,0,0,0,0,0,0,42,0,0,...,0,0,0,0,0,0,1,42,0,0


In [103]:
train_df[train_df["ActualService"] == "Desktop Service"]

Unnamed: 0,train_text,val_text,ActualService
15661,deleted history automatically closed cannot lo...,cannot login to myheatconnect com i get an sso...,Desktop Service
26111,in dsm i see you have this laptop cal and i am...,my admin username and password for my machine ...,Desktop Service
16087,called korinna and all is working again after ...,fw wifi functionality? it seems dsm disabled a...,Desktop Service
4880,i am closing this ticket as you indicated yest...,onedrive migration in the instructions you sen...,Desktop Service
18720,conformed working closing case on andys approv...,bios password for my machine is it possible fo...,Desktop Service
1567,issue resolved since second restart and furthe...,start menu not working installed some windows ...,Desktop Service
17466,skype is working for michael automatically clo...,i cannot log in or get skpe to work besides sk...,Desktop Service
13098,jason can now logon without the sfb prompting ...,unable to log in to lync im unable to login in...,Desktop Service
11251,signed out and signed back in to office accoun...,office says its not activated some functionali...,Desktop Service
17514,issues confirmed caused by the latest esr rele...,machine performance the machine was upgraded w...,Desktop Service


In [215]:
old_model = pd.read_csv("it_dec18.csv")
old_model.shape

  interactivity=interactivity, compiler=compiler, result=result)


(31755, 132)

In [209]:
old_model = old_model[~(pd.isnull(old_model["DIUpdateServicePrediction"]))]
old_model["DIUpdateServicePrediction"] = old_model["DIUpdateServicePrediction"].str.strip()
print(old_model.shape)
# old_model["DIUpdateServicePrediction"].value_counts()

e = pd.crosstab(old_model["ActualService"],old_model["DIUpdateServicePrediction"], margins=True)
e.to_csv("temp1.csv")

a = classification_report(old_model["ActualService"],old_model["DIUpdateServicePrediction"])
print(a)

(5562, 132)

In [212]:
old_service = old_model["DIUpdateServicePrediction"].unique()

new_service = old_model["ActualService"].unique()

np.setdiff1d(old_model["ActualService"], old_model["DIUpdateServicePrediction"])

In [217]:
old_model = old_model[~(pd.isnull(old_model["DIUpdateTeamPrediction"]))]
old_model["DIUpdateTeamPrediction"] = old_model["DIUpdateTeamPrediction"].str.strip()
print(old_model.shape)

e = pd.crosstab(old_model["OwnerTeam"],old_model["DIUpdateTeamPrediction"], margins=True)
e.to_csv("Team.csv")

a = classification_report(old_model["OwnerTeam"],old_model["DIUpdateTeamPrediction"])
print(a)

(5562, 132)
                                     precision    recall  f1-score   support

            Application Development       0.47      0.59      0.52       201
              Business Applications       0.62      0.79      0.70       549
              Business Intelligence       0.33      0.47      0.39        15
                   Business Support       0.09      0.12      0.10        33
                             CS-Ops       0.00      0.00      0.00       146
                          Cloud Ops       0.00      0.00      0.00         1
                        DevOps/100%       0.00      0.00      0.00         0
            Facilities / Operations       1.00      0.17      0.29        12
             Finance: SKU Approvals       0.00      0.00      0.00         2
                      Global DevOps       1.00      0.01      0.01       307
             Infrastructure Support       0.42      0.56      0.48       674
Opportunity Team - Customer Success       0.00      0.00      0

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [220]:
old_service = old_model["DIUpdateTeamPrediction"].unique()

new_service = old_model["OwnerTeam"].unique()

print(np.setdiff1d(old_model["OwnerTeam"], old_model["DIUpdateTeamPrediction"]))
print( np.setdiff1d(old_model["DIUpdateTeamPrediction"],old_model["OwnerTeam"]))

['CS-Ops' 'Cloud Ops' 'Finance: SKU Approvals'
 'Opportunity Team - Customer Success' 'Order Management: SKU Approvals'
 'Pricing Committee' 'Professional Services Americas' 'SalesOps Americas'
 'SalesOps Global' 'SalesOps InsideSales' 'Shavlik']
['DevOps/100%']


In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
for label in dataset.columns:
    dataset[label] = LabelEncoder().fit(dataset[label]).transform(dataset[label])
    
X = dataset.drop(['target'],axis=1)
Y = dataset['target']
#model = DecisionTreeClassifier(criterion='entropy',max_depth=1)
#AdaBoost = AdaBoostClassifier(base_estimator= model,n_estimators=400,learning_rate=1)
AdaBoost = AdaBoostClassifier(n_estimators=400,learning_rate=1,algorithm='SAMME')
AdaBoost.fit(X,Y)
prediction = AdaBoost.score(X,Y)
print('The accuracy is: ',prediction*100,'%')