In [1]:
import numpy as np
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier,RidgeClassifier,PassiveAggressiveClassifier
import pickle
from sklearn.metrics import precision_recall_fscore_support,accuracy_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib



In [2]:
data_filepath = "./finalout_less.csv"
label_freq_threshold = 10
cols = ['Resolution','Subject',
       'Symptom','ActualService',
       'OwnerTeam']

In [3]:
def remove_and_dedup_punctuation_numbers(text): 
    result = text.replace('[\w\d\.-]+@[\w\d\.-]+',' ').replace(
    '\d+', ' ').replace('_+', ' ').replace('+', ' ').replace(
    '.', ' ').replace('^https?:\/\/.*[\r\n]*', '').replace('[^\w]',' ').lower()
    result = re.sub (r'([^a-zA-Z\s.?!])', '', result)
    result = re.sub(r"([" + re.escape(string.punctuation) + r"])\1+", r"\1", result)
    result = re.sub( '\s+', ' ', result ).strip()
    return result

def clean_text(row):
    sub = row['Subject']
    sym = row['Symptom']
    res = row['Resolution']
    if row['Subject'] is None:
        sub = ''
    if row['Symptom'] is None:
        sym = ''
    if row['Resolution'] is None:
        res = ''
    train_text = sub+" "+sym+" "+res
    text_clean = remove_and_dedup_punctuation_numbers(train_text)
    row["Text_Train_Cleaned"] = text_clean
    
    test_text = sub+" "+sym
    text_clean = remove_and_dedup_punctuation_numbers(test_text)
    row["Text_Test_Cleaned"] = text_clean
    
    return row

def encode_decision(y):
    encoder= LabelEncoder()
    encoder.fit(y)
    y_encoded=encoder.transform(y)
    return y_encoded,encoder

In [4]:
class DataPreprocessor(object):
    def __init__(self):
        pass
    
    @classmethod
    def clean_df(cls,df):
        truncated_df_null_removed = df[cols].dropna()
        #print(len(truncated_df_null_removed))
        df_cleaned = truncated_df_null_removed.apply(clean_text,1)[["Text_Train_Cleaned","Text_Test_Cleaned","OwnerTeam","ActualService"]]
        #print(len(df_cleaned))
        training_dataframe = df_cleaned.groupby(["ActualService","OwnerTeam"]).filter(lambda x: len(x) > label_freq_threshold)
        #print(len(training_dataframe))
        return training_dataframe
      
    @classmethod
    def create_train_test_df(cls,df):
        y = df[["ActualService","OwnerTeam"]].as_matrix()
        train_dataframe, test_dataframe = train_test_split(df,test_size=0.2,stratify=y,random_state=0)
        return train_dataframe,test_dataframe

In [5]:
class TeamPredictionModel(object):
    def __init__(self,best_model,best_acc,vectorizer,encoder):
        self.best_model = best_model
        self.best_acc = best_acc
        self.vectorizer = vectorizer
        self.encoder = encoder
    
    @classmethod
    def train(cls,train_dataframe,test_dataframe,target_column):
        vectorizer = TfidfVectorizer( lowercase=True, sublinear_tf=True,ngram_range=(1,3),min_df=2,max_df=0.95,
                              stop_words='english')

        X_train = vectorizer.fit_transform(train_dataframe["Text_Train_Cleaned"])
        print(len(vectorizer.get_feature_names()) )
        print(X_train.shape)
        
        train_label = train_dataframe[target_column].as_matrix()
        Y_train,encoder = encode_decision(train_label)

        X_test = vectorizer.transform(test_dataframe["Text_Test_Cleaned"])
        print(X_test.shape)

        test_label = test_dataframe[target_column].as_matrix()
        Y_test = encoder.transform(test_label)

        clf_LR = LogisticRegression(random_state=0,multi_class='ovr')
        clf_RF = RandomForestClassifier(n_estimators=90, random_state=0, n_jobs=-1,max_depth=11,min_samples_leaf=8)
        clf_SVM = LinearSVC(penalty='l2', tol=1e-4,loss='hinge',max_iter=10,multi_class="ovr",class_weight='balanced')
        clf_SGD_l2 = SGDClassifier(loss='hinge',alpha=.0001, max_iter=10,  penalty='l2')
        clf_SGD_elastic = SGDClassifier(alpha=.0001, max_iter=10,n_jobs=-1, penalty="elasticnet")
        clf_ridge = RidgeClassifier(tol=1e-2, solver="sag",random_state=0)
        clf_pass = PassiveAggressiveClassifier(max_iter=50)
        classifiers = [clf_LR,clf_RF,clf_SVM,clf_SGD_l2,clf_SGD_elastic,clf_ridge,clf_pass]
        
        best_model = None
        best_acc = -100
        for clf in classifiers:
            model,acc = cls.model_stat(clf,X_train,Y_train,X_test,Y_test)
            if acc > best_acc:
                best_acc = acc
                best_model = model
            print(acc)
        return TeamPredictionModel(best_model,best_acc,vectorizer,encoder)
    
    @classmethod
    def model_stat(cls,model,X_train,Y_train,X_test,Y_test):
        clf_isotonic = CalibratedClassifierCV(model, cv=3, method='isotonic')
        clf_isotonic.fit(X_train, Y_train)
        Y_pred = clf_isotonic.predict(X_test)
        return clf_isotonic,accuracy_score(Y_test, Y_pred)
    
    
    def save(self,filepath):
        model_dict = {"model":self.best_model,
                      "accuracy":self.best_acc,
                      "vectorizer":self.vectorizer,
                      "encoder":self.encoder}
        with open(filepath,"wb") as fp:
            pickle.dump(model_dict,fp,pickle.HIGHEST_PROTOCOL)
        

In [6]:
input_df = pd.read_csv(data_filepath)
c_df = DataPreprocessor.clean_df(input_df)
train,test = DataPreprocessor.create_train_test_df(c_df)
service_pred_model = TeamPredictionModel.train(train,test,"ActualService")

177724
(14557, 177724)
(3640, 177724)
0.6510989010989011
0.6041208791208791
0.6458791208791209
0.657967032967033
0.6574175824175824
0.6576923076923077
0.65


In [7]:
service_pred_model.save("./service_model.pkl")

In [8]:
model_dict = joblib.load("./service_model.pkl")

In [9]:
def score(model_dict,subject,symptom):
    text = subject + " " + symptom
    text_cleaned = [remove_and_dedup_punctuation_numbers(text)]
    text_vectorized = model_dict["vectorizer"].transform(text_cleaned)
    result = model_dict["model"].predict_proba(text_vectorized)
    label_index = np.argmax(result)
    label = model_dict["encoder"].inverse_transform(label_index)
    confidence = result[0][label_index]
    response = {"service":label,"confidence":confidence}
    return response

In [10]:
subject = "Request for Former Lumension reporting access"
symptom = "ann.haehn@heatsoftware.com www.HEATsoftware.com (AutoClosed)te instance as well as access to the SSRS report cloud site for Lumension reports. Thank you"


In [11]:
score(model_dict,subject,symptom)

  if diff:


{'confidence': 0.2998423027049654, 'service': 'Service Desk'}