In [1]:
# import libraries
import pandas as pd
import numpy as np
import timeit

from statistics import median, mean

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer

# Notebook-wide pandas display limits.
# NOTE(review): with max_rows set to 90000, displaying a large frame renders
# every row; prefer .head() / .sample() when inspecting data.
pd.set_option('display.max_rows', 90000)
pd.set_option('display.max_columns', 50)

In [2]:
# Directory holding the case-study data files; the file names below are appended to it.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# cannot run on another machine without editing this line.
path = '/Users/nikhilsawal/OneDrive/machine_learning/data_science_case_studies/buildzoom/data/'

# Load the training frame, the test features and the test label file.
# NOTE(review): the first two files are read with read_table (tab-delimited by
# default) despite the .csv extension - presumably they are tab-separated; confirm.
train_x = pd.read_table(path + 'train_data.csv')
test_x = pd.read_table(path + 'xtest_data.csv')
test_y = pd.read_csv(path + 'ytest_pred.csv')

## License Type

In [3]:
import re

def get_uniques(df, col_name):

    """Return the distinct values found across every entry of a column.

    Each entry of ``df[col_name]`` is iterated (in this notebook it is the
    list of license types attached to one record), all items are pooled
    together and duplicates are discarded. For example, GENERAL CONTRACTOR
    LICENSE may appear in many records but is returned only once.

    Parameters
    ----------
    df : mapping of column name to an iterable of iterables (e.g. a DataFrame)
    col_name : name of the column to flatten

    Returns
    -------
    list
        The unique items, in no guaranteed order.
    """

    flattened = [item for entry in df[col_name] for item in entry]
    return list(set(flattened))


def get_pattern(df):

    """
    Group the license names found in ``df['licensetype']`` by their leading
    characters, so that truncated spellings of one license end up together.
    For example GENERAL CONTRACTOR LICENSE may appear as:

        ['GENERAL CO', 'GENERAL C', 'GENERA',
         'GENERAL CONTRACTOR LICENSE', 'GENERAL ', 'GENERAL CONT',
         'GENERAL CONTRA', 'GENERAL']

    For every distinct name a pattern ``'^' + name[:5] + '*'`` is built and
    all distinct names matching it form one group. Identical groups are
    kept only once.

    NOTE(review): the trailing ``*`` quantifies only the last character of
    the five-character prefix, so a name matches as soon as it starts with
    the prefix minus its last character. This is kept as-is because
    changing it would change the groups the downstream model is built on.

    Parameters
    ----------
    df : DataFrame whose 'licensetype' column holds lists of license names

    Returns
    -------
    list of list of str
        One list of names per distinct group.
    """

    # Sorted so the order of groups (and of names inside each group) is the
    # same on every run; the order of a set of strings changes between
    # interpreter sessions, and clean_license() uses the first group that
    # contains a name.
    uniques = sorted(get_uniques(df, 'licensetype'))

    license_groups = []
    for name in uniques:

        # The prefix is data, not a regular expression: escape it so that
        # characters such as '(' or '+' in a license name are matched
        # literally instead of raising re.error or altering the match.
        pattern = re.compile('^' + re.escape(name[:5]) + '*')
        matches = [candidate for candidate in uniques
                   if pattern.search(candidate) is not None]

        if matches not in license_groups:
            license_groups.append(matches)

    return license_groups
  

# Clean licensetype
def clean_license(inp_list, pattern):

    """
    Map every license name of one record to the canonical name of its group.

    ``pattern`` is the list of groups produced by get_pattern(). A name is
    replaced by the largest string (``max``) of the first group containing
    it, lower-cased and with spaces turned into underscores. For example,
    any of ['GENERAL CO', 'GENERAL C', 'GENERA',
            'GENERAL CONTRACTOR LICENSE', 'GENERAL ',
            'GENERAL CONT', 'GENERAL CONTRA', 'GENERAL']

    will be replaced with 'general_contractor_license'.

    The placeholder 'None' is passed through unchanged, and a name that is
    found in no group is dropped from the result.
    """

    cleaned = []
    for license_name in inp_list:
        if license_name == 'None':
            cleaned.append(license_name)
            continue

        # First group that lists this name, if any.
        group = next((g for g in pattern if license_name in g), None)
        if group is not None:
            cleaned.append(max(group).lower().replace(" ", "_"))
    return cleaned



In [4]:
# Step 1 (timed): derive the groups of license-name spellings from the training data.
start = timeit.default_timer()
# Missing license types become the literal string 'None' so the split below works on every row.
train_x.loc[:,'licensetype'] = train_x.loc[:,'licensetype'].fillna('None')
# Split each ', '-separated entry into a list of license names with duplicates removed.
train_x.loc[:,'licensetype'] = train_x.loc[:,'licensetype'].apply(lambda x: x.split(', ')).apply(lambda x: list(set(x)))
pattern = get_pattern(train_x)
stop = timeit.default_timer()
print('Get Pattern: ', stop - start)


# Step 2 (timed): replace every license name by its group's canonical name.
start = timeit.default_timer()
cleaned_license = [clean_license(item, pattern) for item in train_x['licensetype']]
# Sort before joining with '-' so the same set of licenses always yields the same string.
cleaned_license = ['-'.join(sorted(i)) for i in cleaned_license]
train_x.loc[:,'licensetype'] = cleaned_license
stop = timeit.default_timer()
print('Clean License: ', stop - start)

Get Pattern:  0.21043539200000083
Clean License:  0.17293184199999878


## businessname

In [5]:
def get_businessname(data, n):
    """Keep the ``n`` most frequent business names as categories.

    Parameters
    ----------
    data : DataFrame with a 'businessname' column
    n : number of most frequent names to keep

    Returns
    -------
    list of str
        One entry per row of ``data``: the name lower-cased with spaces
        replaced by underscores when it is among the ``n`` most frequent
        names, otherwise the literal 'Other'.
    """
    # A set gives constant-time membership tests; the previous numpy array
    # was scanned in full for every row.
    top_names = set(data['businessname'].value_counts().head(n).index)
    return [name.lower().replace(" ", "_") if name in top_names else 'Other'
            for name in data['businessname']]

In [6]:
# Timed: reduce businessname to its 100 most frequent values (everything else -> 'Other').
start = timeit.default_timer()
# Assign the filled column back instead of calling fillna(inplace=True) on
# train_x['businessname']: that chained in-place call acts on a temporary
# object under pandas Copy-on-Write and would leave train_x unchanged.
train_x.loc[:,'businessname'] = train_x.loc[:,'businessname'].fillna('None')
train_x.loc[:,'businessname'] = get_businessname(train_x, 100)
stop = timeit.default_timer()
print('Business Name: ', stop - start)


Business Name:  0.5888640219999992


## description

In [8]:
# Remove stopwords, non alphabetic characters

def nltk_description(data):

    """
    Normalise the free-text ``description`` column.

    Every description is tokenized, English stopwords are removed
    (case-insensitively), non-word characters, digits and whitespace are
    stripped from each remaining token, the token is stemmed to its root
    form with the Porter stemmer, and tokens that end up empty are dropped.

    Parameters
    ----------
    data : DataFrame (or mapping) with a 'description' column of strings

    Returns
    -------
    list of str
        One space-joined string of stems per description, in input order.
    """

    stop_words = set(stopwords.words("english"))
    ps = PorterStemmer()
    num_pattern = re.compile(r'\s*[\W0-9\s]\s*')

    sample = []

    for description in data["description"]:

        words = word_tokenize(description)
        no_stops = [word for word in words if word.lower() not in stop_words]
        # Clean and stem each token exactly once; the result is reused for
        # both the emptiness check and the output.
        stems = (ps.stem(num_pattern.sub("", word)) for word in no_stops)
        sample.append(" ".join(stem for stem in stems if stem != ''))

    return sample


In [9]:
# Timed: clean the description text of the training data.
start = timeit.default_timer()
# Missing descriptions become the literal string 'None' so every row can be tokenized.
train_x.loc[:,'description'] = train_x.loc[:,'description'].fillna('None')
train_x.loc[:,'description'] = nltk_description(train_x)
stop = timeit.default_timer()
print('Description: ', stop - start)



Description:  45.767612867000004


## subtype

In [10]:
def encode_subtype(data):
    """One-hot encode the ``subtype`` column.

    Missing subtypes are first replaced by the literal 'None'; this fill is
    written back into ``data``. A fresh OneHotEncoder is then fitted on the
    column and the encoded result is returned as a dense array.
    """
    filled_subtype = data.loc[:, 'subtype'].fillna('None')
    data.loc[:, 'subtype'] = filled_subtype
    subtype_values = data.loc[:, ['subtype']].values
    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(subtype_values)
    return encoded.toarray()

In [11]:
# Timed: normalise subtype to lower-case, underscore-separated labels.
start = timeit.default_timer()
# Missing subtypes become the literal string 'None' before the string methods are applied.
train_x['subtype'] = train_x['subtype'].fillna('None')
train_x['subtype'] = [i.lower().replace(" ", "_") for i in train_x['subtype']]
stop = timeit.default_timer()
print('Subtype: ', stop - start)



Subtype:  0.04627532799999301


## Data Preparation

In [12]:
def data_preprocessing(data):
    """Run every cleaning step of this notebook on one frame.

    Steps, in order: canonicalise ``licensetype``, reduce ``businessname``
    to its 100 most frequent values, add the binary ``has_ld`` flag from
    ``legaldescription``, clean ``description`` text, normalise ``subtype``
    and convert ``job_value`` to float.

    ``data`` is modified in place; the returned frame is a selection of it
    without the ``legaldescription`` column.

    NOTE(review): the license groups and the top-100 business names are
    derived from whichever frame is passed in, so separate calls for train
    and test data compute them independently of each other.

    Parameters
    ----------
    data : DataFrame with columns licensetype, businessname,
        legaldescription, description, subtype and job_value

    Returns
    -------
    DataFrame
        All columns of the processed frame except ``legaldescription``.
    """

    # license type: fill missing, split into de-duplicated lists, then
    # replace every name by its group's canonical spelling and join the
    # sorted names with '-'
    data.loc[:,'licensetype'] = data.loc[:,'licensetype'].fillna('None')
    data.loc[:,'licensetype'] = data.loc[:,'licensetype'].apply(lambda x: x.split(', ')).apply(lambda x: list(set(x)))

    pattern = get_pattern(data)
    cleaned_license = [clean_license(item, pattern) for item in data['licensetype']]
    cleaned_license = ['-'.join(sorted(i)) for i in cleaned_license]
    data.loc[:,'licensetype'] = cleaned_license

    # Set top business names as factors.
    # The filled column is assigned back rather than filled with
    # inplace=True on data['businessname']: that chained in-place call acts
    # on a temporary object under pandas Copy-on-Write and would leave
    # `data` unchanged.
    data.loc[:,'businessname'] = data.loc[:,'businessname'].fillna('None')
    data.loc[:,'businessname'] = get_businessname(data, 100)

    # Set binary value for legal description (1 when one is present)
    data.loc[:,'legaldescription'] = data['legaldescription'].fillna('None')
    data.loc[:,'has_ld'] = [1 if i!='None' else 0 for i in data['legaldescription']]

    # description: stopword removal, character clean-up and stemming
    # (the tf-idf step itself is not done here)
    data.loc[:,'description'] = data.loc[:,'description'].fillna('None')
    data.loc[:,'description'] = nltk_description(data)

    # Subtype: lower-case labels with underscores
    data['subtype'] = data['subtype'].fillna('None')
    data['subtype'] = [i.lower().replace(" ", "_") for i in data['subtype']]

    # Job Value: strip '$' and thousands separators, convert to float,
    # and treat missing values as 0.0
    cleaned_job_value = data['job_value'].apply(lambda x: float(str(x).replace('$', '').replace(',','')))
    data.loc[:,'job_value'] = cleaned_job_value
    data.loc[:,'job_value'] = data['job_value'].fillna(0.0)

    return data.loc[:, ~data.columns.isin(['legaldescription'])]


    

In [13]:
# Timed: run the full preprocessing function on the training frame.
# The return value is discarded; train_x itself is modified in place.
# NOTE(review): train_x has already been partly cleaned by the earlier cells,
# so this run operates on already-processed columns.
start = timeit.default_timer()
data_preprocessing(train_x)
stop = timeit.default_timer()
print('Data preprocessing: ', stop - start)



Data preprocessing:  40.470503638


In [14]:
# Timed: start again from the raw files and build the model inputs.
start = timeit.default_timer()
# Drop the frames that the exploratory cells above modified in place.
del(train_x)
del(test_x)
del(test_y)

train_x = pd.read_table(path + 'train_data.csv')
test_x = pd.read_table(path + 'xtest_data.csv')
test_y = pd.read_csv(path + 'ytest_pred.csv')

# Features are every training column except 'type'; the target is 1 for
# ELECTRICAL and 0 for anything else.
X = train_x.loc[:,~train_x.columns.isin(['type'])].copy()
y = train_x['type'].apply(lambda x: 1 if x=='ELECTRICAL' else 0) 
X_test = test_x.copy()
# NOTE(review): test_y is used as loaded, without the ELECTRICAL -> 1 mapping
# applied to y - presumably the file already holds 0/1 values; confirm.
y_test = test_y

# Train and test frames are preprocessed independently of each other.
X = data_preprocessing(X)
X_test = data_preprocessing(X_test)

stop = timeit.default_timer()
print('Train - Test data prep: ', stop - start)



Train - Test data prep:  60.303732585999995


## Train ML model

In [15]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score
import xgboost
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer


def machine_learning_prep(train_X, train_y, test_X, test_y):
    """Encode the features, fit an XGBoost classifier and evaluate it.

    The training data is split 75/25 (stratified, random_state=1) into
    train and validation parts. Categorical columns are one-hot encoded,
    the description column is replaced by the row sum of its tf-idf
    weights, and an XGBClassifier is fitted on the training part.

    Parameters
    ----------
    train_X : DataFrame of preprocessed training features
    train_y : Series of training targets
    test_X : DataFrame of preprocessed test features
    test_y : DataFrame/Series of test targets; only the first
        ``len(test_y)`` rows of ``test_X`` are used

    Returns
    -------
    tuple
        (validation confusion matrix, validation accuracy,
         test confusion matrix, test accuracy)
    """

    # Prep training data
    X = train_X.values
    y = train_y.values
    # Keep as many test rows as there are test labels (previously the
    # hard-coded count 25148).
    # NOTE(review): this assumes the first len(test_y) rows of test_X line
    # up with test_y - confirm against the data source.
    X_test = test_X.iloc[:len(test_y),:].values
    y_test = test_y.values

    # Train Validation split
    X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                      test_size=0.25,
                                                      random_state=1,
                                                      stratify=y)

    # Transform licensetype, businessname and subtype using OneHotEncoding.
    # The columns are addressed by position (0, 1 and 3), so this depends
    # on the column order of the incoming frames.
    # NOTE(review): `sparse` was renamed `sparse_output` in scikit-learn 1.2
    # and removed in 1.4; adjust when upgrading.
    column_trans = make_column_transformer((OneHotEncoder(sparse=False, handle_unknown='ignore'), [0, 1, 3]),
                                           remainder='passthrough')

    X_train = column_trans.fit_transform(X_train)
    X_val = column_trans.transform(X_val)
    X_test = column_trans.transform(X_test)

    # Transform Description using tf-idf: the vectorizer is fitted on the
    # training part only, and each row's weights are summed into a single
    # number that overwrites the text in place.
    # NOTE(review): index -3 is presumably the description column among the
    # passthrough columns - confirm against the frame's column order.
    tf = TfidfVectorizer(min_df=1, stop_words='english', lowercase=False)
    X_train[:,-3] = tf.fit_transform(X_train[:,-3]).toarray().sum(axis=1)
    X_val[:,-3] = tf.transform(X_val[:,-3]).toarray().sum(axis=1)
    X_test[:,-3] = tf.transform(X_test[:,-3]).toarray().sum(axis=1)

    # Training model with XGBoost
    classifier = XGBClassifier(use_label_encoder=False)
    classifier.fit(X_train, y_train)

    # Evaluating model on the validation split
    y_pred = classifier.predict(X_val)
    cm = confusion_matrix(y_val, y_pred)
    accuracy = accuracy_score(y_val, y_pred)

    # Evaluating model on test
    y_test_pred = classifier.predict(X_test)
    cm_test = confusion_matrix(y_test, y_test_pred)
    accuracy_test = accuracy_score(y_test, y_test_pred)

    return cm, accuracy, cm_test, accuracy_test
    

# Baseline Model

In [21]:
# Timed: fit the baseline model and report its metrics.
start = timeit.default_timer()
cm, accuracy, cm_test, accuracy_test = machine_learning_prep(X, y, X_test, y_test)
print('\n')
# The first pair of metrics is computed on the held-out validation split,
# not on the rows the model was fitted on, so it is labelled accordingly.
print('Validation Accuracy')
print(cm, '\n', accuracy, '\n')
print('Test Accuracy')
print(cm_test, '\n', accuracy_test)

stop = timeit.default_timer()
print('\n')
print('Baseline Model Training: ', stop - start)





Training Accuracy
[[17429   448]
 [  687  6475]] 
 0.9546707136866488 

Test Accuracy
[[14142  4076]
 [ 4091  2839]] 
 0.6752425640209957


Baseline Model Training:  0.0008582729933550581


# Tutorial

In [None]:
# Installs xgboost with the Python interpreter of the running kernel.
# NOTE(review): xgboost is already imported by an earlier cell, so on a fresh
# environment this cell has to run before that import; the version is not pinned.
import sys
!{sys.executable} -m pip install xgboost