### IMPORTING REQUIRED LIBRARIES

1. DATA HANDLING

In [316]:
import pandas as pd
import numpy as np

2. DATA PREPROCESSING

In [317]:
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

3. MODEL DEVELOPMENT

In [318]:
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, recall_score, f1_score, log_loss
from sklearn.metrics import average_precision_score

4. SAVING THE MODEL 

In [319]:
import joblib

### DATASET 

In [320]:
# Loading preprocessed dataset
file_path = "/Users/lalitaneeharikavajjhala/Desktop/Research credits /Data/preprocessed_data.csv"
df = pd.read_csv(file_path)

In [321]:
# Previewing data head and extend the max column width
pd.set_option('display.max_colwidth', None)
df.head()

Unnamed: 0,tag,sentence
0,['obligation'],we will issue a certificate of completion for each manager trainee who completes the initial training program we require to our satisfaction each such person will be referred to a a certified manager
1,['obligation'],elephant talk bear the risk of and shall indemnify against high usage fraud and bed of it elephant talk customer
2,['obligation'],subject to the term and condition of this agreement aimmune shall be responsible for the development of the product a set forth herein aimmune itself or with or through it affiliate and sublicensees shall use commercially reasonable effort to perform the development activity for the product to i achieve the development milestone set forth in section and ii obtain regulatory approval for the product
3,['obligation'],ediets shall ensure that the ediets content complies with editorial guideline
4,['obligation'],auriemma will participate in one recording session annually during the service period of not more than two hour not including travel time to record a radio advertising spot at a date and location to be mutually agreed upon


In [322]:
# Shape of dataframe
row_count, column_count = df.shape
df.shape

(947, 2)

In [323]:
# Checking the data types
df.dtypes

tag         object
sentence    object
dtype: object

In [324]:
# Checking for null values
df.isnull().sum()

tag         0
sentence    0
dtype: int64

In [325]:
# Checking if there are any duplicate records
df[df.duplicated(keep=False)]

Unnamed: 0,tag,sentence


In [326]:
# Converting tags from strings to lists
df['tag'] = df['tag'].apply(lambda x: literal_eval(x))

In [327]:
# Vectorizing text/features
tfidf = TfidfVectorizer(analyzer='word', max_features=2000, ngram_range=(1,3)) 
X = tfidf.fit_transform(df['sentence'])
X

<947x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 48676 stored elements in Compressed Sparse Row format>

In [328]:
X

<947x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 48676 stored elements in Compressed Sparse Row format>

In [329]:
# Encoding tags
y = df['tag']
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(y)
y

array([[1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       ...,
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 1]])

In [330]:
# Checking shape
X.shape, y.shape 

((947, 2000), (947, 3))

### MODELLING

In [331]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0, shuffle=True, stratify=y)

In [332]:
# Instantiating models
sgd = SGDClassifier()
lr = LogisticRegression(solver='lbfgs')
svc = LinearSVC()
nb = MultinomialNB()

In [333]:

# Functions to calculate and print evaluation metrics
def print_score(y_pred, clf):
    print("Clf: ", clf.__class__.__name__)
    print("Accuracy: {:.2}".format(accuracy_score(y_test,y_pred)))
    print("Precision Score: {:.2}".format(average_precision_score(y_test,y_pred)))
    print("Recall Score: {:.2}".format(recall_score(y_test, y_pred, average = 'weighted')))
    print("F1 Score: {:.2}".format(f1_score(y_test, y_pred, average = 'weighted')))
    print("--------------------------")


In [334]:
# Training the models
for classifier in [sgd, lr, svc, nb]:
    clf = OneVsRestClassifier(classifier)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print_score(y_pred, classifier)

Clf:  SGDClassifier
Accuracy: 0.75
Precision Score: 0.79
Recall Score: 0.81
F1 Score: 0.85
--------------------------
Clf:  LogisticRegression
Accuracy: 0.68
Precision Score: 0.76
Recall Score: 0.68
F1 Score: 0.79
--------------------------
Clf:  LinearSVC
Accuracy: 0.75
Precision Score: 0.8
Recall Score: 0.8
F1 Score: 0.85
--------------------------
Clf:  MultinomialNB
Accuracy: 0.65
Precision Score: 0.73
Recall Score: 0.65
F1 Score: 0.76
--------------------------




### PREDICTIONS

In [335]:
# Making predictions
x = ['Arizona may sublicense the licenses granted herein to its Affiliates and Third Parties in the ordinary course of business in support of its and its Affiliates’ business, but not for the independent use of Third Parties, and the Company may sublicense the licenses granted herein to Third Parties, its Subsidiaries, AWP, controlled Affiliates, or any holding company that is a direct or indirect parent of the Company in the ordinary course of business in support of its and its Subsidiaries’ or controlled Affiliates’ business, but not for the independent use of Third Parties (each such Affiliate, Third Party, AWP or Subsidiary, a “Sublicensee”)']
# x = ['In this Agreement, “we,” “us” and “our” refers to Pizza Fusion Holding, Inc., the franchisor. ']
# x = ['This Agreement may be signed in counterparts and shall be deemed one original instrument.']
# x = ['XIMAGE agrees to respond to any telephone call made, within 30 minutes.']
# x = ["Each Party shall return to the other all of the other’s Confidential Information and any other material, information or samples relating to the Product which have been provided or made available to the other and shall not retain any copies and the Parties further agree not to make any further use of each other’s Confidential Information or any other information, data or samples relating to the Product provided or made available by the other Party, except as necessary to comply with its statutory, regulatory or licensing obligations; provided, however, that Kitov may retain such material, information and/or samples relating to the Product as may be necessary for Kitov to continue to sell the Product as permitted by Section ​5.4.4 below, following which, Kitov shall refrain from making any further use of Dexcel’s Confidential Information or any other information, data or samples and shall return any remaining Confidential Information and material, information or samples relating to the Product."]
xt = tfidf.transform(x)
print("Prediction: ", clf.predict(xt))
print("Tag/s: ", multilabel.inverse_transform(clf.predict(xt)))

Prediction:  [[0 0 1]]
Tag/s:  [('prohibition',)]


In [336]:
x = ["Each Party shall return to the other all of the other’s Confidential Information and any other material, information or samples relating to the Product which have been provided or made available to the other and shall not retain any copies and the Parties further agree not to make any further use of each other’s Confidential Information or any other information, data or samples relating to the Product provided or made available by the other Party, except as necessary to comply with its statutory, regulatory or licensing obligations; provided, however, that Kitov may retain such material, information and/or samples relating to the Product as may be necessary for Kitov to continue to sell the Product as permitted by Section ​5.4.4 below, following which, Kitov shall refrain from making any further use of Dexcel’s Confidential Information or any other information, data or samples and shall return any remaining Confidential Information and material, information or samples relating to the Product."]
xt = tfidf.transform(x)
print("Prediction: ", clf.predict(xt))
print("Tag/s: ", multilabel.inverse_transform(clf.predict(xt)))

Prediction:  [[1 0 0]]
Tag/s:  [('obligation',)]


### SAVING THE MODELS

In [337]:
# Save vecotizer 
# joblib.dump(tfidf, "/Users/lalitaneeharikavajjhala/Desktop/Research credits /Models/MultiLabelVectorizer.pkl")

In [338]:
# Save binarizer 
# joblib.dump(multilabek, "/Users/lalitaneeharikavajjhala/Desktop/Research credits /Models/MultiLabelBinarizer_NonNN.pkl")

In [339]:
# Save model
# joblib.dump(clf, "/Users/lalitaneeharikavajjhala/Desktop/Research credits /Models/MultiLabelModel.pkl")