In [2]:
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
import pandas as pd
import numpy as np
# Load the attribute/label table; 'data.csv' is a relative path, so it is
# resolved against the notebook's current working directory.
df = pd.read_csv('data.csv')


# Exploratory data analysis

In [3]:
# Print the whole frame (pandas elides the middle rows in the text repr),
# then print just its first five rows.
print(df)
print(df.head())

                  attribute  \
0                 nick name   
1                 user name   
2                 full name   
3                        id   
4                     email   
..                      ...   
155    prescription history   
156          discharge date   
157            is alcoholic   
158               is smoker   
159  primary care physician   

                                               classes  user  music  employee  \
0    ['user', 'music','employee','book','website','...     1      0         1   
1    ['user', 'music','employee','book','website','...     1      0         0   
2    ['user', 'music','employee','book','website','...     1      0         1   
3    ['user', 'music','employee','book','website','...     1      0         1   
4    ['user', 'music','employee','book','website','...     1      0         1   
..                                                 ...   ...    ...       ...   
155  ['user', 'music','employee','book','website','...     0

In [4]:
df.isna().any()

attribute        False
classes          False
user             False
music            False
employee         False
book             False
website          False
credit card      False
shopping cart    False
patient          False
dtype: bool

In [5]:
df.dtypes

attribute        object
classes          object
user              int64
music             int64
employee          int64
book              int64
website           int64
credit card       int64
shopping cart     int64
patient           int64
dtype: object

In [6]:
# Cast every class-indicator column from int64 to float, one column at a time,
# writing each converted column back into the same frame.
for label_col in ('user', 'music', 'employee', 'book', 'website',
                  'credit card', 'shopping cart', 'patient'):
    df[label_col] = df[label_col].astype(float)

In [7]:
df.dtypes

attribute         object
classes           object
user             float64
music            float64
employee         float64
book             float64
website          float64
credit card      float64
shopping cart    float64
patient          float64
dtype: object

# model

In [8]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.metrics import accuracy_score,hamming_loss,f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import LabelPowerset

In [9]:
!pip install neattext
import neattext as nt
import neattext.functions as nfx
# Dump the frame again, now that the label columns have been cast to float.
print(df)
# Run neattext's noise_scan on every attribute string. The resulting Series is
# neither assigned nor the cell's last expression, so nothing is kept or shown.
df['attribute'].apply(lambda x:nt.TextFrame(x).noise_scan())
# Apply neattext's remove_special_characters to each attribute string; the
# result is the text corpus that is vectorized in the feature-extraction cell.
corpus = df['attribute'].apply(nfx.remove_special_characters)



Defaulting to user installation because normal site-packages is not writeable
                  attribute  \
0                 nick name   
1                 user name   
2                 full name   
3                        id   
4                     email   
..                      ...   
155    prescription history   
156          discharge date   
157            is alcoholic   
158               is smoker   
159  primary care physician   

                                               classes  user  music  employee  \
0    ['user', 'music','employee','book','website','...   1.0    0.0       1.0   
1    ['user', 'music','employee','book','website','...   1.0    0.0       0.0   
2    ['user', 'music','employee','book','website','...   1.0    0.0       1.0   
3    ['user', 'music','employee','book','website','...   1.0    0.0       1.0   
4    ['user', 'music','employee','book','website','...   1.0    0.0       1.0   
..                                                 ...   ...   

Extracting features

In [10]:
# Fit a TF-IDF vectorizer on the attribute corpus and materialize the feature
# matrix as a dense array.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so test-set terms influence the vocabulary and IDF
# weights. Consider fitting on the training portion only.
tfidf = TfidfVectorizer()
Xfeatures = tfidf.fit_transform(corpus).toarray()

In [11]:
Xfeatures

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
# Targets: one indicator column per class.
target_columns = [
    'user', 'music', 'employee', 'book',
    'website', 'credit card', 'shopping cart', 'patient',
]
y = df[target_columns]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
split_parts = train_test_split(Xfeatures, y, test_size=0.30, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [13]:
from skmultilearn.problem_transform import BinaryRelevance

binary_rel_clf = BinaryRelevance(MultinomialNB())


In [14]:
binary_rel_clf.fit(X_train,y_train)


BinaryRelevance(classifier=MultinomialNB(), require_dense=[True, True])

In [15]:
br_prediction = binary_rel_clf.predict(X_test)


In [16]:
br_prediction.toarray()


array([[0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 

In [17]:
# For multilabel targets scikit-learn's accuracy_score is subset accuracy:
# a sample counts as correct only when all eight labels match exactly.
print(accuracy_score(y_test,br_prediction))
# Fraction of individual label assignments that are wrong; being the last
# expression, its value is what the cell displays.
hamming_loss(y_test,br_prediction)


0.0


0.13802083333333334

In [18]:
def build_model(model,mlb_estimator,xtrain,ytrain,xtest,ytest):
    """Train a multi-label wrapper around ``model`` and score it on a test set.

    Args:
        model: Base estimator instance handed to ``mlb_estimator``.
        mlb_estimator: Callable (for example a problem-transformation class)
            that wraps ``model`` and returns an object exposing ``fit`` and
            ``predict``.
        xtrain, ytrain: Training features and label indicators.
        xtest, ytest: Held-out features and label indicators.

    Returns:
        dict with the keys ``"accuracy:"`` (the trailing colon is part of the
        key), ``"f1score"`` (micro-averaged) and ``"hamming_loss"``.
    """
    wrapped = mlb_estimator(model)
    wrapped.fit(xtrain, ytrain)
    predicted = wrapped.predict(xtest)
    # The fitted wrapper is not returned; only its scores are.
    return {
        "accuracy:": accuracy_score(ytest, predicted),
        "f1score": f1_score(ytest, predicted, average='micro'),
        "hamming_loss": hamming_loss(ytest, predicted),
    }

In [19]:
from skmultilearn.problem_transform import ClassifierChain

clf_chain_model = build_model(MultinomialNB(),ClassifierChain,X_train,y_train,X_test,y_test)


In [20]:
clf_chain_model


{'accuracy:': 0.0, 'f1score': 0.0, 'hamming_loss': 0.13802083333333334}

# Model 3: Label Powerset

In [21]:
from skmultilearn.problem_transform import LabelPowerset

clf_labelP_model = build_model(MultinomialNB(),LabelPowerset,X_train,y_train,X_test,y_test)


In [22]:
from sklearn.metrics import precision_score

# Binary Relevance summary. The value printed under the "f1score" label is the
# micro-averaged F1, the same metric build_model reports for the other two
# models, so the three lines are directly comparable.
print("accuracy:",accuracy_score(y_test,br_prediction) ,"f1score",f1_score(y_test,br_prediction,average='micro'),"hammingloss",hamming_loss(y_test,br_prediction) )
print(clf_chain_model)
print(clf_labelP_model)


accuracy: 0.0 f1score 0.0 hammingloss 0.13802083333333334
{'accuracy:': 0.0, 'f1score': 0.0, 'hamming_loss': 0.13802083333333334}
{'accuracy:': 0.25, 'f1score': 0.27722772277227725, 'hamming_loss': 0.19010416666666666}


  _warn_prf(average, modifier, msg_start, len(result))


## prediction

In [23]:
# from sklearn.pipeline import make_pipeline
# from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# from sklearn.model_selection import cross_val_score
# lr = make_pipeline( CountVectorizer(ngram_range=(1,2)),TfidfTransformer(),MultinomialNB())
# scores = cross_val_score(lr, corpus, y, cv=5)  # Specify the number of folds (e.g., 5-fold cross-validation)
#
# # Print the cross-validated scores
# print("Cross-validated scores:", scores)
# print("Mean accuracy:", scores.mean())
# print("Standard deviation:", scores.std())

In [24]:
ex1 = 'gender'
ex1

'gender'

In [25]:
vec_example = tfidf.transform([ex1])

In [26]:
result=binary_rel_clf.predict(vec_example).toarray()
result

array([[0., 0., 0., 0., 0., 0., 0., 0.]])

In [27]:
def getpredictedclass(result2):
    """Print the name of every class flagged in the first row of ``result2``.

    ``result2`` is read as ``result2[0][i]`` for ``i`` from 0 to 7. Each
    position whose value equals 1 prints the matching class name on its own
    line, in column order. Nothing is returned.
    """
    class_names = (
        "user", "music", "employee", "book",
        "website", "credit card", "shopping cart", "patient",
    )
    for position, class_name in enumerate(class_names):
        if result2[0][position] == 1:
            print(class_name)

In [28]:
getpredictedclass(result)

In [29]:
# Fit a standalone LabelPowerset(MultinomialNB()) on the training split so a
# fitted estimator is available for prediction; build_model returns only the
# metrics dict, not the classifier it trained.
# NOTE(review): `pwoerset` is a typo for `powerset`; the name is left as is
# because the pickling cell refers to it.
model_label_pwoerset=LabelPowerset(MultinomialNB())
model_label_pwoerset.fit(X_train,y_train)
# Predict label indicators for the single vectorized example and densify them.
result2=model_label_pwoerset.predict(vec_example).toarray()
result2

array([[0, 0, 0, 0, 0, 0, 0, 1]], dtype=int64)

In [30]:
getpredictedclass(result2)

patient


In [35]:
import pickle

# save the model to disk
filename = 'finalized_model.sav'
# Open the file in a context manager so the handle is flushed and closed as
# soon as the dump finishes, instead of being left to the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(model_label_pwoerset, model_file)
# NOTE(review): only the classifier is persisted. Predicting on new text also
# requires the fitted `tfidf` vectorizer, which is not saved here.

# some time later...

# load the model from disk



patient
