In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer ,CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB , MultinomialNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix,hamming_loss
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# 
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.adapt import MLkNN

In [3]:
# Load the pre-cleaned product descriptions (one row per text/label pair).
df = pd.read_csv('cleaned_text.csv')
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,text,id,label,cleaned_text
0,"Keep your gloves, hats, coats and jackets toge...",122885,Assembly Required,keep glove hat coat jacket togeth need entrywa...
1,"Keep your gloves, hats, coats and jackets toge...",122885,Finish,keep glove hat coat jacket togeth need entrywa...
2,"Keep your gloves, hats, coats and jackets toge...",122885,Hardware Included,keep glove hat coat jacket togeth need entrywa...
3,The Home Dynamix Serendipity Ivory 5 ft. 2 in....,188958,Commercial / Residential,home dynamix serendip ivori ft x ft area r...
4,The Home Dynamix Serendipity Ivory 5 ft. 2 in....,188958,Features,home dynamix serendip ivori ft x ft area r...


In [4]:
# Keep only the columns used downstream: cleaned text, product id and label.
df = df.loc[:, ['cleaned_text', 'id', 'label']]

In [5]:

# One-hot encode the 'label' column. An empty prefix *and* an empty separator
# name every dummy column exactly after its label, so no regex clean-up of the
# column names is needed afterwards. (The previous
# `columns.str.replace(r'^.*_', '')` silently stops matching on pandas >= 2.0,
# where `regex` defaults to False, and its greedy pattern would also truncate
# any label that itself contains an underscore.)
# dtype=int keeps the indicators as 0/1 instead of booleans on newer pandas.
df_encoded = pd.get_dummies(
    df, columns=['label'], prefix='', prefix_sep='', dtype=int
)

# Collapse to one row per product id. max() over the indicator columns yields
# the multi-hot 0/1 label vector; for 'cleaned_text' it keeps the greatest
# string of the group (presumably identical for all rows sharing an id).
df_grouped = (
    df_encoded
    .groupby('id')
    .max()
    .reset_index()
    .rename(columns={'cleaned_text': 'text'})  # downstream cells expect 'text'
)
# Show one random row of the resulting DataFrame
df_grouped.sample(1)

Unnamed: 0,id,text,Assembly Required,Color,Commercial / Residential,ENERGY STAR Certified,Features,Finish,Flooring Product Type,Hardware Included,Included,Indoor/Outdoor,Package Quantity,Shape,Tools Product Type,Voltage (volts),Wattage (watts)
39725,220147,led light fixtur hampton bay featur low profil...,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0


In [6]:
# TF-IDF over the cleaned text, vocabulary capped at 5000 terms.
tfidf = TfidfVectorizer(max_features=5000)
# Fit on every grouped document and materialise the result as a dense array.
Xfeatures = (
    tfidf
    .fit_transform(df_grouped['text'])
    .toarray()
)

In [7]:
# Multi-label target: one indicator column per attribute label.
y = df_grouped.loc[:, [
    'Assembly Required',
    'Color',
    'Commercial / Residential',
    'ENERGY STAR Certified',
    'Features',
    'Finish',
    'Flooring Product Type',
    'Hardware Included',
    'Included',
    'Indoor/Outdoor',
    'Package Quantity',
    'Shape',
    'Tools Product Type',
    'Voltage (volts)',
    'Wattage (watts)',
]]

In [8]:
# Train/test split: hold out 30% of the rows, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    Xfeatures,
    y,
    test_size=0.3,
    random_state=42,
)

# Building the model

## Problem Transformation

In [9]:
def build_model(model,mlb_estimator,xtrain,ytrain,xtest,ytest):
    """Wrap ``model`` in a multi-label strategy, fit it and report its scores.

    Parameters
    ----------
    model
        Base estimator handed to ``mlb_estimator``.
    mlb_estimator
        Callable that takes the base estimator and returns an object exposing
        ``fit``, ``predict`` and ``score`` (e.g. ``BinaryRelevance``,
        ``ClassifierChain``, ``LabelPowerset``).
    xtrain, ytrain
        Training features and label indicator matrix.
    xtest, ytest
        Held-out features and label indicator matrix.

    Returns
    -------
    tuple
        ``(result, clf)`` where ``result`` is a one-line summary string with
        the train score, the test accuracy and the Hamming loss expressed as
        a percentage (labelled ``Hamming_score``), and ``clf`` is the fitted
        wrapper.

    Side effects
    ------------
    Prints ``model`` to stdout.
    """
    clf = mlb_estimator(model)
    clf.fit(xtrain, ytrain)
    clf_predictions = clf.predict(xtest)
    # Score on the data this call was given. The earlier version read the
    # notebook-level X_train / y_train here, which reports the wrong number
    # (or raises NameError) whenever a different training set is passed in.
    train_acc = clf.score(xtrain, ytrain)
    test_acc = accuracy_score(ytest, clf_predictions)
    # hamming_loss is a fraction of wrong labels; scale it to a percentage.
    ham = hamming_loss(ytest, clf_predictions) * 100
    result = f'Train : {train_acc} || Test : {test_acc} || Hamming_score : {ham}'
    print(model)
    return result , clf

In [10]:
def output(one_df , row_number=0):
    """Print the names of the columns whose value in one row equals 1.

    Each matching column name is written to stdout followed by a comma, with
    no trailing newline. Columns are visited in ``one_df.columns`` order and
    the row is selected by position (``iloc``) via ``row_number``.
    """
    flagged = (
        name
        for name in one_df.columns
        if one_df[name].iloc[row_number] == 1
    )
    for name in flagged:
        print(name, end=',')

In [13]:
# Base estimator that the build_model(...) calls below wrap in a
# problem-transformation strategy.
model = XGBClassifier()

In [None]:
"""
RandomForestClassifier()
'Train : 0.9991064677984741 || Test : 0.8337743565070964 || Hamming_score : 2.1329484403816856'
LogisticRegression:
Train : 0.86634820262561 || Test : 0.8106807794082271 || Hamming_score : 2.4499505519471843
"""

### Binary relevance technique

In [17]:
# build_model returns (summary string, fitted classifier); the whole tuple is
# kept under one name here and displayed as the cell output.
clf_Binary_rev_model = build_model(
    model=model,
    mlb_estimator=BinaryRelevance,
    xtrain=X_train,
    ytrain=y_train,
    xtest=X_test,
    ytest=y_test,
)
clf_Binary_rev_model

LabelPowerset(classifier=LogisticRegression(solver='liblinear'),
              require_dense=[True, True])


('Train : 0.6907347584026393 || Test : 0.6394836019565392 || Hamming_score : 3.3656750327426295',
 BinaryRelevance(classifier=LabelPowerset(classifier=LogisticRegression(solver='liblinear'),
                                          require_dense=[True, True]),
                 require_dense=[False, False]))

### Classifier chains technique

In [None]:
# build_model returns (summary string, fitted classifier); the whole tuple is
# kept under one name here and displayed as the cell output.
clf_chain_model = build_model(
    model=model,
    mlb_estimator=ClassifierChain,
    xtrain=X_train,
    ytrain=y_train,
    xtest=X_test,
    ytest=y_test,
)
clf_chain_model

### Label Powerset technique

In [None]:
# Unpack the (summary string, fitted classifier) pair returned by build_model.
# NOTE(review): this rebinds `model` from the base estimator to the fitted
# LabelPowerset wrapper. The following cell's predict_proba call relies on
# that, but re-running any earlier build_model cell afterwards would wrap the
# wrapper instead of the base estimator -- restart the kernel before re-running.
clf_labelP_model, model = build_model(
    model=model,
    mlb_estimator=LabelPowerset,
    xtrain=X_train,
    ytrain=y_train,
    xtest=X_test,
    ytest=y_test,
)
clf_labelP_model

In [15]:
# Label names, in the same order as the target matrix columns.
cols = y.columns
# Per-label probabilities for the held-out rows, converted to a dense array.
pred = model.predict_proba(X_test).toarray()
# Tabulate the probabilities with one column per label.
one_df = pd.DataFrame(data=pred, columns=cols)
one_df

Unnamed: 0,Assembly Required,Color,Commercial / Residential,ENERGY STAR Certified,Features,Finish,Flooring Product Type,Hardware Included,Included,Indoor/Outdoor,Package Quantity,Shape,Tools Product Type,Voltage (volts),Wattage (watts)
0,0.024744,0.048232,0.046443,0.738547,0.028289,0.019453,0.014283,0.192352,0.041110,0.136336,0.045803,0.041038,0.019748,0.058622,0.451099
1,0.054819,0.074436,0.047836,0.084432,0.028083,0.037152,0.020642,0.046453,0.208064,0.081105,0.056341,0.041022,0.635398,0.094967,0.043088
2,0.037384,0.035273,0.025495,0.028064,0.020044,0.038213,0.013878,0.031318,0.042465,0.046758,0.070601,0.021453,0.784820,0.045094,0.019309
3,0.018578,0.032533,0.025002,0.837792,0.021360,0.013279,0.007814,0.039362,0.023528,0.049011,0.022206,0.020273,0.013541,0.062625,0.559596
4,0.095623,0.075109,0.655035,0.694964,0.647256,0.018808,0.016525,0.026033,0.651362,0.748904,0.024663,0.025039,0.030362,0.112065,0.083496
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12466,0.091027,0.078976,0.093130,0.072144,0.047648,0.039665,0.032178,0.099095,0.553848,0.091949,0.040071,0.581822,0.024177,0.442056,0.425850
12467,0.048659,0.044750,0.025961,0.025486,0.648943,0.174319,0.009219,0.076499,0.643900,0.032148,0.045232,0.028473,0.012763,0.024383,0.016471
12468,0.064498,0.046700,0.038041,0.028725,0.031082,0.774870,0.013472,0.047991,0.053860,0.042125,0.029548,0.040011,0.010401,0.028185,0.021685
12469,0.069387,0.111970,0.117467,0.347756,0.057252,0.038566,0.032650,0.070161,0.061632,0.159578,0.052583,0.062779,0.045095,0.454095,0.301134
