In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict
import textdistance
from sklearn.neighbors import KNeighborsClassifier

1. Read csv file with all the data (Train & test)
2. Fill NaN values with 0
3. split test and train with TargetAttribute

In [2]:
complete_dataset = pd.read_csv("mergedOut.csv")
complete_dataset=complete_dataset.fillna(value=0)


Preprocessing:
    For removal of comments
    Check the similarity of attributes with labels and keep only those Attributes which are related to final target labels
    using textdistance (sequence based algorithm : Ratcliff-Obershelp similarity)

In [3]:

dataframe= complete_dataset[['Attribute']].copy()
dataframe = dataframe.drop_duplicates()


lookup_table = pd.read_csv('monitor_mediated_schema.txt')


final_word_dict_value = {}

for index, word in tqdm(dataframe.iterrows()):
    temp_word_list =[]
    for inner_index, inner_word in lookup_table.iterrows():
        val = textdistance.ratcliff_obershelp(str(word), str(inner_word))
        temp_word_list.append(val)
    
    final_word_dict_value[str(word[0])] = max(temp_word_list)
        

final_word_df_value = pd.DataFrame(final_word_dict_value.items())
final_word_df_value.columns = ['Attribute', 'score']


final_word_df_value1 = final_word_df_value[final_word_df_value['score'] >= 0.63]
final_word_df_value2 = final_word_df_value[final_word_df_value['score'] < 0.63]

1655it [02:28, 11.18it/s]


In [4]:
final_word_df_value1.to_csv("relevant.csv")
final_word_df_value2.to_csv("irRelevant.csv")

In [5]:
complete_dataset = complete_dataset.merge(final_word_df_value1, how='inner', on = ['Attribute'])


In [6]:
complete_dataset.to_csv('cleaned2.csv',sep=",")

In [276]:
complete_dataset_frame= pd.read_csv("cleaned2.csv")
complete_dataset_frame

Unnamed: 0,Source,Attribute,TargetAttribute,Value,Numeric,Boolean,String,DateTime,Currency,Dimensions,Weight,Scale,Ratio,Frequency,Power,Temperature,Colors
0,ca.pcpartpicker.com,aspect ratio,0,16:9 16:10 16:9 16:9 16:10 16:10 16:10 16:9 1...,0.0,0.0,0.0,0,0,0,0.0,0.0,100.0,0.0,0.0,0,0
1,catalog.com,aspect ratio,0,16:9 16:9 5:4 4:3 16:9 16:9 16:9 16:9 5:4 16:...,0.0,0.0,0.0,0,0,0,0.0,0.0,100.0,0.0,0.0,0,0
2,www.best-deal-items.com,aspect ratio,0,5:4 5:4 5:4 16:9 5:4 5:4 5:4 16:10 N/A 5:4 St...,0.0,0.0,0.0,0,0,0,0.0,0.0,100.0,0.0,0.0,0,0
3,www.cleverboxes.com,aspect ratio,0,16:9 16:10 16:9 16:9 16:9 16:9 16:9 16:9 16:9...,0.0,0.0,0.0,0,0,0,0.0,0.0,100.0,0.0,0.0,0,0
4,www.ebay.com,aspect ratio,0,5:4 16:9 16:9 16:9 16:9 4:3 ['16:9' '16:9' '1...,0.0,0.0,0.0,0,0,0,0.0,0.0,100.0,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3314,www.xpcpro.com,vesa bohrung,0,100 x 100 mm 100 x 100 mm 100 x 100 mm 100 x ...,0.0,0.0,100.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0
3315,www.xpcpro.com,farbe allgemein,0,schwarz schwarz schwarz schwarz schwarz schwa...,0.0,0.0,100.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0
3316,www.xpcpro.com,strahlungsnorm,0,TCO 5.0 TCO 6.0 TCO 06 TCO 6.0,0.0,0.0,100.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0
3317,www.xpcpro.com,io ports,0,Inputs: - 15-Pin D-Sub - Digital 29-Pin DVI-D...,0.0,0.0,100.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0


Encode Source, Attribute columns of the dataset as these are categories and scikit does not accept string values
Using LabelEncoder
Split the preprocessed dataframe to train and test based on the TargetAttribute (presence or absence)

In [238]:
complete_dataset_frame= pd.read_csv("cleaned.csv")
complete_dataset_frame=complete_dataset_frame.fillna(value=0)
complete_dataset_frame = complete_dataset_frame.drop(['Value','score','Unnamed: 0'], axis=1)
#complete_dataset_frame = complete_dataset_frame.apply(lambda x: [y if y ==0 or type(y)==str else 1 for y in x])
modifiedDf= pd.DataFrame(complete_dataset_frame).copy()
le_web = preprocessing.LabelEncoder()
le_attr = preprocessing.LabelEncoder()
le_web.fit(modifiedDf['Source'])
web_cat=le_web.transform(modifiedDf['Source'])
le_attr.fit(modifiedDf['Attribute'])
attr_cat=le_attr.transform(modifiedDf['Attribute'])
modifiedDf['Source']=web_cat
modifiedDf['Attribute']=attr_cat


trainDf=modifiedDf.loc[modifiedDf['TargetAttribute'] != '0']
targetAttributes= trainDf["TargetAttribute"]
predDf=modifiedDf.loc[modifiedDf['TargetAttribute'] == '0']

In [239]:
trainDf

Unnamed: 0,Source,Attribute,TargetAttribute,Numeric,Boolean,String,DateTime,Currency,Dimensions,Weight,...,color,support,series,â,combrand,coating,supported,pivot,rohs,active
5,8,65,supported_aspect_ratio,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
6,9,65,supported_aspect_ratio,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
31,4,252,contrast_ratio_dynamic,0.0,0.000000,49.504950,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
52,8,374,has_dvi_port,0.0,100.000000,0.000000,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
59,0,491,hdmi_port_quantity,100.0,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3230,18,1335,usb_port_quantity,0.0,0.990099,99.009901,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3280,20,599,supported_resolution,0.0,0.000000,100.000000,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3298,21,187,color_feet,0.0,0.000000,100.000000,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3303,21,224,working_humidity,0.0,0.000000,100.000000,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [240]:
predDf

Unnamed: 0,Source,Attribute,TargetAttribute,Numeric,Boolean,String,DateTime,Currency,Dimensions,Weight,...,color,support,series,â,combrand,coating,supported,pivot,rohs,active
0,0,65,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1,65,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,3,65,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,4,65,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,5,65,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3364,25,1358,0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3365,25,435,0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3366,25,1210,0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3367,25,607,0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [241]:
y= pd.DataFrame(targetAttributes)
y["TargetAttribute"] = pd.Categorical(y["TargetAttribute"])
y['code'] = y["TargetAttribute"].cat.codes
labelCodes=y
trainDf=trainDf.drop(['TargetAttribute'], axis=1)
predDf=predDf.drop(['TargetAttribute'], axis=1)

In [247]:
X_train, X_test, y_train, y_test = train_test_split(
    trainDf, y["code"], test_size=0.20, random_state=42)

clf = OneVsRestClassifier(SVC(C=1.0, kernel='rbf', gamma=1.0, probability=True)).fit(X_train, y_train)
#clf=KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

clf.score(X_test, y_test)

0.16666666666666666

In [243]:
y_pred = cross_val_predict(clf, X_test, y_test,cv=3)

results = [str(i) for i in list(set(y_test))]

conf_mat = confusion_matrix(y_test, y_pred)
confusion_matrix(y_test, y_pred)

print(classification_report(y_test, y_pred, target_names=results))

ValueError: n_splits=3 cannot be greater than the number of members in each class.

In [237]:
predDf= predDf.loc[:, predDf.columns != 'TargetAttribute']

vals= clf.predict(predDf)

print(vals)


ValueError: could not convert string to float: 'ca.pcpartpicker.com'

In [186]:
predDf['code']=vals

predDf['Source']=le_web.inverse_transform(predDf['Source'])

predDf['Attribute']=le_attr.inverse_transform(predDf['Attribute'])
#predDf.to_csv("finalResult.csv")


In [187]:
predDf
lab_dict=labelCodes.to_dict()
uniq=labelCodes.drop_duplicates()
fd=pd.merge(uniq,predDf,  on=["code"], how="inner")
finaldf= pd.concat([fd['Source'],fd['Attribute'],fd['TargetAttribute']],axis=1,keys=['Source', 'Attribute','Label'])
finaldf.to_csv("finalResult.csv")

In [188]:
finaldf

Unnamed: 0,Source,Attribute,Label
0,ca.pcpartpicker.com,contrast ratio,contrast_ratio_dynamic
1,catalog.com,contrast ratio,contrast_ratio_dynamic
2,www.best-deal-items.com,contrast ratio,contrast_ratio_dynamic
3,www.ebay.com,contrast ratio,contrast_ratio_dynamic
4,www.imldirect.it,contrast ratio,contrast_ratio_dynamic
...,...,...,...
3274,www.odsi.co.uk,combined with,color_feet
3275,www.ohc24.ch,character resolution,color_feet
3276,www.planet-computer.it,convenienza,color_feet
3277,www.planet-computer.it,connessioni audio,color_feet
