In [1]:
import os
import pandas as pd
import gzip
import json
import numpy as np
import nltk
from nltk.corpus import stopwords
import string
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import re

In [2]:
train = pd.read_csv('../../src/data/schemafiltereddata/TrainTestTables/small_train_prepped.csv')

In [3]:
train.drop(['class', 'Unnamed: 0'], axis=1, inplace=True)

In [4]:
test = pd.read_csv('../../src/data/schemafiltereddata/TrainTestTables/test_tables_seq.csv')
test["label_complete"] = test["name"] + test["class"]

In [5]:
test.drop(test[(test['name']=='inalbum') &(test['class']=='MusicRecording')].index, inplace=True)
test.drop(test[(test['name']=='performer') &(test['class']=='Event')].index, inplace=True)
test.drop(test[(test['name']=='eventattendancemode') &(test['class']=='Event')].index, inplace=True)
test.drop(test[(test['name']=='eventstatus') &(test['class']=='Event')].index, inplace=True)
test.shape

(7033, 5)

In [6]:
test.drop(['Unnamed: 0', 'class'], axis=1, inplace=True)

In [7]:
test['text'] = test.text.apply(lambda x: re.sub("[^0-9a-zA-Z-@]+", " ", x))

In [8]:
labels_new = {'worstratingProduct': 0,
 'acceptsreservationsRestaurant': 1,
 'additionalnamePerson': 2,
 'additionalpropertyProduct': 3,
 'addressHotel': 4,
 'addressLocalBusiness': 5,
 'addressPerson': 6,
 'addressProduct': 7,
 'addressRestaurant': 8,
 'addresslocalityLocalBusiness': 9,
 'addressregionLocalBusiness': 10,
 'affiliationPerson': 11,
 'aggregateratingBook': 12,
 'aggregateratingCreativeWork': 13,
 'aggregateratingHotel': 14,
 'aggregateratingLocalBusiness': 15,
 'aggregateratingProduct': 16,
 'aggregateratingRecipe': 17,
 'aggregateratingRestaurant': 18,
 'alternatenameProduct': 19,
 'alternativeheadlineCreativeWork': 20,
 'areaservedLocalBusiness': 21,
 'articlebodyCreativeWork': 22,
 'audienceProduct': 23,
 'authorBook': 24,
 'authorCreativeWork': 25,
 'authorProduct': 26,
 'authorRecipe': 27,
 'availabilityProduct': 28,
 'bestratingProduct': 29,
 'birthdatePerson': 30,
 'birthplacePerson': 31,
 'bookeditionBook': 32,
 'bookformatBook': 33,
 'brandProduct': 34,
 'breadcrumbProduct': 35,
 'byartistMusicAlbum': 36,
 'byartistMusicRecording': 37,
 'categoryProduct': 38,
 'citystatezipLocalBusiness': 39,
 'colorProduct': 40,
 'commentcountCreativeWork': 41,
 'conditionProduct': 42,
 'contactpointLocalBusiness': 43,
 'contactpointPerson': 44,
 'cookingmethodRecipe': 45,
 'cooktimeRecipe': 46,
 'copyrightholderCreativeWork': 47,
 'copyrightyearCreativeWork': 48,
 'creatorCreativeWork': 49,
 'datecreatedCreativeWork': 50,
 'datecreatedMusicRecording': 51,
 'datemodifiedCreativeWork': 52,
 'datemodifiedRecipe': 53,
 'datepublishedBook': 54,
 'datepublishedCreativeWork': 55,
 'datepublishedMusicRecording': 56,
 'datepublishedProduct': 57,
 'datepublishedRecipe': 58,
 'deathdatePerson': 59,
 'depthProduct': 60,
 'disambiguatingdescriptionProduct': 61,
 'doortimeEvent': 62,
 'durationEvent': 63,
 'durationMusicRecording': 64,
 'emailHotel': 65,
 'emailLocalBusiness': 66,
 'emailPerson': 67,
 'enddateEvent': 68,
 'episodenumberTVEpisode': 69,
 'worksforPerson': 70,
 'familynamePerson': 71,
 'faxnumberLocalBusiness': 72,
 'faxnumberPerson': 73,
 'genderPerson': 74,
 'genreBook': 75,
 'genreCreativeWork': 76,
 'genreMusicRecording': 77,
 'geoHotel': 78,
 'geoLocalBusiness': 79,
 'geoPlace': 80,
 'geoProduct': 81,
 'geoRestaurant': 82,
 'givennamePerson': 83,
 'gtin12Product': 84,
 'gtin13Product': 85,
 'gtin14Product': 86,
 'gtin8Product': 87,
 'gtinProduct': 88,
 'hasmapLocalBusiness': 89,
 'hasmenuRestaurant': 90,
 'headlineCreativeWork': 91,
 'headlineRecipe': 92,
 'heightPerson': 93,
 'heightProduct': 94,
 'homelocationPerson': 95,
 'identifierProduct': 96,
 'worklocationPerson': 97,
 'ingredientsRecipe': 98,
 'inlanguageBook': 99,
 'inlanguageCreativeWork': 100,
 'interactioncountCreativeWork': 101,
 'interactionstatisticCreativeWork': 102,
 'interactiontypeCreativeWork': 103,
 'isbnBook': 104,
 'ispartofRecipe': 105,
 'isrelatedtoProduct': 106,
 'issimilartoProduct': 107,
 'itemconditionProduct': 108,
 'itemlistelementProduct': 109,
 'jobtitlePerson': 110,
 'keywordsCreativeWork': 111,
 'keywordsRecipe': 112,
 'knowslanguagePerson': 113,
 'legalnameLocalBusiness': 114,
 'locationEvent': 115,
 'locationLocalBusiness': 116,
 'mainentityofpageCreativeWork': 117,
 'mainentityofpagePerson': 118,
 'mainentityofpageProduct': 119,
 'mainentityofpageRecipe': 120,
 'makesofferPerson': 121,
 'manufacturerProduct': 122,
 'materialProduct': 123,
 'memberofPerson': 124,
 'menuRestaurant': 125,
 'modelProduct': 126,
 'mpnProduct': 127,
 'nameBook': 128,
 'nameCreativeWork': 129,
 'nameEvent': 130,
 'nameHotel': 131,
 'nameLocalBusiness': 132,
 'nameMusicAlbum': 133,
 'nameMusicRecording': 134,
 'namePlace': 135,
 'nameProduct': 136,
 'nameRecipe': 137,
 'nameRestaurant': 138,
 'nameTVEpisode': 139,
 'nationalityPerson': 140,
 'numberofpagesBook': 141,
 'numtracksMusicAlbum': 142,
 'nutritionRecipe': 143,
 'weightProduct': 144,
 'offersBook': 145,
 'offersCreativeWork': 146,
 'offersProduct': 147,
 'openinghoursLocalBusiness': 148,
 'openinghoursRestaurant': 149,
 'openinghoursspecificationLocalBusiness': 150,
 'openinghoursspecificationPlace': 151,
 'openinghoursspecificationRestaurant': 152,
 'organizerEvent': 153,
 'partofseriesTVEpisode': 154,
 'paymentacceptedLocalBusiness': 155,
 'widthProduct': 156,
 'performersEvent': 157,
 'performtimeRecipe': 158,
 'postalcodeLocalBusiness': 159,
 'preptimeRecipe': 160,
 'priceProduct': 161,
 'pricecurrencyProduct': 162,
 'pricerangeHotel': 163,
 'pricerangeLocalBusiness': 164,
 'pricerangeRestaurant': 165,
 'publisherBook': 166,
 'publisherCreativeWork': 167,
 'publisherRecipe': 168,
 'ratingvalueProduct': 169,
 'recipecategoryRecipe': 170,
 'recipecuisineRecipe': 171,
 'recipeingredientRecipe': 172,
 'recipeinstructionsRecipe': 173,
 'recipeyieldRecipe': 174,
 'releasedateProduct': 175,
 'reviewLocalBusiness': 176,
 'reviewProduct': 177,
 'reviewRecipe': 178,
 'reviewcountProduct': 179,
 'reviewsProduct': 180,
 'sameasLocalBusiness': 181,
 'sameasPerson': 182,
 'sameasPlace': 183,
 'servescuisineRestaurant': 184,
 'shop-currencyProduct': 185,
 'starratingHotel': 186,
 'streetaddressLocalBusiness': 187,
 'suitablefordietRecipe': 188,
 'telephoneHotel': 189,
 'telephoneLocalBusiness': 190,
 'telephonePerson': 191,
 'telephonePlace': 192,
 'telephoneRestaurant': 193,
 'titleProduct': 194,
 'totaltimeRecipe': 195,
 'trackMusicAlbum': 196,
 'typicalagerangeEvent': 197,
 'versionCreativeWork': 198,
 'weightPerson': 199,
 'offerdetailsProduct': 200,
 'founderLocalBusiness': 201
 #'eventattendancemodeEvent': 201,
 #'performerEvent': 202,
 #'inalbumMusicRecording': 203,
 #'eventstatusEvent': 204
 }

In [9]:
test['label'] = test.label_complete.map(labels_new)

In [10]:
test.drop(['label_complete', 'name'], axis=1, inplace=True)

In [11]:
def remove_stopwords(token_vector, stopwords_list):
    return token_vector.apply(lambda token_list: [word for word in token_list if word not in stopwords_list])

In [12]:
def remove_punctuation(token_vector):
    return token_vector.apply(lambda token_list: [word for word in token_list if word not in string.punctuation])

In [13]:
train['train'] = 1

In [14]:
test['train'] = 0

In [15]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/bizer-
[nltk_data]     tp2021/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/bizer-
[nltk_data]     tp2021/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
data = train.append(test)

In [18]:
data.shape

(18389, 3)

In [19]:
data['concat'] = data['text'].apply(lambda row: row.lower())

In [20]:
data['tokens'] = data['concat'].apply(lambda row: word_tokenize(row))

In [21]:
data['tokens'] = remove_stopwords(data['tokens'], stopwords.words())

In [22]:
data['tokens'] = remove_punctuation(data['tokens'])

In [23]:
data.drop(columns=['concat'],inplace=True)

In [24]:
data.to_csv('../../src/data/schemafiltereddata/TrainTestTables/tokenized_rf.csv')

Einlesen

In [122]:
data = pd.read_csv('../../src/data/schemafiltereddata/TrainTestTables/tokenized_rf.csv')

In [123]:
data.drop(['Unnamed: 0', 'text'], axis=1, inplace=True)

In [124]:
train = data.head(11356)

In [125]:
test = data.tail(7033)

In [126]:
def dummy(doc):
    return doc

vectorizer  = CountVectorizer(
        tokenizer=dummy,
        preprocessor=dummy,
        max_features=30000,
        binary=True)  
tf_value = vectorizer.fit_transform(train['tokens'])

In [127]:
#define vectorizer to match preprocessed tokes
#def dummy_fun(doc):
 #   return doc

#tfidf = TfidfVectorizer(
 #   analyzer='word',
  #  tokenizer=dummy_fun,
   # preprocessor=dummy_fun,
    #token_pattern=None,
    #max_features=15000)  
#tfidf_value = tfidf.fit_transform(data['tokens'])

In [20]:
#df_tfidf = pd.DataFrame(tfidf_value.toarray(), columns=tfidf.get_feature_names())

In [15]:
#df_tf = pd.DataFrame(tf_value.toarray(), columns=vectorizer.get_feature_names())

In [22]:
#df_tf = pd.DataFrame(tf_value.toarray(), columns=vectorizer.get_feature_names())
#df_tfidf = pd.DataFrame(tfidf_value.toarray(), columns=tfidf.get_feature_names())
#df_prepared = pd.concat([data.reset_index(), df_tfidf, df_tf], axis=1)

Train

In [128]:
df_tf = pd.DataFrame(tf_value.toarray(), columns=vectorizer.get_feature_names())

In [129]:
train = pd.concat([train.reset_index(), df_tf], axis=1)

In [130]:
train.drop(['index'], inplace=True, axis=1)

Test

In [131]:
test_vec = vectorizer.transform(test['tokens'])

In [132]:
df_tf_test = pd.DataFrame(test_vec.toarray(), columns=vectorizer.get_feature_names())

In [133]:
test = pd.concat([test.reset_index(), df_tf_test], axis=1)

In [43]:
#y_train = train['label']
#X_train = train.drop(columns=['tokens','label', 'text', 'train'])

In [134]:
y_train = train['label']

In [135]:
X_train = train.drop(columns=['tokens','label', 'train'])

In [74]:
#y_train = y_train.iloc[:, 0]

In [136]:
y_test = test['label']
X_test = test.drop(columns=['tokens','label', 'train'])

In [33]:
#y_test = y_test.iloc[:, 0]

In [137]:
X_train.dtypes.value_counts()

int64    3798
dtype: int64

In [138]:
# Baseline random forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier()

In [139]:
X_test.drop('index', axis=1, inplace=True)

In [140]:
prediction = rf.predict(X_test) 
f1_mic = f1_score(y_test,prediction,average='micro')
f1_mac = f1_score(y_test,prediction,average='macro')
accuracy = accuracy_score(y_test,prediction) 
precision = precision_score(y_test,prediction,average='micro') 
recall = recall_score(y_test,prediction,average='micro') 
precision_mac = precision_score(y_test,prediction,average='macro') 
recall_mac = recall_score(y_test,prediction,average='macro') 
print("The F1-Score micro on test set: {:.4f}".format(f1_mic))
print("The F1-Score macro on test set: {:.4f}".format(f1_mac))
print("The Precision on test set: {:.4f}".format(precision))
print("The Recall on test set: {:.4f}".format(recall))
print("The Precision macro on test set: {:.4f}".format(precision_mac))
print("The Recall macro on test set: {:.4f}".format(recall_mac))
print("The Accuracy-Score on test set: {:.4f}".format(accuracy))

The F1-Score micro on test set: 0.2758
The F1-Score macro on test set: 0.1402
The Precision on test set: 0.2758
The Recall on test set: 0.2758
The Precision macro on test set: 0.1914
The Recall macro on test set: 0.1495
The Accuracy-Score on test set: 0.2758


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
data.tokens[0].strip('][').split(', ').append(['0'])
s = set(data.tokens[0].strip('][').split(', '))
for i in range(1, data.shape[0]):
    s.update(data.tokens[i].strip('][').split(', '))
    
len(s)

In [56]:
l = list(s)

In [None]:
for col in l:
    data[col] = ""

In [None]:
data

In [None]:
for el in data.tokens.split():
    for colname in l:
    if el == colname:
        data[colname] = 1
    else:
        data[colname] = 0