Supervised WSD with SVM for the LRL LTZ or

# Supervised Word Sense Disambiguation with Support Vector Machines for the Low-Resource language Luxembourgish:

In [1]:
#Prepare data:
import pandas as pd
#WSD and evaluation:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

# Data:

In [2]:
data_Fra = {
    "sentence":["Si ass meng Fra.","Du bass keng Fra.","Déi Fra ass Léierin.","D'Mary ass meng Fra.","Ech sinn eng Fra.","Den Tom ass e Mann, an ech sinn eng Fra.","Meng Fra ass Lëtzebuergerin.","Ech sinn Déierendokter a meng Fra ass Sekretärin.", "Ech presentéieren Iech meng Fra.","Déi ass meng Fra."],
    "sense":[2,1,1,2,1,1,2,2,2,2]
}
data_Meedchen = {
    "sentence":["D'Meedchen huet et och gesinn.","Mäi Meedchen huet muer Rendez-vous beim Zänndokter.","Ech sinn e Meedchen.","Tom a Mary hunn e Meedchen.","Den Tom ass e Jong, an d'Mary ass e Meedchen.","Ech sinn der Madame Schneider hiert Meedchen.","D'Meedchen huet et och gesinn.","Ech sinn der Madame Schneider hiert Meedchen.","Mäi Meedchen huet muer Rendez-vous beim Zänndokter.","Wat ass dat e léift Meedchen!","Hie bestit e räicht Meedchen.","Dem Bäcker säi Meedche gëtt déi aner Woch bestuet.","Säi klengt Meedchen ass eng richteg Babbel."],
    "sense":[1,2,1,2,1,2,1,1,2,1,1,2,2]
}
data_Land = {
    "sentence":["Reen ass rar an dësem Land.","Dëst Land huet e strenge Klima.","Griicheland ass en aalt Land.","Mengem Land säin Numm ass 'Russland'.","Reen ass rar an dësem Land.","Dat ass eng Spezialitéit aus mengem Land.","Eist Land huet Grenze mat dräi anere Länner.","Mäi Cousin war schonn an iwwer 30 Länner an der Vakanz.","Hien ass nees zu Land.","D'ganzt Land steet hanner senger Fussballsekipp.","All d'Awunner vum Land stinn hanner hirer Fussballsekipp.","No véier Deeg um Mier ware mer frou, Land ze gesinn.","D'Passagéier sinn am Hafen u Land gaangen."],
    "sense":[1,1,1,1,1,1,1,1,1,1,1,3,3]
}
data_falsch = {
    "sentence":["Du hues näischt falsch gemaach.","Deng Äntwert ass falsch.","Är Äntwert ass falsch.","Deng Äntwert ass falsch.","Zënter sengem Accident huet dee jonke Mann falsch Zänn.","Vun Zäit zu Zäit dauche falsch Geldschäiner op.","Déi Rechnung ass falsch.","De Klarinettist huet ganz falsch gespillt.","Mir sinn op der Gare an de falsche Bus geklommen.","Hei sidd Dir falsch!","Hei sidd Dir op der falscher Plaz.","Hien huet eng falsch Beweegung gemaach.","Wéi kann een nëmmen esou falsch sinn!","Hien ass e falschen Hond!","Maach der keng falsch Hoffnungen!","Et war falschen Alarm.","Zënter sengem Accident huet dee jonke Mann falsch Zänn.","Vun Zäit zu Zäit dauche falsch Geldschäiner op."],
    "sense":[2,1,1,1,5,5,1,1,2,2,2,2,3,3,4,4,5,5]
}

# Fra:

In [3]:
df = pd.DataFrame(data_Fra)
X = df["sentence"]
y = df["sense"]

vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=1)

X = vectorizer.fit_transform(X)
X = X.toarray()
vocab = vectorizer.get_feature_names()
df["sense"].value_counts()

2    6
1    4
Name: sense, dtype: int64

# Train-Test:

In [4]:
#Split data into test and train:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# X -> LTZ sentences
# y -> senses

In [5]:
#Setup:
svm_clf = LinearSVC(C=1)
#Train:
svm_clf.fit(X_train, y_train)
#Test:
predictionSVC = svm_clf.predict(X_test)
#Evaluate:
print(classification_report(y_test,predictionSVC))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00         2
           2       1.00      1.00      1.00         2

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4



# k-fold validation:

with shuffle = True:

In [6]:
folds = KFold(n_splits=7, shuffle=True, random_state=42)
scores = cross_val_score(svm_clf, X, y, cv=folds)
print(scores)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
#source: https://scikit-learn.org/stable/modules/cross_validation.html

[1.  1.  0.5 1.  1.  1.  1. ]
0.93 accuracy with a standard deviation of 0.17


with shuffle = False:

In [7]:
folds = 4
scores = cross_val_score(svm_clf, X, y, cv=folds)
print(scores)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
#source: https://scikit-learn.org/stable/modules/cross_validation.html

[1.         0.66666667 1.         1.        ]
0.92 accuracy with a standard deviation of 0.14


# Meedchen:

In [8]:
df = pd.DataFrame(data_Meedchen)
X = df["sentence"]
y = df["sense"]

vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=1)

X = vectorizer.fit_transform(X)
X = X.toarray()
vocab = vectorizer.get_feature_names()
df["sense"].value_counts()

1    7
2    6
Name: sense, dtype: int64

# Train-Test:

In [9]:
#Split data into test and train:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# X -># Train-Test: LTZ sentences
# y -> senses

In [10]:
#Setup:
svm_clf = LinearSVC(C=1)
#Train:
svm_clf.fit(X_train, y_train)
#Test:
predictionSVC = svm_clf.predict(X_test)
#Evaluate:
print(classification_report(y_test,predictionSVC))

              precision    recall  f1-score   support

           1       0.67      1.00      0.80         2
           2       1.00      0.67      0.80         3

    accuracy                           0.80         5
   macro avg       0.83      0.83      0.80         5
weighted avg       0.87      0.80      0.80         5



# k-fold validation:

with shuffle = True:

In [11]:
folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(svm_clf, X, y, cv=folds)
print(scores)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
#source: https://scikit-learn.org/stable/modules/cross_validation.html

[1.         0.66666667 0.66666667 0.5        0.5       ]
0.67 accuracy with a standard deviation of 0.18


with shuffle = False:

In [12]:
folds = 5
scores = cross_val_score(svm_clf, X, y, cv=folds)
print(scores)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
#source: https://scikit-learn.org/stable/modules/cross_validation.html

[1.         0.66666667 0.66666667 1.         0.5       ]
0.77 accuracy with a standard deviation of 0.20


# Falsch:

In [13]:
df = pd.DataFrame(data_falsch)
X = df["sentence"]
y = df["sense"]

vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=1)

X = vectorizer.fit_transform(X)
X = X.toarray()
vocab = vectorizer.get_feature_names()
df["sense"].value_counts()

1    5
2    5
5    4
3    2
4    2
Name: sense, dtype: int64

# Train-Test:

In [14]:
#Split data into test and train:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# X -> LTZ sentences
# y -> senses

In [15]:
#Setup:
svm_clf = LinearSVC(C=1)
#Train:
svm_clf.fit(X_train, y_train)
#Test:
predictionSVC = svm_clf.predict(X_test)
#Evaluate:
print(classification_report(y_test,predictionSVC))

              precision    recall  f1-score   support

           1       0.50      1.00      0.67         2
           2       1.00      0.50      0.67         2
           3       0.00      0.00      0.00         1
           5       1.00      1.00      1.00         1

    accuracy                           0.67         6
   macro avg       0.62      0.62      0.58         6
weighted avg       0.67      0.67      0.61         6



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# k-fold validation:

with shuffle = True:

In [16]:
folds = KFold(n_splits=6, shuffle=True, random_state=42)
scores = cross_val_score(svm_clf, X, y, cv=folds)
print(scores)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
#source: https://scikit-learn.org/stable/modules/cross_validation.html

[0.66666667 0.66666667 0.66666667 1.         0.33333333 0.66666667]
0.67 accuracy with a standard deviation of 0.19


with shuffle = False:

In [17]:
folds = 2
scores = cross_val_score(svm_clf, X, y, cv=folds)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
#source: https://scikit-learn.org/stable/modules/cross_validation.html

0.67 accuracy with a standard deviation of 0.00


# Land:

In [18]:
df = pd.DataFrame(data_Land)
X = df["sentence"]
y = df["sense"]

vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=1)

X = vectorizer.fit_transform(X)
X = X.toarray()
vocab = vectorizer.get_feature_names()
df["sense"].value_counts()

1    11
3     2
Name: sense, dtype: int64

# Train-Test:

In [19]:
#Split data into test and train:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# X -> LTZ sentences
# y -> senses

In [20]:
#Setup:
svm_clf = LinearSVC(C=1)
#Train:
svm_clf.fit(X_train, y_train)
#Test:
predictionSVC = svm_clf.predict(X_test)
#Evaluate:
print(classification_report(y_test,predictionSVC))

              precision    recall  f1-score   support

           1       0.80      1.00      0.89         4
           3       0.00      0.00      0.00         1

    accuracy                           0.80         5
   macro avg       0.40      0.50      0.44         5
weighted avg       0.64      0.80      0.71         5



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# k-fold validation:

with shuffle = True:

In [21]:
folds = KFold(n_splits=9, shuffle=True, random_state=42)
scores = cross_val_score(svm_clf, X, y, cv=folds)
print(scores)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
#source: https://scikit-learn.org/stable/modules/cross_validation.html

[0.5 1.  1.  0.5 1.  1.  1.  1.  1. ]
0.89 accuracy with a standard deviation of 0.21


with shuffle = False:

In [22]:
folds = 10
scores = cross_val_score(svm_clf, X, y, cv=folds)
print(scores)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
#source: https://scikit-learn.org/stable/modules/cross_validation.html

[1.  0.5 0.5 1.  1.  1.  1.  1.  1.  1. ]
0.90 accuracy with a standard deviation of 0.20


