In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
cd drive/MyDrive/ML_German/code

/content/drive/MyDrive/ML_German/code


In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold 
import joblib
from string import punctuation

In [None]:
kf = KFold(n_splits=4, random_state=42, shuffle=True)

In [None]:
training_data = pd.read_csv('../csv_files/training_no_emoji.csv')

In [None]:
german_stopwwords = nltk.corpus.stopwords.words('german')
allowed_charachters = [ch for ch in ' abcdefghijklmnopqrstuvwxyz' + 'öäüß']
def filter_texts(texts):
  return [''.join([ch for ch in text.lower() if ch in allowed_charachters]) for text in texts if text not in german_stopwwords]

In [None]:
t = 0
for train_index, test_index in kf.split(training_data):
    X_train, X_test = filter_texts(training_data['Text'][train_index]), filter_texts(training_data['Text'][test_index])
    y_train , y_test = training_data['Lat'][train_index], training_data['Lat'][test_index]
    with open('./texts/train_text' + str(t) + '.txt', "w") as file:
      for text in X_train:
        file.write(text + "\n")
    with open('./texts/train_lat' + str(t) + '.txt', "w") as file:
      for lat in y_train:
        file.write(str(lat) + "\n")
    with open('./texts/test_text' + str(t) + '.txt', "w") as file:
      for text in X_test:
        file.write(text + "\n")
    with open('./texts/test_lat' + str(t) + '.txt', "w") as file:
      for lat in y_test:
        file.write(str(lat) + "\n")
    t += 1

In [None]:
!java -Xmx5096m ComputeStringKernel 1 spectrum 3 5 ./texts/train_text0.txt ./kernels/train_kern0.txt 

Loaded 16937 samples from ./texts/train_text0.txt
Computing the spectrum kernel based on 3-5-grams ...
Computed kernel to row 99 in 96506 ms
Computed kernel to row 199 in 91677 ms
Computed kernel to row 299 in 98464 ms
Computed kernel to row 399 in 90429 ms
Computed kernel to row 499 in 94533 ms
Computed kernel to row 599 in 92208 ms
Computed kernel to row 699 in 91483 ms
Computed kernel to row 799 in 87309 ms
Computed kernel to row 899 in 90769 ms
Computed kernel to row 999 in 87746 ms
Computed kernel to row 1099 in 91834 ms
Computed kernel to row 1199 in 87793 ms
Computed kernel to row 1299 in 86937 ms
Computed kernel to row 1399 in 84749 ms
Computed kernel to row 1499 in 86303 ms
Computed kernel to row 1599 in 84893 ms
Computed kernel to row 1699 in 84914 ms
Computed kernel to row 1799 in 83955 ms
Computed kernel to row 1899 in 84878 ms
Computed kernel to row 1999 in 85613 ms
Computed kernel to row 2099 in 82222 ms
Computed kernel to row 2199 in 83067 ms
Computed kernel to row 2299 

In [None]:
# with open('./kernels/train_kern0.txt', "r") as file:
#   mat = [[int(num) for num in line.split(' ') if num != '\n'] for line in file]
# training_kernel = np.array(mat)
# np.savez_compressed('./kernels/train_kernel0.npz', training_kernel)

In [4]:
with open('./texts/train_lat0.txt', "r") as file:
  mat = [[float(num) for num in line.split(' ') if num != '\n'] for line in file]
training_lats0 = np.array(mat)

In [None]:
# with open('./kernels/test_kern0.txt', "r") as file:
#   mat = [[int(num) for num in line.split(' ') if num != '\n'] for line in file]
# test_kernel = np.array(mat)
# np.savez_compressed('./kernels/test_kernel0.npz', test_kernel)

In [5]:
with open('./texts/test_lat0.txt', "r") as file:
  mat = [[float(num) for num in line.split(' ') if num != '\n'] for line in file]
test_lats0 = np.array(mat)

In [6]:
train_kern = np.load('./kernels/train_kernel0.npz')['arr_0']
test_kern = np.load('./kernels/test_kernel0.npz')['arr_0']

In [7]:
from sklearn.svm import SVR, NuSVR
from sklearn.kernel_ridge import KernelRidge
import joblib
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE

In [None]:
nuSvr = NuSVR(C = 10, kernel='precomputed', cache_size = 2000, max_iter = 10**8)

In [None]:
nuSvr.fit(training_kernel, training_lats0.ravel())

NuSVR(C=10, cache_size=2000, coef0=0.0, degree=3, gamma='scale',
      kernel='precomputed', max_iter=100000000, nu=0.5, shrinking=True,
      tol=0.001, verbose=False)

In [None]:
preds = nuSvr.predict(np.transpose(test_kernel))

In [None]:
MAE(preds, test_lats0.ravel())

0.7846815335811589

In [8]:
kernRidge = KernelRidge(kernel='precomputed')

In [9]:
kernRidge.fit(train_kern, training_lats0.ravel())

KernelRidge(alpha=1, coef0=1, degree=3, gamma=None, kernel='precomputed',
            kernel_params=None)

In [11]:
preds = kernRidge.predict(np.transpose(test_kern))

In [12]:
MAE(preds, test_lats0.ravel())

27.64934823221933

In [None]:
joblib.dump(nuSvr, './nuSvr_1.pkl')

['./nuSvr_1.pkl']