In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
cd drive/MyDrive/ML_German

/content/drive/MyDrive/ML_German


In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
from sklearn.model_selection import KFold 
from sklearn.svm import SVR, NuSVR
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
import joblib
from string import punctuation
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import train_test_split

In [None]:
def cross_validation(model, train_kernels, train_lats, test_kernels, test_lats, scaler = None, scaler2 = None):
  maes_sum = 0 
  for tr_kern, tr_lats, tst_kern, tst_lats in zip(train_kernels, train_lats, test_kernels, test_lats):
    kernel = np.load(tr_kern)['arr_0']
    if scaler is not None:
      kernel = scaler.fit_transform(kernel)
    with open(tr_lats, "r") as file:
      mat = [[float(num) for num in line.split(' ') if num != '\n'] for line in file]
    train_lats_ = np.array(mat)
    if scaler2 is not None:
      train_lats_ = scaler2.fit_transform(train_lats_)

    model.fit(kernel, train_lats_.ravel())
    
    kernel = np.load(tst_kern)['arr_0']
    if scaler is not None:
      kernel = scaler.transform(kernel)
    with open(tst_lats, "r") as file:
      mat = [[float(num) for num in line.split(' ') if num != '\n'] for line in file]
    test_lats_ = np.array(mat)
    if scaler2 is not None:
      test_lats_  = scaler2.transform(test_lats_ )
    predictions = model.predict(kernel)

    mae = 0
    for pred, lat in zip(predictions, test_lats_.ravel()):
      mae += abs(pred - lat)
    print(mae / len(predictions), end = ' ')
    maes_sum += mae / len(predictions)
  print("\nFinal sum: ", maes_sum / 4)

In [None]:
train_kernels = ['./kernels/train_kernel' + str(i) + '.npz' for i in range(4)]
train_lats = ['./texts/train_lat' + str(i) + '.txt' for i in range(4)]
test_kernels = ['./kernels/test_kernel' + str(i) + '.npz' for i in range(4)]
test_lats = ['./texts/test_lat' + str(i) + '.txt' for i in range(4)]

In [None]:
nuSvr = NuSVR(kernel = 'precomputed', cache_size = 1500, max_iter = 10**7)

In [None]:
cross_validation(nuSvr, train_kernels, train_lats, test_kernels, test_lats)

0.7351710825371262 0.7426166033489675 0.7371014613695526 0.7300427857136514 
Final sum:  0.7362329832423244


In [None]:
nuSvr1 = NuSVR(C = 10, kernel = 'precomputed', cache_size = 1500, max_iter = 10**7)

In [None]:
cross_validation(nuSvr1, train_kernels, train_lats, test_kernels, test_lats)

0.7351633282902392 0.7426165460703571 0.7370953208365569 0.730050566958823 
Final sum:  0.736231440538994


In [None]:
Svr = SVR(kernel = 'precomputed', cache_size = 1500, max_iter = 10**7)

In [None]:
cross_validation(Svr, train_kernels, train_lats, test_kernels, test_lats)

0.7104264599542969 0.7161269411489446 0.7121362346947109 0.7066528656546831 
Final sum:  0.7113356253631589


In [None]:
Svr1 = SVR(kernel = 'precomputed', cache_size = 1500, max_iter = 10**8)

In [None]:
cross_validation(Svr1, train_kernels, train_lats, test_kernels, test_lats)

0.7104264599542969 0.7161269411489446 0.7121362346947109 0.7066528656546831 
Final sum:  0.7113356253631589


In [None]:
std_scaler = StandardScaler()
Svr1 = SVR(kernel = 'precomputed', cache_size = 1500, max_iter = 10**8)

In [None]:
cross_validation(Svr1, train_kernels, train_lats, test_kernels, test_lats, std_scaler)

9.050413997337134 11.133035939974246 10.6845831129904 10.093087971596146 
Final sum:  10.240280255474481


In [None]:
mnm_scaler = MinMaxScaler()
Svr1 = SVR(kernel = 'precomputed', cache_size = 1500, max_iter = 10**8)

In [None]:
cross_validation(Svr1, train_kernels, train_lats, test_kernels, test_lats, mnm_scaler)

927.5927906199399 937.791371011961 941.8398528862931 951.9316337347775 
Final sum:  939.7889120632428


In [None]:
std_scaler1 = StandardScaler()
std_scaler2 = StandardScaler()
Svr1 = SVR(kernel = 'precomputed', cache_size = 1500, max_iter = 10**8)

In [None]:
cross_validation(Svr1, train_kernels, train_lats, test_kernels, test_lats, std_scaler1, std_scaler2)

10.052607571447535 12.306302717782469 11.90161826591568 11.260747803887305 
Final sum:  11.380319089758247


New Try

In [5]:
training_data = pd.read_csv('./csv_files/training_no_emoji.csv')
validation_data = pd.read_csv('./csv_files/validation_no_emoji.csv')

In [6]:
german_stopwords = stopwords.words('german')

In [7]:
allowed_charachters = [ch for ch in ' abcdefghijklmnopqrstuvwxyz' + 'öäüß']
def filter_texts(texts):
  return [''.join([ch for ch in text.lower() if ch in allowed_charachters]) for text in texts]

In [8]:
y_lats = training_data['Lat']
y_longs = training_data['Long']
y_values = np.array([[lat, lon] for lat, lon in zip(y_lats, y_longs)])

In [9]:
count_vect = TfidfVectorizer(stop_words= german_stopwords , ngram_range=(3, 5), analyzer='char', max_features=60000)

In [10]:
data_train, data_test, values_train, values_test = train_test_split(training_data['Text'], y_values, test_size=0.2, random_state = 0)

In [None]:
data_train, values_train = training_data['Text'], training_data['Lat']
data_test, values_test = validation_data['Text'], validation_data['Lat']

In [11]:
data_train = filter_texts(data_train)
data_test = filter_texts(data_test)

In [12]:
X_train = count_vect.fit_transform(data_train)
X_test = count_vect.transform(data_test)

In [13]:
from sklearn.ensemble import GradientBoostingRegressor

In [40]:
GradientBoostingRegressor(n_iter_no_change=5, learning_rate=0.1, n_estimators=250, max_depth=40, random_state=0, alpha=0.8, ccp_alpha=0.0, max_features='auto', loss='huber')

GradientBoostingRegressor(alpha=0.8, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='huber',
                          max_depth=40, max_features='auto',
                          max_leaf_nodes=None, min_impurity_decrease=0.0,
                          min_impurity_split=None, min_samples_leaf=1,
                          min_samples_split=2, min_weight_fraction_leaf=0.0,
                          n_estimators=250, n_iter_no_change=5,
                          presort='deprecated', random_state=0, subsample=1.0,
                          tol=0.0001, validation_fraction=0.1, verbose=0,
                          warm_start=False)

In [41]:
gdR.fit(X_train, values_train[:,1])

      Iter       Train Loss   Remaining Time 
         1           0.8657           24.04m
         2           0.8263           23.09m
         3           0.7907           22.73m
         4           0.7650           22.59m
         5           0.7430           22.41m
         6           0.7214           22.28m
         7           0.7039           22.17m
         8           0.6834           22.07m
         9           0.6651           21.99m
        10           0.6505           21.92m
        20           0.5444           21.22m
        30           0.4900           20.52m
        40           0.4581           19.85m
        50           0.4365           19.19m
        60           0.4208           18.54m
        70           0.4066           17.89m
        80           0.3958           17.24m
        90           0.3867           16.59m
       100           0.3787           15.93m
       200           0.3240            9.52m
       300           0.2911            3.16m


GradientBoostingRegressor(alpha=0.8, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='huber',
                          max_depth=3, max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=350,
                          n_iter_no_change=5, presort='deprecated',
                          random_state=0, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=1, warm_start=False)

In [42]:
predict = gdR.predict(X_test)

In [43]:
MAE(values_test[:,1], predict)

0.6900410443724461

0.7015082995980139

In [None]:
from sklearn.linear_model import SGDRegressor

In [None]:
sgd = SGDRegressor(max_iter=10**9, learning_rate='adaptive', eta0 = 0.1)

In [None]:
sgd.fit(X_train, values_train)

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.1, fit_intercept=True, l1_ratio=0.15,
             learning_rate='adaptive', loss='squared_loss', max_iter=1000000000,
             n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=None,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
             warm_start=False)

In [None]:
predict = sgd.predict(X_test)
MAE(values_test, predict)

0.5101271714044763

0.511059081600427 0.5101384689212188

In [None]:
sgd1 = SGDRegressor(max_iter=10**9, learning_rate='adaptive', eta0 = 0.01)

In [None]:
sgd1.fit(X_train, values_train[:,1])

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='adaptive', loss='squared_loss', max_iter=1000000000,
             n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=None,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
             warm_start=False)

In [None]:
predict1 = sgd1.predict(X_test)
MAE(values_test[:,1], predict1)

0.6976766426253173

In [None]:
MAE(values_test[:,0], predict) + MAE(values_test[:,1], predict1) / 2

0.8676358277977503

In [None]:
mae = 0
for pred, pred1, val, val1 in zip(predict, predict1, values_test[:,0], values_test[:,1]):
  mae += abs(pred - val) + abs(pred1 - val1)
print(mae/(2 * len(predict)))

0.6082370745552064


In [None]:
svr = NuSVR(nu = 0.5, C = 10, cache_size = 1500, max_iter = 10**8)

In [None]:
svr.fit(X_train, values_train)

NuSVR(C=10, cache_size=1500, coef0=0.0, degree=3, gamma='scale', kernel='rbf',
      max_iter=100000000, nu=0.5, shrinking=True, tol=0.001, verbose=False)

In [None]:
predict = svr.predict(X_test)

In [None]:
MAE(values_test, predict)

0.5074418052945137

In [None]:
joblib.dump(svr, 'nuSVR.pkl')

['nuSVR.pkl']

In [None]:
svr1 = SVR(cache_size = 1500, max_iter = 10**8, kernel = 'linear')

In [None]:
svr1.fit(X_train, values_train)

SVR(C=1.0, cache_size=1500, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=100000000, shrinking=True, tol=0.001,
    verbose=False)

In [None]:
predict = svr1.predict(X_test)
MAE(values_test, predict)

0.5429949649910263

In [None]:
!java -Xmx5096m ComputeStringKernel 2 spectrum 3 5 ./texts/train_text0.txt ./texts/test_text0.txt ./kernels/test_kern0.txt 

Loaded 16937 samples from ./texts/train_text0.txt
Loaded 5646 samples from ./texts/test_text0.txt
Computing the spectrum kernel based on 3-5-grams ...
Computed kernel to row 99 in 36409 ms
Computed kernel to row 199 in 34379 ms
Computed kernel to row 299 in 37066 ms
Computed kernel to row 399 in 34151 ms
Computed kernel to row 499 in 36299 ms
Computed kernel to row 599 in 35935 ms
Computed kernel to row 699 in 35627 ms
Computed kernel to row 799 in 33794 ms
Computed kernel to row 899 in 35523 ms
Computed kernel to row 999 in 34115 ms
Computed kernel to row 1099 in 36182 ms
Computed kernel to row 1199 in 34880 ms
Computed kernel to row 1299 in 34810 ms
Computed kernel to row 1399 in 34485 ms
Computed kernel to row 1499 in 35410 ms
Computed kernel to row 1599 in 35269 ms
Computed kernel to row 1699 in 35588 ms
Computed kernel to row 1799 in 35456 ms
Computed kernel to row 1899 in 36286 ms
Computed kernel to row 1999 in 36769 ms
Computed kernel to row 2099 in 34673 ms
Computed kernel to r

0.5195345785945145