In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
cd drive/MyDrive/ML_German

/content/drive/MyDrive/ML_German


In [3]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
import pandas as pd
import numpy as np
from string import punctuation

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
from sklearn.model_selection import KFold 
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE

import xgboost as xgb
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR, NuSVR
from sklearn.linear_model import BayesianRidge, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import GradientBoostingRegressor

import joblib

In [5]:
training_data = pd.read_csv('./csv_files/training_no_emoji.csv')
validation_data = pd.read_csv('./csv_files/validation_no_emoji.csv')

In [6]:
allowed_charachters = [ch for ch in ' abcdefghijklmnopqrstuvwxyz' + 'öäüß']
def filter_texts(texts):
  return [''.join([ch for ch in text.lower() if ch in allowed_charachters]) for text in texts]

In [14]:
count_vect = TfidfVectorizer(stop_words= stopwords.words('german'), ngram_range=(3, 5), analyzer='char', max_features=30000)

In [8]:
data_train, data_test, values_train, values_test = train_test_split(training_data['Text'], training_data['Lat'], test_size=0.2, random_state = 0)

In [9]:
data_train = filter_texts(data_train)
data_test = filter_texts(data_test)

In [15]:
X_train = count_vect.fit_transform(data_train)
X_test = count_vect.transform(data_test)

In [31]:
svr = SVR(cache_size = 1500, max_iter = 10**8)
nuSvr = NuSVR(C = 10, cache_size = 1500, max_iter = 10**8)
kernRidge = KernelRidge()
bayReg = BayesianRidge(n_iter=10 ** 7)
linReg = LinearRegression()
knnReg = KNeighborsRegressor(n_neighbors=6, weights='distance')
randForReg = RandomForestRegressor(max_depth=25, n_jobs=2, random_state=0, criterion='mae')
GradientBoostingRegressor(n_iter_no_change=5, learning_rate=0.1, n_estimators=250, max_depth=40, random_state=0, alpha=0.8, ccp_alpha=0.0, max_features='auto', loss='huber')

GradientBoostingRegressor(alpha=0.8, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='huber',
                          max_depth=40, max_features='auto',
                          max_leaf_nodes=None, min_impurity_decrease=0.0,
                          min_impurity_split=None, min_samples_leaf=1,
                          min_samples_split=2, min_weight_fraction_leaf=0.0,
                          n_estimators=250, n_iter_no_change=5,
                          presort='deprecated', random_state=0, subsample=1.0,
                          tol=0.0001, validation_fraction=0.1, verbose=0,
                          warm_start=False)

In [32]:
gdR.fit(X_train, values_train)

      Iter       Train Loss   Remaining Time 
         1           0.3490           22.93m
         2           0.3400           21.68m
         3           0.3299           21.32m
         4           0.3172           21.06m
         5           0.3116           20.97m
         6           0.3051           20.85m
         7           0.2971           20.69m
         8           0.2919           20.60m
         9           0.2879           20.49m
        10           0.2839           20.34m
        20           0.2531           19.57m
        30           0.2366           18.94m
        40           0.2257           18.32m
        50           0.2179           17.69m
        60           0.2114           17.07m
        70           0.2059           16.48m
        80           0.2012           15.88m
        90           0.1970           15.28m
       100           0.1934           14.67m
       200           0.1687            8.77m


GradientBoostingRegressor(alpha=0.8, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='huber',
                          max_depth=3, max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=350,
                          n_iter_no_change=5, presort='deprecated',
                          random_state=0, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=1, warm_start=False)

In [33]:
predict = gdR.predict(X_test)
MAE(predict, values_test)

0.5152478252937385

0.5178430245886826

In [None]:
bayReg.fit(X_train.toarray(), values_train)
predict_Bay = bayReg.predict(X_test.toarray())
print(MAE(predict_Bay, values_test))

In [None]:
knnReg.fit(X_train, values_train)
predict_Bay = knnReg.predict(X_test)
print(MAE(predict_Bay, values_test))

0.6108969004714375


In [None]:
linReg.fit(X_train, values_train)
predict_lin = linReg.predict(X_test)
print(MAE(predict_lin, values_test))

1.1442384920245439


In [None]:
randForReg.fit(X_train, values_train)
predict_rfg = randForReg.predict(X_test)
print(MAE(predict_rfg, values_test))

In [None]:
data_dmatrix = xgb.DMatrix(data=X_train, label=values_train)

In [None]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.01,
                max_depth = 20, alpha = 10, n_estimators = 100)

In [None]:
xg_reg.fit(X_train, values_train)

XGBRegressor(alpha=5, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=10, min_child_weight=1, missing=None, n_estimators=50,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [None]:
preds = xg_reg.predict(X_test)

In [None]:
MAE(preds, values_test)

0.6062570012740012