In [1]:
# Mount Google Drive inside the Colab VM so the project folder is reachable as a normal path.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
cd drive/MyDrive/ML_German

/content/drive/MyDrive/ML_German


In [3]:
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus into the runtime's nltk_data directory
# (the corpus object imported above is only read when it is first accessed).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
import pandas as pd
import numpy as np
from string import punctuation

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
from sklearn.model_selection import KFold 
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE

import xgboost as xgb
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR, NuSVR
from sklearn.linear_model import BayesianRidge, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.kernel_ridge import KernelRidge

import joblib

In [5]:
# Load the training and validation tables from the project's csv_files folder
# (paths are relative to the current working directory).
training_data, validation_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        './csv_files/training_no_emoji.csv',
        './csv_files/validation_no_emoji.csv',
    )
)

In [6]:
# Whitelist of characters kept by filter_texts: space, a-z and the German letters ö, ä, ü, ß.
allowed_charachters = [ch for ch in ' abcdefghijklmnopqrstuvwxyz' + 'öäüß']
# Set view of the whitelist so each per-character membership test is a hash lookup
# instead of a linear scan of the list.
_allowed_character_set = frozenset(allowed_charachters)


def filter_texts(texts):
  """Lower-case every text and drop all characters outside the whitelist.

  Args:
    texts: iterable of texts. Entries that are not ``str`` (e.g. ``None`` or
      a float NaN) are mapped to the empty string so that the output stays
      aligned one-to-one with the input.

  Returns:
    A list of strings, one per input entry, containing only characters from
    ``allowed_charachters``.
  """
  return [
      ''.join(ch for ch in text.lower() if ch in _allowed_character_set)
      if isinstance(text, str) else ''
      for text in texts
  ]

In [7]:
# TF-IDF over character n-grams of length 3 to 5, capped at 9000 features.
# No stop_words argument is passed: scikit-learn only applies stop words when
# analyzer == 'word', so with analyzer='char' it had no effect on the features
# (and recent versions emit a warning about the unused parameter).
count_vect = TfidfVectorizer(ngram_range=(3, 5), analyzer='char', max_features=9000)

In [8]:
# Hold out 20% of the labelled rows for evaluation; the seed is fixed so the split is repeatable.
text_column = training_data['Text']
lat_column = training_data['Lat']  # regression target (presumably latitude - confirm against the dataset)
data_train, data_test, values_train, values_test = train_test_split(
    text_column,
    lat_column,
    test_size=0.2,
    random_state=0,
)

In [9]:
# Normalise both splits with the same character filter before vectorising.
data_train, data_test = map(filter_texts, (data_train, data_test))

In [10]:
# Fit the vectorizer on the training texts only and build the training feature matrix.
X_train = count_vect.fit_transform(data_train)
# Reuse the already-fitted vectorizer for the held-out texts (no refit), so both
# matrices share the same feature columns.
X_test = count_vect.transform(data_test)

In [24]:
# Candidate regressors compared in the cells below.
svr = SVR(cache_size=1500, max_iter=10**8)
nuSvr = NuSVR(C=10, cache_size=1500, max_iter=10**8)
kernRidge = KernelRidge()
# scikit-learn renamed BayesianRidge's iteration cap from `n_iter` to `max_iter`
# (old name deprecated in 1.3 and removed in 1.5). Pick whichever the installed
# version exposes so the cell runs on both old and current releases.
_bay_iter_param = 'max_iter' if 'max_iter' in BayesianRidge().get_params() else 'n_iter'
bayReg = BayesianRidge(**{_bay_iter_param: 10 ** 8})
linReg = LinearRegression()
knnReg = KNeighborsRegressor(n_neighbors=3)
# The criterion is left at its default (squared error in every scikit-learn
# version); the explicit 'mse' spelling was removed in 1.2 and raises there.
randForReg = RandomForestRegressor(max_depth=25, n_estimators=100, n_jobs=-1, random_state=0)

0.5212929939015993
0.5207256864106871

In [None]:
# The sparse TF-IDF matrices are converted to dense arrays before being handed to BayesianRidge.
bayReg.fit(X_train.toarray(), values_train)
predict_Bay = bayReg.predict(X_test.toarray())
# scikit-learn metrics take (y_true, y_pred); MAE is symmetric, so the printed
# value is unchanged, but the call now follows the documented argument order.
print(MAE(values_test, predict_Bay))

0.5216377969787024


In [None]:
# Ordinary least squares on the TF-IDF features, scored on the held-out split.
linReg.fit(X_train, values_train)
predict_lin = linReg.predict(X_test)
# scikit-learn metrics take (y_true, y_pred); MAE is symmetric, so the printed
# value is unchanged, but the call now follows the documented argument order.
print(MAE(values_test, predict_lin))

1.4672372959642161


In [25]:
# Random forest fit on the full training split, scored on the held-out split.
randForReg.fit(X_train, values_train)
predict_rfg = randForReg.predict(X_test)
# scikit-learn metrics take (y_true, y_pred); MAE is symmetric, so the printed
# value is unchanged, but the call now follows the documented argument order.
print(MAE(values_test, predict_rfg))

0.5314255242783532


In [20]:
# Quick experiment on the first SUBSET_SIZE rows of each split.
# NOTE: this refits the same randForReg instance, replacing whatever it was fit on before,
# and rebinds predict_rfg.
SUBSET_SIZE = 2000
# The label Series keep the shuffled index from train_test_split, so rows are
# selected positionally with .iloc to stay aligned with the matrix row slices.
randForReg.fit(X_train[:SUBSET_SIZE], values_train.iloc[:SUBSET_SIZE])
predict_rfg = randForReg.predict(X_test[:SUBSET_SIZE])
# scikit-learn metrics take (y_true, y_pred); MAE is symmetric, so the value is unchanged.
print(MAE(values_test.iloc[:SUBSET_SIZE], predict_rfg))

0.6252749846924682


In [None]:
# Wrap the training features and labels in XGBoost's native DMatrix container.
# NOTE(review): data_dmatrix is not referenced by any other visible cell - the
# regressor below is fit directly on X_train; confirm whether this is still needed.
data_dmatrix = xgb.DMatrix(
    data=X_train,
    label=values_train,
)

In [None]:
# Gradient-boosted trees with squared-error objective.
# The L1 penalty is passed as `reg_alpha`, the scikit-learn wrapper's documented
# name. The native alias `alpha` is forwarded as an extra keyword next to the
# wrapper's own reg_alpha, which leaves two spellings of the same setting in play.
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.01,
    max_depth=20,
    reg_alpha=10,
    n_estimators=100,
)

In [None]:
# Train the boosted-tree regressor on the TF-IDF features; the fitted estimator
# is the cell's last expression, so its repr is shown as the cell output.
xg_reg.fit(X=X_train, y=values_train)

XGBRegressor(alpha=5, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=10, min_child_weight=1, missing=None, n_estimators=50,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [None]:
# Predict the target for the held-out split with the fitted XGBoost regressor.
preds = xg_reg.predict(X_test)

In [None]:
# Mean absolute error of the XGBoost predictions on the held-out split.
# scikit-learn metrics take (y_true, y_pred); MAE is symmetric, so the value is unchanged.
MAE(values_test, preds)

0.6062570012740012