# Kategorinen data ja puuttuvien arvojen käsittely

In [None]:
import pandas as pd 
import numpy as np 

from sklearn import preprocessing
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_predict
from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt
import os 

In [None]:
# Load the listings data, caching a local CSV copy after the first download.
if os.path.exists('abnd_listings.csv'):
    df = pd.read_csv('abnd_listings.csv')
else:
    df = pd.read_csv('https://raw.githubusercontent.com/InfoTUNI/joda2022/master/koodiesimerkit/data.csv')
    # index=False keeps the row index out of the cache file; otherwise it is
    # written as an unnamed first column and read back as an extra
    # 'Unnamed: 0' column, so cached and freshly downloaded frames differ.
    df.to_csv('abnd_listings.csv', index=False)

df.info()

In [None]:
# Work on an independent copy of the three columns used by the first model.
df_no_missing = df.loc[
    :, ['host_response_time', 'host_response_rate', 'review_scores_rating']
].copy()

# Preview the first rows and list the distinct response-time values.
print(df_no_missing.head())
print(df_no_missing['host_response_time'].unique())

In [None]:
# Strip the percent sign from host_response_rate and convert the column to numbers.
df_no_missing['host_response_rate'] = pd.to_numeric(
    df_no_missing['host_response_rate'].str.strip('%')
)

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# has to be called; printing the bound method only shows the method's repr.
df_no_missing.info()
print()
print(df_no_missing.dtypes, '\n')
print(df_no_missing.head(), '\n')

# Number of missing values per column.
null_counts = df_no_missing.isnull().sum()
print(f"Null values count: \n {null_counts}")

In [None]:
# Keep only the rows that have a value in every selected column.
df_no_missing = df_no_missing.dropna(axis=0, how='any')
df_no_missing.info()

In [None]:
# Replace the host_response_time categories with integer codes.
# NOTE(review): LabelEncoder is documented for target labels and numbers the
# classes in sorted label order — confirm that order is meaningful as a
# numeric feature for the linear model.
le = preprocessing.LabelEncoder()
le.fit(df_no_missing['host_response_time'])
arr = le.transform(df_no_missing['host_response_time'])
df_no_missing['host_response_time'] = arr

In [None]:
# Show the integer codes and the first rows of the encoded frame.
print(arr)
df_no_missing.head(n=5)

In [None]:
# Linear model that predicts review_scores_rating from the remaining columns.
lr = linear_model.LinearRegression()
X = df_no_missing.drop('review_scores_rating', axis=1)
y = df_no_missing['review_scores_rating']


In [None]:
# Predictions for every row obtained through 10-fold cross-validation.
predictions = cross_val_predict(lr, X, y, cv=10)

# Measured vs. predicted ratings; the dashed diagonal marks a perfect prediction.
fig, ax = plt.subplots(figsize=(20, 10))
ax.scatter(y, predictions, edgecolors=(0, 0, 0))
ax.plot(
    [y.min(), y.max()],
    [y.min(), y.max()],
    'k--',
    linewidth=4,
)
ax.set(xlabel='Measured', ylabel='Predicted')
plt.show()

In [None]:
# Pairwise Pearson correlations between the encoded columns.
print(df_no_missing.corr(method='pearson'))


In [None]:
# scikit-learn metrics are declared as metric(y_true, y_pred): pass the measured
# ratings first so the calls match the documented signature.
mse = mean_squared_error(y, predictions)
mae = mean_absolute_error(y, predictions)
print(f"Mean squared error: {mse:.2f}\nMean absolute error: {mae:.2f}")

# Toinen lähestymistapa - Täytetään tyhjät arvot mediaanilla 

In [None]:
# Independent copy of the review-score columns used by the imputation approach.
df_imp = df.loc[:, [
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'review_scores_rating',
]].copy()

# Number of missing values per column.
print(df_imp.isna().sum())


In [None]:
# Inspect the median of each selected column.
df_imp.median(axis=0)

In [None]:
# Rows without a target value cannot be used to train or score the model:
# filling review_scores_rating with its median would fabricate labels, so
# those rows are dropped instead of imputed.
df_imp = df_imp.dropna(subset=['review_scores_rating'])

# Fill the remaining gaps in the feature columns with each column's median.
# NOTE(review): the medians are computed on the whole frame, i.e. before the
# train/test split — consider imputing inside a Pipeline fitted on the
# training rows only.
feature_medians = df_imp.drop(columns='review_scores_rating').median()
df_imp = df_imp.fillna(feature_medians)
print(df_imp.isnull().sum())

# Ennustetaan arvostelut lineaarisella mallilla, kun tyhjät arvot on korvattu kyseisen sarakkeen mediaanilla

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target (review_scores_rating) from the feature columns.
y = df_imp['review_scores_rating']
X = df_imp.drop('review_scores_rating', axis=1)

# Hold out 20 % of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the linear model on the training rows and predict the held-out rows.
lr = linear_model.LinearRegression().fit(X_train, y_train)
predictions = lr.predict(X_test)

In [None]:
# Measured vs. predicted ratings on the held-out rows; the dashed diagonal
# marks a perfect prediction.
fig, ax = plt.subplots(figsize=(20, 10))
ax.scatter(y_test, predictions, edgecolors=(0, 0, 0))
ax.plot(
    [y_test.min(), y_test.max()],
    [y_test.min(), y_test.max()],
    'k--',
    linewidth=4,
)
ax.set(xlabel='Measured', ylabel='Predicted')
plt.show()

In [None]:
# Correlation of every column with the target column.
print(df_imp.corr().loc[:, 'review_scores_rating'])

In [None]:
# Errors of the linear model on the held-out rows.
# scikit-learn metrics are declared as metric(y_true, y_pred): pass the
# measured values first so the calls match the documented signature.
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

# Table that collects the performance of each model, one row per model.
models_performance = pd.DataFrame({'Model Name': ['Linear Regression'], 'MSE': [mse], 'MAE': [mae]})


# Käytetään satunnaista päätöspuumetsää ja optimoidaan hyperparametrit satunnaisella haulla 

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Seed the forest so repeated runs build the same trees, in line with the
# seeded train/test split.
rf = RandomForestRegressor(random_state=42)

# Hyperparameter search space.
param_space = {
    'n_estimators': np.arange(1, 500, 10),
    'max_depth': np.arange(3, 11),
    'min_samples_split': np.arange(2, 11),
    'min_samples_leaf': np.arange(1, 6),
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False]
}

# Randomised search: 100 sampled candidates, each scored with 5-fold
# cross-validation. n_jobs=-2 uses all CPU cores but one; random_state fixes
# which candidates are drawn so the search is repeatable.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_space,
    cv=5,
    n_jobs=-2,
    n_iter=100,
    verbose=0,
    random_state=42,
)

random_search.fit(X_train, y_train)
best_model = random_search.best_estimator_

# Predict the held-out rows with the best estimator found by the search.
predictions = best_model.predict(X_test)

# Measured vs. predicted ratings; the dashed diagonal marks a perfect prediction.
fig, ax = plt.subplots(figsize=(20, 10))
ax.scatter(y_test, predictions, edgecolors=(0, 0, 0))
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()

# scikit-learn metrics are declared as metric(y_true, y_pred).
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

# Append this model's scores to the comparison table.
models_performance = pd.concat([models_performance, pd.DataFrame({'Model Name': ['RandomForestRegressor'], 'MSE': [mse], 'MAE': [mae]})], ignore_index=True)


# Testataan XGBoost mallia ja etsitään sillekin hyperparametrit 

In [None]:
import xgboost

# XGBoost regressor with a squared-error objective; seeded so repeated runs
# give the same model, in line with the seeded train/test split.
xgb = xgboost.XGBRegressor(objective='reg:squarederror', random_state=42)

# Hyperparameter search space.
param_space = {
    'n_estimators': np.arange(1, 500, 10),
    'max_depth': np.arange(3, 11),
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 1, 5],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2]
}

# Randomised search: 100 sampled candidates, each scored with 5-fold
# cross-validation. random_state fixes which candidates are drawn.
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_space,
    cv=5,
    n_jobs=-2,
    n_iter=100,
    verbose=0,
    random_state=42,
)

random_search.fit(X_train, y_train)
best_model = random_search.best_estimator_

# Predict the held-out rows with the best estimator found by the search.
predictions = best_model.predict(X_test)

# Measured vs. predicted ratings; the dashed diagonal marks a perfect prediction.
fig, ax = plt.subplots(figsize=(20, 10))
ax.scatter(y_test, predictions, edgecolors=(0, 0, 0))
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()

# scikit-learn metrics are declared as metric(y_true, y_pred).
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
print(f"Mean squared error: {mse:.2f}\nMean absolute error: {mae:.2f}")

# Append this model's scores to the comparison table.
models_performance = pd.concat([models_performance, pd.DataFrame({'Model Name': ['XGBoost'], 'MSE': [mse], 'MAE': [mae]})], ignore_index=True)

# Optimoidaan NN hyperparametrit ensin satunnaisella haulla

In [None]:
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
# `kerastuner` is the deprecated import name of the KerasTuner package; use
# `keras_tuner`, the module name the Bayesian-optimisation cell also imports.
from keras_tuner import RandomSearch
from tensorflow.keras.callbacks import ModelCheckpoint

# Trained networks are collected here for the later feature-extraction step.
models = []

# Standardise the features; the scaler is fitted on the training rows only
# and then applied unchanged to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

def build_model(hp):
    """Build and compile a regression network with two tunable hidden layers.

    Args:
        hp: hyperparameter object supplied by the tuner; it is queried for
            ``units_0``, ``units_1`` and ``learning_rate``.

    Returns:
        The compiled ``keras.Sequential`` model (Adam optimizer, MSE loss).
    """
    n_features = X_train_scaled.shape[1]

    model = keras.Sequential()
    model.add(keras.layers.Input(shape=(n_features,)))
    # Search space for the number of neurons in each hidden layer.
    for units_name in ('units_0', 'units_1'):
        n_units = hp.Int(units_name, min_value=1, max_value=64, step=4)
        model.add(keras.layers.Dense(units=n_units, activation='relu'))
    # Single linear output for the regression target.
    model.add(keras.layers.Dense(1, activation='linear'))

    step_size = hp.Choice('learning_rate', values=[1e-1, 1e-2, 1e-3])
    model.compile(optimizer=keras.optimizers.Adam(step_size), loss='mse')
    return model

tuner = RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=1,
    executions_per_trial=1,
    directory='NN_search',
    project_name='kt_random')

# Validate on a slice held out from the TRAINING data. Using the test set as
# validation data would let it steer both the hyperparameter choice and the
# checkpoint selection, so the test scores reported below would be biased.
# The target is passed as a NumPy array so Keras can slice it for
# validation_split.
y_train_array = y_train.to_numpy()
tuner.search(X_train_scaled, y_train_array, epochs=10, validation_split=0.2)

# Take the best hyperparameters and retrain a fresh network with them for longer.
best_hyperparameters = tuner.get_best_hyperparameters()[0]
print("Parhaat hyperparametrit:\n", best_hyperparameters.values)

model_random = build_model(best_hyperparameters)

# Keep only the weights of the epoch with the lowest validation loss.
checkpoint_filepath = './NN_search/kt_random/best_model_weights.weights.h5'
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

model_random.fit(X_train_scaled, y_train_array, epochs=50, validation_split=0.2, callbacks=[model_checkpoint_callback])
# Restore the best checkpoint before evaluating.
model_random.load_weights(checkpoint_filepath)

models.append(model_random)

# Evaluate once on the untouched test set.
# scikit-learn metrics are declared as metric(y_true, y_pred).
predictions = model_random.predict(X_test_scaled)
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

print(f"Mean squared error: {mse:.2f}\nMean absolute error: {mae:.2f}")

# Append this model's scores to the comparison table.
models_performance = pd.concat([models_performance, pd.DataFrame({'Model Name': ['Neural Network RandomGrid Search'], 'MSE': [mse], 'MAE': [mae]})], ignore_index=True)





# Optimoidaan NN hyperparametrit käyttäen Bayesilaista optimointia

In [None]:
from keras_tuner import BayesianOptimization

def build_model(hp):
    """Build and compile a regression network with two tunable hidden layers.

    Args:
        hp: hyperparameter object supplied by the tuner; it is queried for
            ``units_0``, ``units_1`` and ``learning_rate``.

    Returns:
        The compiled ``keras.Sequential`` model (Adam optimizer, MSE loss).
    """
    input_width = X_train_scaled.shape[1]

    model = keras.Sequential()
    model.add(keras.layers.Input(shape=(input_width,)))
    # Search space for the number of neurons in each hidden layer.
    for hp_name in ('units_0', 'units_1'):
        width = hp.Int(hp_name, min_value=1, max_value=64, step=4)
        model.add(keras.layers.Dense(units=width, activation='relu'))
    # Single linear output for the regression target.
    model.add(keras.layers.Dense(1, activation='linear'))

    chosen_rate = hp.Choice('learning_rate', values=[1e-1, 1e-2, 1e-3])
    model.compile(optimizer=keras.optimizers.Adam(chosen_rate), loss='mse')
    return model


tuner = BayesianOptimization(
    build_model,
    objective='val_loss',
    max_trials=1,
    executions_per_trial=1,
    directory='NN_search',
    project_name='intro_kt_bayesian')

# Validate on a slice held out from the TRAINING data. Using the test set as
# validation data would let it steer both the hyperparameter choice and the
# checkpoint selection, so the test scores reported below would be biased.
# The target is passed as a NumPy array so Keras can slice it for
# validation_split.
y_train_array = y_train.to_numpy()
tuner.search(X_train_scaled, y_train_array, epochs=10, validation_split=0.2)

# Take the best hyperparameters and retrain a fresh network with them for longer.
best_hyperparameters = tuner.get_best_hyperparameters()[0]
print("Parhaat hyperparametrit:\n", best_hyperparameters.values)

model_bayes = build_model(best_hyperparameters)

# Keep only the weights of the epoch with the lowest validation loss. The file
# is stored inside this tuner's own project directory (intro_kt_bayesian).
checkpoint_filepath = './NN_search/intro_kt_bayesian/best_model_weights.weights.h5'
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

model_bayes.fit(X_train_scaled, y_train_array, epochs=50, validation_split=0.2, callbacks=[model_checkpoint_callback])
# Restore the best checkpoint before evaluating.
model_bayes.load_weights(checkpoint_filepath)
models.append(model_bayes)

# Evaluate once on the untouched test set.
# scikit-learn metrics are declared as metric(y_true, y_pred).
predictions = model_bayes.predict(X_test_scaled)
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

print(f"Mean squared error: {mse:.2f}\nMean absolute error: {mae:.2f}")

# Append this model's scores to the comparison table.
models_performance = pd.concat([models_performance, pd.DataFrame({'Model Name': ['Neural Network Bayes Optimization Search'], 'MSE': [mse], 'MAE': [mae]})], ignore_index=True)


# Otetaan neuroverkosta viimeinen kerros pois, käytetään näin saatuja ominaisuusvektoreita XGBoostin syötteenä ja toteutetaan hyperparametrien satunnaishaku

In [None]:
model_names = ['RandomGrid Search', 'Bayes Optimization Search']

# `models` grows every time a network cell is re-run. Check up front that every
# network has a name, instead of hitting an IndexError only after the named
# models have already been trained.
if len(models) > len(model_names):
    raise ValueError(
        f"`models` holds {len(models)} networks but only {len(model_names)} names "
        "are defined; re-run the notebook from the cell that resets `models`."
    )

# The search space does not depend on the network, so it is built once.
param_space = {
    'n_estimators': np.arange(1, 500, 10),
    'max_depth': np.arange(3, 11),
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 1, 5],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2]
}

for model_name, model in zip(model_names, models):
    print(model)

    # Cut the network before its output layer and use the activations of the
    # second-to-last layer as input features for XGBoost.
    feature_extractor = tf.keras.Model(inputs=model.inputs, outputs=model.layers[-2].output)
    X_train_features = feature_extractor.predict(X_train_scaled)
    X_test_features = feature_extractor.predict(X_test_scaled)

    # Seeded estimator and search so repeated runs give the same result.
    xgb = xgboost.XGBRegressor(objective='reg:squarederror', random_state=42)

    random_search = RandomizedSearchCV(
        estimator=xgb,
        param_distributions=param_space,
        cv=5,
        n_jobs=-2,
        n_iter=1,
        verbose=0,
        random_state=42,
    )

    random_search.fit(X_train_features, y_train)
    best_model = random_search.best_estimator_

    # Predict the held-out rows with the best estimator found by the search.
    predictions = best_model.predict(X_test_features)

    # Measured vs. predicted ratings; the dashed diagonal marks a perfect prediction.
    fig, ax = plt.subplots(figsize=(20, 10))
    ax.scatter(y_test, predictions, edgecolors=(0, 0, 0))
    ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    plt.show()

    # scikit-learn metrics are declared as metric(y_true, y_pred).
    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    print(f"Mean squared error: {mse:.2f}\nMean absolute error: {mae:.2f}")

    # Append this model's scores to the comparison table.
    models_performance = pd.concat([models_performance, pd.DataFrame({'Model Name': [f'XGBoost with Neural Network {model_name}'], 'MSE': [mse], 'MAE': [mae]})], ignore_index=True)


In [None]:
import seaborn as sns

# Bar chart comparing the MSE of every model in the comparison table.
bar_fig, bar_ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=models_performance, x='Model Name', y='MSE', ax=bar_ax)
bar_ax.set_title('MSE Comparison among Different Models')
bar_ax.set_xlabel('Model Name')
bar_ax.set_ylabel('Mean Squared Error (MSE)')
# Rotate the model names so the long labels do not overlap.
plt.xticks(rotation=45)
plt.show()

print(models_performance)
