# Kategorinen data ja puuttuvien arvojen käsittely

In [None]:
import pandas as pd 
import numpy as np 

from sklearn import preprocessing
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_predict
from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt
import os 

In [None]:
# Load the listings data. The first run downloads it and caches a local CSV
# copy; later runs read the cached copy instead of hitting the network.
if os.path.exists('abnd_listings.csv'):
    df = pd.read_csv('abnd_listings.csv')
else:
    df = pd.read_csv('https://raw.githubusercontent.com/InfoTUNI/joda2022/master/koodiesimerkit/data.csv')
    # index=False: without it the row index is written as an extra unnamed
    # column, so the cached file would load as a different frame than the
    # one downloaded here.
    df.to_csv('abnd_listings.csv', index=False)

df.info()

In [None]:
# Work on a copy of the three columns of interest so that later edits do not
# touch the original frame.
selected_columns = ['host_response_time', 'host_response_rate', 'review_scores_rating']
df_no_missing = df[selected_columns].copy()

# Peek at the first rows and at the distinct response-time values.
print(df_no_missing.head())
print(df_no_missing['host_response_time'].unique())

In [None]:
# Strip the percent sign from host_response_rate and convert the column to a
# numeric dtype.
df_no_missing['host_response_rate'] = pd.to_numeric(
    df_no_missing['host_response_rate'].str.strip('%')
)

# info() writes its summary to stdout itself and returns None, so it is called
# directly (printing the uncalled method only shows the bound-method repr).
# The bare print() keeps a blank separator line before the next output.
df_no_missing.info()
print()
print(df_no_missing.dtypes, '\n')
print(df_no_missing.head(), '\n')

# Number of missing values in each column.
null_counts = df_no_missing.isnull().sum()
print(f"Null values count: \n {null_counts}")

In [None]:
# Drop every row that has a missing value in any of the selected columns,
# then summarise what is left.
df_no_missing = df_no_missing.dropna(axis=0, how='any')
df_no_missing.info()

In [None]:
# Encode the categorical host_response_time column as integer codes and
# overwrite the column with them.
# NOTE(review): LabelEncoder presumably numbers the labels in sorted order, so
# the codes need not follow the actual ordering of response times - confirm
# this is acceptable for a linear model.
le = preprocessing.LabelEncoder()
arr = le.fit_transform(df_no_missing['host_response_time'])
df_no_missing['host_response_time'] = arr

In [None]:
# Show the encoded values and the first rows of the updated frame.
print(arr)
df_no_missing.head(n=5)

In [None]:
# Linear model that predicts review_scores_rating from the remaining columns.
# Target: the rating column; features: everything else in the frame.
y = df_no_missing['review_scores_rating']
X = df_no_missing.drop(columns=['review_scores_rating'])
lr = linear_model.LinearRegression()


In [None]:
# Out-of-fold predictions from 10-fold cross-validation.
predictions = cross_val_predict(lr, X, y, cv=10)

# Measured vs. predicted scatter plot; the dashed diagonal marks where a
# perfect prediction would fall.
fig, ax = plt.subplots(figsize=(20, 10))
ax.scatter(y, predictions, edgecolors=(0, 0, 0))
diagonal = [y.min(), y.max()]
ax.plot(diagonal, diagonal, 'k--', linewidth=4)
ax.set(xlabel='Measured', ylabel='Predicted')
plt.show()

In [None]:
# Pairwise (Pearson) correlations between the three columns.
print(df_no_missing.corr(method='pearson'))


In [None]:
# Errors of the cross-validated predictions against the measured ratings.
# Both metrics are symmetric in their two arguments, so passing them by
# keyword in the documented (y_true, y_pred) order gives the same values.
mse = mean_squared_error(y_true=y, y_pred=predictions)
mae = mean_absolute_error(y_true=y, y_pred=predictions)
print(f"Mean squared error: {mse:.2f}\nMean absolute error: {mae:.2f}")

# Toinen lähestymistapa - Täytetään tyhjät arvot mediaanilla 

In [None]:
# Copy the review sub-scores together with the overall rating into a
# separate frame.
review_columns = [
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'review_scores_rating',
]
df_imp = df[review_columns].copy()

# Number of missing values in each column.
print(df_imp.isna().sum())


In [None]:
# Inspect the per-column medians of the selected variables.
df_imp.median(axis=0)

In [None]:
# Fill each missing score with the median of its own column, then confirm
# that no missing values remain.
column_medians = df_imp.median()
df_imp = df_imp.fillna(column_medians)
print(df_imp.isna().sum())

# Ennustetaan lineaarisella mallilla arvostelut, kun tyhjät arvot on korvattu kyseisen sarakkeen mediaanilla

In [None]:
# Linear model that predicts review_scores_rating from the median-imputed
# sub-scores.
y = df_imp['review_scores_rating']
X = df_imp.drop(columns=['review_scores_rating'])

lr = linear_model.LinearRegression()

# Out-of-fold predictions from 10-fold cross-validation.
predictions = cross_val_predict(lr, X, y, cv=10)

In [None]:
# Measured vs. predicted scatter plot; the dashed diagonal marks where a
# perfect prediction would fall.
fig, ax = plt.subplots(figsize=(20, 10))
ax.scatter(y, predictions, edgecolors=(0, 0, 0))
diagonal = [y.min(), y.max()]
ax.plot(diagonal, diagonal, 'k--', linewidth=4)
ax.set(xlabel='Measured', ylabel='Predicted')
plt.show()

In [None]:
# Correlation of every column with the overall rating.
print(df_imp.corr(method='pearson')['review_scores_rating'])

In [None]:
# Errors of the linear model's cross-validated predictions.
# Both metrics are symmetric in their two arguments, so passing them by
# keyword in the documented (y_true, y_pred) order gives the same values.
mse = mean_squared_error(y_true=y, y_pred=predictions)
mae = mean_absolute_error(y_true=y, y_pred=predictions)
print(f"Mean squared error: {mse:.2f}\nMean absolute error: {mae:.2f}")

# Käytetään satunnaista päätöspuumetsää ja optimoidaan hyperparametrit satunnaisella haulla 

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Random forest regressor predicting review_scores_rating from the
# median-imputed sub-scores.
rf = RandomForestRegressor()
y = df_imp.review_scores_rating
X = df_imp.drop(columns='review_scores_rating')

# Hyperparameter search space.
param_space = {
    'n_estimators': np.arange(1, 500, 10),   # 1, 11, ..., 491 trees
    'max_depth': np.arange(3, 11),           # 3..10
    'min_samples_split': np.arange(2, 11),   # 2..10
    'min_samples_leaf': np.arange(1, 6),     # 1..5
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False],
}

# Randomized search: 1000 sampled configurations, each scored with 5-fold
# cross-validation.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_space,
    cv=5,
    n_jobs=-2,
    n_iter=1000,
    verbose=0,
)

random_search.fit(X, y)
best_model = random_search.best_estimator_

# Evaluate the tuned configuration on out-of-fold predictions. Predicting on
# the very rows the forest was fitted on would report training error, which
# is optimistic and not comparable with the cross-validated linear-model
# results obtained earlier with cross_val_predict.
# NOTE: the hyperparameters were still selected on this same data, so the
# figures below remain slightly optimistic; a held-out test set would remove
# that remaining bias.
predictions = cross_val_predict(best_model, X, y, cv=10)

# Measured vs. predicted scatter plot; the dashed diagonal marks where a
# perfect prediction would fall.
fig, ax = plt.subplots(figsize=(20, 10))
ax.scatter(y, predictions, edgecolors=(0, 0, 0))
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()

mse = mean_squared_error(y, predictions)
mae = mean_absolute_error(y, predictions)
print(f"Mean squared error: {mse:.2f}\nMean absolute error: {mae:.2f}")

# Käytetään satunnaista päätöspuumetsää ennustamisessa. Hyperparametrit optimoidaan HalvingRandomSearchCV-haulla

In [None]:
from sklearn.ensemble import RandomForestRegressor
# This import must come before HalvingRandomSearchCV: it is what enables the
# experimental halving search estimators.
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.model_selection import HalvingRandomSearchCV

# Random forest regressor predicting review_scores_rating from the
# median-imputed sub-scores.
rf = RandomForestRegressor()
y = df_imp.review_scores_rating
X = df_imp.drop(columns='review_scores_rating')

# Hyperparameter search space.
param_space = {
    'n_estimators': np.arange(1, 500, 100),  # 1, 101, 201, 301, 401 trees
    'max_depth': np.arange(3, 11),           # 3..10
    'min_samples_split': np.arange(2, 11),   # 2..10
    'min_samples_leaf': np.arange(1, 6),     # 1..5
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False],
}

# Successive-halving randomized search, scored with 5-fold cross-validation.
halving_random_search = HalvingRandomSearchCV(
    estimator=rf,
    param_distributions=param_space,
    cv=5,
    n_jobs=-2,
    aggressive_elimination=True,
    factor=2,
    verbose=0,
)

halving_random_search.fit(X, y)
best_model = halving_random_search.best_estimator_

# Evaluate the tuned configuration on out-of-fold predictions. Predicting on
# the very rows the forest was fitted on would report training error, which
# is optimistic and not comparable with the cross-validated linear-model
# results obtained earlier with cross_val_predict.
# NOTE: the hyperparameters were still selected on this same data, so the
# figures below remain slightly optimistic; a held-out test set would remove
# that remaining bias.
predictions = cross_val_predict(best_model, X, y, cv=10)

# Measured vs. predicted scatter plot; the dashed diagonal marks where a
# perfect prediction would fall.
fig, ax = plt.subplots(figsize=(20, 10))
ax.scatter(y, predictions, edgecolors=(0, 0, 0))
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()

mse = mean_squared_error(y, predictions)
mae = mean_absolute_error(y, predictions)
# Fixed the typo "erro" in the printed label.
print(f"Mean squared error: {mse:.2f}\nMean absolute error: {mae:.2f}")



