In [18]:
from pathlib import Path
import pandas as pd
import numpy as np
from joblib import load, dump
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, multilabel_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from sklearn.preprocessing import LabelEncoder
import plotly.graph_objects as go
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")

In [19]:
BASE_DIR = Path.cwd().parent

In [20]:
df_hosts_reviews_en_labeled = pd.read_parquet(BASE_DIR / 'processed_data' / 'hosts_reviews_en_topics.parquet')

In [21]:
df_hosts_reviews_en_labeled['neighbourhood'] = df_hosts_reviews_en_labeled['neighbourhood'].str.lower()
df = df_hosts_reviews_en_labeled[['comments', 'neighbourhood']]
remove_neighb = 'enskede|årsta|vantörs|östermalm|norrmalm|\
                    kungsholm|skarpnäck|södermalm|skärholmen|\
                    bromma|hägersten|farsta|älvsjö|hässelby|\
                    vällingby|rinkeby|tensta|spånga|'
df.loc[:, 'comments'] = df['comments'].str.replace(remove_neighb, '', regex=True)

In [22]:
df_extended = df_hosts_reviews_en_labeled[['comments', 'neighbourhood', 
                                           'price', 'number_of_reviews', 'reviews_per_month',
                                          'number_of_reviews_ltm', 'room_type']]
df_extended = pd.get_dummies(df_extended, columns=['room_type'], drop_first=True)
df_extended[['room_type_Hotel room', 'room_type_Private room', 'room_type_Shared room']] = df_extended[['room_type_Hotel room', 'room_type_Private room', 'room_type_Shared room']].astype(int)

In [23]:
def split_data(df, labels, add_features='n'):
    df_model = df[df['neighbourhood'].isin(labels) == True]
    train_df, test_df = train_test_split(df_model, test_size=0.2, random_state=42)
    vectorizer = TfidfVectorizer(stop_words='english',
                            ngram_range=(1,2),
                            min_df=10,
                            max_df=0.9
                            )
    X_train = vectorizer.fit_transform(train_df['comments'])
    X_test = vectorizer.transform(test_df['comments'])

    if add_features == 'y':
        scaler = MinMaxScaler()
        train_price_scaled = scaler.fit_transform(train_df.drop(['comments', 'neighbourhood'], axis=1).values.reshape(-1, 7))
        test_price_scaled = scaler.transform(test_df.drop(['comments', 'neighbourhood'], axis=1).values.reshape(-1, 7))
    
        # Concatenate the features
        X_train = np.hstack((X_train.toarray(), train_price_scaled))
        X_test = np.hstack((X_test.toarray(), test_price_scaled))

    y_train = train_df['neighbourhood']
    y_test = test_df['neighbourhood']
    return X_train, X_test, y_train, y_test

In [24]:
def classifier_model(df, labels, classifier_, model_id, add_features='n'):
    X_train, X_test, y_train, y_test = split_data(df, labels, add_features)
 
    # Train the classifier
    classifier = classifier_
    classifier.fit(X_train, y_train)
    
    # save model
    dump(classifier, f'saved_models/model_{classifier_}_{model_id}.joblib')

In [25]:
def evaluate_model(df, labels, model_name, add_features='n'):
    X_train, X_test, y_train, y_test = split_data(df, labels, add_features)
    classifier = load(BASE_DIR / 'models/saved_models' / f'{model_name}.joblib')

   # Test the classifier and evaluate its accuracy
    accuracy = classifier.score(X_test, y_test).round(2)
    print("Accuracy:", accuracy)
    y_pred = classifier.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    
    fig = px.imshow(cm, 
                x=labels, y=labels, 
                text_auto=True,
                color_continuous_scale='Blues'
                )
    fig.update_layout(title='Confusion Matrix',
                    xaxis=dict(title='Predicted Label'),
                    yaxis=dict(title='True Label'))
    fig.show()
    # y_true and y_pred are arrays of shape (n_samples, n_classes)
    # where n_samples is the number of samples and n_classes is the number of classes
       
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macor')
    f1 = f1_score(y_test, y_pred, average='macro')

    print(f"Precision:, {precision: 0.2f}")
    print(f"Recall:, {recall: 0.2f}")
    print(f"F1-score:, {f1: 0.2f}")
    print()
    cm_ = multilabel_confusion_matrix(y_test, y_pred)

    for i, label in enumerate(labels):
        tn, fp, fn, tp = cm_[i].ravel()
        print("Label:", label)
        print("True Negative:", tn)
        print("False Positive:", fp)
        print("False Negative:", fn)
        print("True Positive:", tp)

In [26]:
labels = ['enskede-årsta-vantör', 'östermalm']
classifier_model(df, labels, MultinomialNB(), '2n')
evaluate_model(df, labels, 'model_MultinomialNB()_2n')

Accuracy: 0.74


Precision:,  0.74
Recall:,  0.74
F1-score:,  0.74

Label: enskede-årsta-vantör
True Negative: 762
False Positive: 188
False Negative: 267
True Positive: 534
Label: östermalm
True Negative: 534
False Positive: 267
False Negative: 188
True Positive: 762


In [27]:
labels = ['enskede-årsta-vantör', 'kungsholmen', 'norrmalm', 'östermalm']
classifier_model(df, labels, MultinomialNB(), '4n')
evaluate_model(df, labels, 'model_MultinomialNB()_4n')

Accuracy: 0.48


Precision:,  0.52
Recall:,  0.48
F1-score:,  0.45

Label: enskede-årsta-vantör
True Negative: 3889
False Positive: 120
False Negative: 560
True Positive: 261
Label: kungsholmen
True Negative: 1879
False Positive: 1290
False Negative: 556
True Positive: 1105
Label: norrmalm
True Negative: 2351
False Positive: 1006
False Negative: 622
True Positive: 851
Label: östermalm
True Negative: 3869
False Positive: 86
False Negative: 764
True Positive: 111


In [28]:
labels = ['enskede-årsta-vantör', 'kungsholmen', 'norrmalm', 'södermalm', 'östermalm']
classifier_model(df, labels, MultinomialNB(), '5n')
evaluate_model(df, labels, 'model_MultinomialNB()_5n')

Accuracy: 0.59


Precision:,  0.67
Recall:,  0.59
F1-score:,  0.47

Label: enskede-årsta-vantör
True Negative: 10447
False Positive: 16
False Negative: 777
True Positive: 61
Label: kungsholmen
True Negative: 9600
False Positive: 81
False Negative: 1478
True Positive: 142
Label: norrmalm
True Negative: 9783
False Positive: 33
False Negative: 1369
True Positive: 116
Label: södermalm
True Negative: 410
False Positive: 4523
False Negative: 45
True Positive: 6323
Label: östermalm
True Negative: 10311
False Positive: 0
False Negative: 984
True Positive: 6


In [29]:
labels = ['enskede-årsta-vantör', 'östermalm']
classifier_model(df_extended, labels, GradientBoostingClassifier(random_state=42), '2y', add_features='y')

In [30]:
evaluate_model(df_extended, labels, 'model_GradientBoostingClassifier(random_state=42)_2y', add_features='y')

Accuracy: 0.93


Precision:,  0.93
Recall:,  0.93
F1-score:,  0.93

Label: enskede-årsta-vantör
True Negative: 864
False Positive: 86
False Negative: 36
True Positive: 765
Label: östermalm
True Negative: 765
False Positive: 36
False Negative: 86
True Positive: 864


In [31]:
labels = ['enskede-årsta-vantör', 'kungsholmen', 'norrmalm', 'östermalm']
classifier_model(df_extended, labels, GradientBoostingClassifier(random_state=42), '4y', add_features='y')

In [32]:
evaluate_model(df_extended, labels, 'model_GradientBoostingClassifier(random_state=42)_4y', add_features='y')

Accuracy: 0.84


Precision:,  0.84
Recall:,  0.84
F1-score:,  0.84

Label: enskede-årsta-vantör
True Negative: 3865
False Positive: 144
False Negative: 187
True Positive: 634
Label: kungsholmen
True Negative: 2997
False Positive: 172
False Negative: 279
True Positive: 1382
Label: norrmalm
True Negative: 2955
False Positive: 402
False Negative: 141
True Positive: 1332
Label: östermalm
True Negative: 3891
False Positive: 64
False Negative: 175
True Positive: 700


In [33]:
labels = ['enskede-årsta-vantör', 'kungsholmen', 'norrmalm', 'södermalm', 'östermalm']
classifier_model(df_extended, labels, GradientBoostingClassifier(random_state=42), '5y', add_features='y')

In [36]:
evaluate_model(df_extended, labels, 'model_GradientBoostingClassifier(random_state=42)_5y', add_features='y')

Accuracy: 0.81


Precision:,  0.83
Recall:,  0.81
F1-score:,  0.79

Label: enskede-årsta-vantör
True Negative: 10332
False Positive: 131
False Negative: 351
True Positive: 487
Label: kungsholmen
True Negative: 9552
False Positive: 129
False Negative: 442
True Positive: 1178
Label: norrmalm
True Negative: 9783
False Positive: 33
False Negative: 680
True Positive: 805
Label: södermalm
True Negative: 3044
False Positive: 1889
False Negative: 146
True Positive: 6222
Label: östermalm
True Negative: 10304
False Positive: 7
False Negative: 570
True Positive: 420
