In [20]:
from pathlib import Path
import pandas as pd
import numpy as np
from joblib import load, dump
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, multilabel_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from sklearn.preprocessing import LabelEncoder
import plotly.graph_objects as go
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")

In [4]:
BASE_DIR = Path.cwd().parent

In [5]:
df_hosts_reviews_en_labeled = pd.read_parquet(BASE_DIR / 'processed_data' / 'hosts_reviews_en_topics.parquet')

In [6]:
df_hosts_reviews_en_labeled['neighbourhood'] = df_hosts_reviews_en_labeled['neighbourhood'].str.lower()
df = df_hosts_reviews_en_labeled[['comments', 'neighbourhood']]
remove_neighb = 'enskede|årsta|vantörs|östermalm|norrmalm|\
                    kungsholm|skarpnäck|södermalm|skärholmen|\
                    bromma|hägersten|farsta|älvsjö|hässelby|\
                    vällingby|rinkeby|tensta|spånga|'
df.loc[:, 'comments'] = df['comments'].str.replace(remove_neighb, '', regex=True)

In [7]:
df_extended = df_hosts_reviews_en_labeled[['comments', 'neighbourhood', 
                                           'price', 'number_of_reviews', 'reviews_per_month',
                                          'number_of_reviews_ltm', 'room_type']]
df_extended = pd.get_dummies(df_extended, columns=['room_type'], drop_first=True)
cols = ['room_type_Hotel room', 'room_type_Private room', 'room_type_Shared room']
df_extended[cols] = df_extended[cols].astype(int)

In [8]:
def split_data(df, labels, add_features='n'):
    df_model = df[df['neighbourhood'].isin(labels) == True]
    train_df, test_df = train_test_split(df_model, test_size=0.2, random_state=42)
    vectorizer = TfidfVectorizer(stop_words='english',
                            ngram_range=(1,2),
                            min_df=10,
                            max_df=0.9
                            )
    X_train = vectorizer.fit_transform(train_df['comments'])
    X_test = vectorizer.transform(test_df['comments'])

    if add_features == 'y':
        scaler = MinMaxScaler()
        train_price_scaled = scaler.fit_transform(train_df.drop(['comments', 'neighbourhood'], axis=1).values.reshape(-1, 7))
        test_price_scaled = scaler.transform(test_df.drop(['comments', 'neighbourhood'], axis=1).values.reshape(-1, 7))
    
        # Concatenate the features
        X_train = np.hstack((X_train.toarray(), train_price_scaled))
        X_test = np.hstack((X_test.toarray(), test_price_scaled))

    y_train = train_df['neighbourhood']
    y_test = test_df['neighbourhood']
    return X_train, X_test, y_train, y_test

In [9]:
def classifier_model(df, labels, classifier_, model_id, add_features='n'):
    X_train, X_test, y_train, y_test = split_data(df, labels, add_features)
 
    # Train the classifier
    classifier = classifier_
    classifier.fit(X_train, y_train)
    
    # save model
    dump(classifier, f'saved_models/model_{classifier_}_{model_id}.joblib')

In [10]:
def evaluate_model(df, labels, model_name, add_features='n'):
    X_train, X_test, y_train, y_test = split_data(df, labels, add_features)
    classifier = load(BASE_DIR / 'models/saved_models' / f'{model_name}.joblib')

    y_pred = classifier.predict(X_test)
    print(classification_report(y_test, y_pred, target_names=labels))

    cm = confusion_matrix(y_test, y_pred)
    fig = px.imshow(cm, 
                x=labels, y=labels, 
                text_auto=True,
                color_continuous_scale='Blues'
                )
    fig.update_layout(title='Confusion Matrix',
                    xaxis=dict(title='Predicted Label'),
                    yaxis=dict(title='True Label'))
    fig.show()

In [18]:
labels = ['enskede-årsta-vantör', 'östermalm']
classifier_model(df, labels, MultinomialNB(), '2n')
evaluate_model(df, labels, 'model_MultinomialNB()_2n')

                      precision    recall  f1-score   support

enskede-årsta-vantör       0.74      0.67      0.70       801
           östermalm       0.74      0.80      0.77       950

            accuracy                           0.74      1751
           macro avg       0.74      0.73      0.74      1751
        weighted avg       0.74      0.74      0.74      1751



In [19]:
labels = ['enskede-årsta-vantör', 'kungsholmen', 'norrmalm', 'östermalm']
classifier_model(df, labels, MultinomialNB(), '4n')
evaluate_model(df, labels, 'model_MultinomialNB()_4n')

                      precision    recall  f1-score   support

enskede-årsta-vantör       0.69      0.32      0.43       821
         kungsholmen       0.46      0.67      0.55      1661
            norrmalm       0.46      0.58      0.51      1473
           östermalm       0.56      0.13      0.21       875

            accuracy                           0.48      4830
           macro avg       0.54      0.42      0.42      4830
        weighted avg       0.52      0.48      0.45      4830



In [20]:
labels = ['enskede-årsta-vantör', 'kungsholmen', 'norrmalm', 'södermalm', 'östermalm']
classifier_model(df, labels, MultinomialNB(), '5n')
evaluate_model(df, labels, 'model_MultinomialNB()_5n')

                      precision    recall  f1-score   support

enskede-årsta-vantör       0.79      0.07      0.13       838
         kungsholmen       0.65      0.09      0.16      1620
            norrmalm       0.78      0.08      0.14      1485
           södermalm       0.58      0.99      0.73      6368
           östermalm       1.00      0.01      0.01       990

            accuracy                           0.59     11301
           macro avg       0.76      0.25      0.24     11301
        weighted avg       0.67      0.59      0.47     11301



In [21]:
labels = ['enskede-årsta-vantör', 'östermalm']
classifier_model(df_extended, labels, GradientBoostingClassifier(random_state=42), '2y', add_features='y')

In [22]:
evaluate_model(df_extended, labels, 'model_GradientBoostingClassifier(random_state=42)_2y', add_features='y')

                      precision    recall  f1-score   support

enskede-årsta-vantör       0.88      0.95      0.91       801
           östermalm       0.95      0.89      0.92       950

            accuracy                           0.92      1751
           macro avg       0.92      0.92      0.92      1751
        weighted avg       0.92      0.92      0.92      1751



In [23]:
labels = ['enskede-årsta-vantör', 'kungsholmen', 'norrmalm', 'östermalm']
classifier_model(df_extended, labels, GradientBoostingClassifier(random_state=42), '4y', add_features='y')

In [24]:
evaluate_model(df_extended, labels, 'model_GradientBoostingClassifier(random_state=42)_4y', add_features='y')

                      precision    recall  f1-score   support

enskede-årsta-vantör       0.79      0.71      0.74       821
         kungsholmen       0.86      0.85      0.85      1661
            norrmalm       0.76      0.87      0.81      1473
           östermalm       0.90      0.79      0.84       875

            accuracy                           0.82      4830
           macro avg       0.83      0.80      0.81      4830
        weighted avg       0.82      0.82      0.82      4830



In [25]:
labels = ['enskede-årsta-vantör', 'kungsholmen', 'norrmalm', 'södermalm', 'östermalm']
classifier_model(df_extended, labels, GradientBoostingClassifier(random_state=42), '5y', add_features='y')

In [26]:
evaluate_model(df_extended, labels, 'model_GradientBoostingClassifier(random_state=42)_5y', add_features='y')

                      precision    recall  f1-score   support

enskede-årsta-vantör       0.79      0.61      0.69       838
         kungsholmen       0.92      0.67      0.78      1620
            norrmalm       0.97      0.55      0.70      1485
           södermalm       0.76      0.98      0.86      6368
           östermalm       1.00      0.42      0.59       990

            accuracy                           0.81     11301
           macro avg       0.89      0.65      0.72     11301
        weighted avg       0.84      0.81      0.79     11301



In [16]:
labels = labels = ['enskede-årsta-vantör', 'östermalm']
classifier_model(df_extended, labels, LogisticRegression(), '2y', add_features='y')

In [17]:
evaluate_model(df_extended, labels, 'model_LogisticRegression()_2y', add_features='y')

                      precision    recall  f1-score   support

enskede-årsta-vantör       0.76      0.77      0.77       801
           östermalm       0.81      0.79      0.80       950

            accuracy                           0.78      1751
           macro avg       0.78      0.78      0.78      1751
        weighted avg       0.79      0.78      0.78      1751



In [18]:
labels = labels = ['enskede-årsta-vantör', 'kungsholmen', 'norrmalm', 'södermalm', 'östermalm']
classifier_model(df_extended, labels, LogisticRegression(), '5y', add_features='y')

In [19]:
evaluate_model(df_extended, labels, 'model_LogisticRegression()_5y', add_features='y')

                      precision    recall  f1-score   support

enskede-årsta-vantör       0.56      0.33      0.41       838
         kungsholmen       0.48      0.28      0.36      1620
            norrmalm       0.56      0.29      0.38      1485
           södermalm       0.67      0.93      0.78      6368
           östermalm       0.58      0.13      0.21       990

            accuracy                           0.64     11301
           macro avg       0.57      0.39      0.43     11301
        weighted avg       0.61      0.64      0.59     11301



In [21]:
labels = labels = ['enskede-årsta-vantör', 'östermalm']
classifier_model(df_extended, labels, RandomForestClassifier(max_depth=2, random_state=42), '2y', add_features='y')

In [23]:
evaluate_model(df_extended, labels, 'model_RandomForestClassifier(max_depth=2, random_state=42)_2y', add_features='y')

                      precision    recall  f1-score   support

enskede-årsta-vantör       0.90      0.17      0.29       801
           östermalm       0.58      0.98      0.73       950

            accuracy                           0.61      1751
           macro avg       0.74      0.58      0.51      1751
        weighted avg       0.73      0.61      0.53      1751

