In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from joblib import load, dump
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, multilabel_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from sklearn.preprocessing import LabelEncoder
import plotly.graph_objects as go
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")



In [2]:
BASE_DIR = Path.cwd().parent

In [3]:
df_hosts_reviews_en_labeled = pd.read_parquet(BASE_DIR / 'processed_data' / 'hosts_reviews_en_topics.parquet')

In [4]:
df_hosts_reviews_en_labeled['neighbourhood'] = df_hosts_reviews_en_labeled['neighbourhood'].str.lower()
df = df_hosts_reviews_en_labeled[['comments', 'neighbourhood']]
remove_neighb = 'enskede|årsta|vantörs|östermalm|norrmalm|\
                    kungsholm|skarpnäck|södermalm|skärholmen|\
                    bromma|hägersten|liljeholmen|farsta|älvsjö|hässelby|\
                    vällingby|rinkeby|tensta|spånga|'
df.loc[:, 'comments'] = df['comments'].str.replace(remove_neighb, '', regex=True)

In [5]:
df_extended = df_hosts_reviews_en_labeled[['comments', 'neighbourhood', 
                                           'price', 'number_of_reviews', 'reviews_per_month',
                                          'number_of_reviews_ltm', 'room_type']]
df_extended = pd.get_dummies(df_extended, columns=['room_type'], drop_first=True)
cols = ['room_type_Hotel room', 'room_type_Private room', 'room_type_Shared room']
df_extended[cols] = df_extended[cols].astype(int)

In [6]:
def split_data(df, labels, add_features='n'):
    df_model = df[df['neighbourhood'].isin(labels) == True]
    y = df_model['neighbourhood']
    train_df, test_df = train_test_split(df_model, test_size=0.2, random_state=42, stratify=y)
    vectorizer = TfidfVectorizer(stop_words='english',
                            ngram_range=(1,2),
                            min_df=10,
                            max_df=0.9
                            )
    X_train = vectorizer.fit_transform(train_df['comments'])
    X_test = vectorizer.transform(test_df['comments'])

    if add_features == 'y':
        scaler = MinMaxScaler()
        train_price_scaled = scaler.fit_transform(train_df.drop(['comments', 'neighbourhood'], axis=1).values.reshape(-1, 7))
        test_price_scaled = scaler.transform(test_df.drop(['comments', 'neighbourhood'], axis=1).values.reshape(-1, 7))
    
        # Concatenate the features
        X_train = np.hstack((X_train.toarray(), train_price_scaled))
        X_test = np.hstack((X_test.toarray(), test_price_scaled))

    y_train = train_df['neighbourhood']
    y_test = test_df['neighbourhood']
    return X_train, X_test, y_train, y_test

In [7]:
def classifier_model(df, labels, classifier_, model_id, add_features='n'):
    X_train, X_test, y_train, y_test = split_data(df, labels, add_features)
 
    # Train the classifier
    classifier = classifier_
    classifier.fit(X_train, y_train)
    
    # save model
    dump(classifier, f'saved_models/model_{classifier_}_{model_id}.joblib')

In [8]:
def evaluate_model(df, labels, model_name, add_features='n'):
    X_train, X_test, y_train, y_test = split_data(df, labels, add_features)
    classifier = load(BASE_DIR / 'models/saved_models' / f'{model_name}.joblib')

    y_pred = classifier.predict(X_test)
    print(classification_report(y_test, y_pred, target_names=labels))

    cm = confusion_matrix(y_test, y_pred)
    fig = px.imshow(cm, 
                x=labels, y=labels, 
                text_auto=True,
                color_continuous_scale='Blues'
                )
    fig.update_layout(title='Confusion Matrix',
                    xaxis=dict(title='Predicted Label'),
                    yaxis=dict(title='True Label'))
    fig.show()

In [9]:
labels = ['enskede-årsta-vantör', 'östermalm']
classifier_model(df, labels, MultinomialNB(), '2n')
evaluate_model(df, labels, 'model_MultinomialNB()_2n')

                      precision    recall  f1-score   support

enskede-årsta-vantör       0.75      0.64      0.69       827
           östermalm       0.72      0.81      0.76       924

            accuracy                           0.73      1751
           macro avg       0.73      0.73      0.73      1751
        weighted avg       0.73      0.73      0.73      1751



In [10]:
labels = ['enskede-årsta-vantör', 'hägersten-liljeholmen', 'kungsholmen', 'norrmalm', 'södermalm', 'östermalm']
classifier_model(df_extended, labels, MultinomialNB(), '6y', add_features='y')
evaluate_model(df_extended, labels, 'model_MultinomialNB()_6y', add_features='y')

                       precision    recall  f1-score   support

 enskede-årsta-vantör       0.68      0.12      0.20       827
hägersten-liljeholmen       0.77      0.03      0.05       749
          kungsholmen       0.46      0.13      0.20      1588
             norrmalm       0.78      0.21      0.33      1491
            södermalm       0.58      0.99      0.73      6471
            östermalm       1.00      0.01      0.02       924

             accuracy                           0.58     12050
            macro avg       0.71      0.25      0.26     12050
         weighted avg       0.64      0.58      0.48     12050



In [12]:
labels = ['enskede-årsta-vantör', 'östermalm']
classifier_model(df_extended, labels, GradientBoostingClassifier(random_state=42), '2y', add_features='y')

In [13]:
evaluate_model(df_extended, labels, 'model_GradientBoostingClassifier(random_state=42)_2y', add_features='y')

                      precision    recall  f1-score   support

enskede-årsta-vantör       0.92      0.93      0.93       827
           östermalm       0.94      0.93      0.94       924

            accuracy                           0.93      1751
           macro avg       0.93      0.93      0.93      1751
        weighted avg       0.93      0.93      0.93      1751



In [20]:
labels = ['enskede-årsta-vantör', 'hägersten-liljeholmen', 'kungsholmen', 'norrmalm', 'södermalm', 'östermalm']
classifier_model(df_extended, labels, GradientBoostingClassifier(random_state=42), '6y', add_features='y')

                      precision    recall  f1-score   support

enskede-årsta-vantör       0.79      0.61      0.69       838
         kungsholmen       0.92      0.67      0.78      1620
            norrmalm       0.97      0.55      0.70      1485
           södermalm       0.76      0.98      0.86      6368
           östermalm       1.00      0.42      0.59       990

            accuracy                           0.81     11301
           macro avg       0.89      0.65      0.72     11301
        weighted avg       0.84      0.81      0.79     11301

In [14]:
evaluate_model(df_extended, labels, 'model_GradientBoostingClassifier(random_state=42)_6y', add_features='y')

ValueError: X has 2848 features, but GradientBoostingClassifier is expecting 14743 features as input.

In [15]:
labels = labels = ['enskede-årsta-vantör', 'östermalm']
classifier_model(df_extended, labels, LogisticRegression(), '2y', add_features='y')

In [16]:
evaluate_model(df_extended, labels, 'model_LogisticRegression()_2y', add_features='y')

                      precision    recall  f1-score   support

enskede-årsta-vantör       0.76      0.75      0.75       827
           östermalm       0.78      0.79      0.78       924

            accuracy                           0.77      1751
           macro avg       0.77      0.77      0.77      1751
        weighted avg       0.77      0.77      0.77      1751



In [17]:
labels = labels = ['enskede-årsta-vantör', 'hägersten-liljeholmen', 'kungsholmen', 'norrmalm', 'södermalm', 'östermalm']
classifier_model(df_extended, labels, LogisticRegression(), '6y', add_features='y')

In [18]:
evaluate_model(df_extended, labels, 'model_LogisticRegression()_6y', add_features='y')

                       precision    recall  f1-score   support

 enskede-årsta-vantör       0.48      0.30      0.37       827
hägersten-liljeholmen       0.39      0.23      0.29       749
          kungsholmen       0.46      0.29      0.35      1588
             norrmalm       0.59      0.28      0.38      1491
            södermalm       0.65      0.92      0.76      6471
            östermalm       0.57      0.13      0.21       924

             accuracy                           0.61     12050
            macro avg       0.52      0.36      0.39     12050
         weighted avg       0.58      0.61      0.56     12050

