In [588]:
from pathlib import Path
import pandas as pd
import numpy as np
from joblib import load, dump
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, multilabel_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from sklearn.preprocessing import LabelEncoder
import plotly.graph_objects as go
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")

In [589]:
BASE_DIR = Path.cwd().parent

In [590]:
df_hosts_reviews_en_labeled = pd.read_parquet(BASE_DIR / 'processed_data' / 'hosts_reviews_en_topics.parquet')

In [591]:
df_hosts_reviews_en_labeled['neighbourhood'] = df_hosts_reviews_en_labeled['neighbourhood'].str.lower()
df = df_hosts_reviews_en_labeled[['comments', 'neighbourhood']]
remove_neighb = 'enskede|årsta|vantörs|östermalm|norrmalm|kungsholm|skarpnäck|södermalm|skärholmen|bromma|hägersten|farsta|älvsjö|hässelby|vällingby|rinkeby|tensta|spånga|'
df.loc[:, 'comments'] = df['comments'].str.replace(remove_neighb, '', regex=True)

In [592]:
df_extended = df_hosts_reviews_en_labeled[['comments', 'neighbourhood', 'price', 'number_of_reviews', 'reviews_per_month',
       'number_of_reviews_ltm', 'review_scores_rating']]

In [593]:
def split_data(df, labels, add_features='n'):
    df_model = df[df['neighbourhood'].isin(labels) == True]
    train_df, test_df = train_test_split(df_model, test_size=0.2, random_state=42)
    vectorizer = TfidfVectorizer(stop_words='english',
                            ngram_range=(1,2),
                            min_df=10,
                            max_df=0.9
                            )
    X_train = vectorizer.fit_transform(train_df['comments'])
    X_test = vectorizer.transform(test_df['comments'])
    if add_features == 'y':
        scaler = MinMaxScaler()
        train_price_scaled = scaler.fit_transform(train_df.drop(['comments', 'neighbourhood'], axis=1).values.reshape(-1, 5))
        test_price_scaled = scaler.transform(test_df.drop(['comments', 'neighbourhood'], axis=1).values.reshape(-1, 5))
    
        # Concatenate the features
        X_train = np.hstack((X_train.toarray(), train_price_scaled))
        X_test = np.hstack((X_test.toarray(), test_price_scaled))
    y_train = train_df['neighbourhood']
    y_test = test_df['neighbourhood']
    return X_train, X_test, y_train, y_test

In [594]:
def classifier_model(df, labels, classifier_, model_id, add_features='n'):
    X_train, X_test, y_train, y_test = split_data(df, labels, add_features)
    # Train the classifier
    classifier = classifier_
    classifier.fit(X_train, y_train)
    
    # save model
    dump(classifier, f'saved_models/model_{classifier_}_{model_id}.joblib')

In [595]:
def evaluate_model(df, labels, model_name, dd_features='n'):
    X_train, X_test, y_train, y_test = split_data(df, labels, dd_features)
    classifier = load(BASE_DIR / 'models/saved_models' / f'{model_name}.joblib')
   # Test the classifier and evaluate its accuracy
    accuracy = classifier.score(X_test, y_test)
    print("Accuracy:", accuracy)
    y_pred = classifier.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    
    fig = px.imshow(cm, 
                x=labels, y=labels, 
                text_auto=True,
                color_continuous_scale='Blues'
                )
    fig.update_layout(title='Confusion Matrix',
                    xaxis=dict(title='Predicted Label'),
                    yaxis=dict(title='True Label'))
    fig.show()
    # y_true and y_pred are arrays of shape (n_samples, n_classes)
    # where n_samples is the number of samples and n_classes is the number of classes
       
    precision = precision_score(y_test, y_pred, average='weighted').round(2)
    recall = recall_score(y_test, y_pred, average='weighted').round(2)
    f1 = f1_score(y_test, y_pred, average='weighted').round(2)

    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)
    print()
    cm_ = multilabel_confusion_matrix(y_test, y_pred)

    for i, label in enumerate(labels):
        tn, fp, fn, tp = cm_[i].ravel()
        print("Label:", label)
        print("True Negative:", tn)
        print("False Positive:", fp)
        print("False Negative:", fn)
        print("True Positive:", tp)

In [596]:
labels = ['enskede-årsta-vantör', 'kungsholmen', 'norrmalm', 'östermalm']
classifier_model(df, labels, MultinomialNB(), '4n')
evaluate_model(df, labels, 'model_MultinomialNB()_4n')

Accuracy: 0.48198757763975153


Precision: 0.52
Recall: 0.48
F1-score: 0.45

Label: enskede-årsta-vantör
True Negative: 3889
False Positive: 120
False Negative: 560
True Positive: 261
Label: kungsholmen
True Negative: 1879
False Positive: 1290
False Negative: 556
True Positive: 1105
Label: norrmalm
True Negative: 2351
False Positive: 1006
False Negative: 622
True Positive: 851
Label: östermalm
True Negative: 3869
False Positive: 86
False Negative: 764
True Positive: 111


In [597]:
labels = ['enskede-årsta-vantör', 'kungsholmen', 'norrmalm', 'södermalm', 'östermalm']
classifier_model(df, labels, MultinomialNB(), '5n')
evaluate_model(df, labels, 'model_MultinomialNB()_5n')

Accuracy: 0.5882665250862755


Precision: 0.67
Recall: 0.59
F1-score: 0.47

Label: enskede-årsta-vantör
True Negative: 10447
False Positive: 16
False Negative: 777
True Positive: 61
Label: kungsholmen
True Negative: 9600
False Positive: 81
False Negative: 1478
True Positive: 142
Label: norrmalm
True Negative: 9783
False Positive: 33
False Negative: 1369
True Positive: 116
Label: södermalm
True Negative: 410
False Positive: 4523
False Negative: 45
True Positive: 6323
Label: östermalm
True Negative: 10311
False Positive: 0
False Negative: 984
True Positive: 6


In [598]:
labels = ['enskede-årsta-vantör', 'kungsholmen', 'norrmalm', 'östermalm']
classifier_model(df_extended, labels, GradientBoostingClassifier(random_state=42), '4y', add_features='y')
evaluate_model(df_extended, labels, 'model_MultinomialNB()_4y')

In [None]:
labels = ['enskede-årsta-vantör', 'kungsholmen', 'norrmalm', 'södermalm', 'östermalm']
classifier_model(df_extended, labels, GradientBoostingClassifier(random_state=42), '5y', add_features='y')
evaluate_model(df_extended, labels, 'model_MultinomialNB()_5y')

labels = ['enskede-årsta-vantör', 'kungsholmen', 'norrmalm', 'södermalm', 'östermalm']
classifier_model(df_extended, labels, GradientBoostingClassifier(random_state=42), add_features='y')