In [175]:
from pathlib import Path
import pandas as pd
import numpy as np
from joblib import load, dump
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, multilabel_confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from sklearn.preprocessing import LabelEncoder
import plotly.graph_objects as go
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")

In [118]:
BASE_DIR = Path.cwd().parent

In [119]:
df_hosts_reviews_en_labeled = pd.read_parquet(BASE_DIR / 'processed_data' / 'hosts_reviews_en_topics.parquet')

In [120]:
df_hosts_reviews_en_labeled['neighbourhood'] = df_hosts_reviews_en_labeled['neighbourhood'].str.lower()
df = df_hosts_reviews_en_labeled[['comments', 'neighbourhood']]
remove_neighb = 'enskede|årsta|vantörs|östermalm|norrmalm|\
                    kungsholm|skarpnäck|södermalm|skärholmen|\
                    bromma|hägersten|liljeholmen|farsta|älvsjö|hässelby|\
                    vällingby|rinkeby|tensta|spånga|'
df.loc[:, 'comments'] = df['comments'].str.replace(remove_neighb, '', regex=True)

In [121]:
df_extended = df_hosts_reviews_en_labeled[['comments', 'neighbourhood', 
                                           'price', 'number_of_reviews', 'reviews_per_month',
                                          'number_of_reviews_ltm', 'room_type']]
df_extended = pd.get_dummies(df_extended, columns=['room_type'], drop_first=True)
cols = ['room_type_Hotel room', 'room_type_Private room', 'room_type_Shared room']
df_extended[cols] = df_extended[cols].astype(int)

In [122]:
def split_data(df, labels, add_features='n'):
    df_model = df[df['neighbourhood'].isin(labels) == True]
    y = df_model['neighbourhood']
    train_df, test_df = train_test_split(df_model, test_size=0.2, random_state=42, stratify=y)
    vectorizer = TfidfVectorizer(stop_words='english',
                            ngram_range=(1,2),
                            min_df=10,
                            max_df=0.9
                            )
    X_train = vectorizer.fit_transform(train_df['comments'])
    X_test = vectorizer.transform(test_df['comments'])

    if add_features == 'y':
        scaler = MinMaxScaler()
        train_price_scaled = scaler.fit_transform(train_df.drop(['comments', 'neighbourhood'], axis=1).values.reshape(-1, 7))
        test_price_scaled = scaler.transform(test_df.drop(['comments', 'neighbourhood'], axis=1).values.reshape(-1, 7))
    
        # Concatenate the features
        X_train = np.hstack((X_train.toarray(), train_price_scaled))
        X_test = np.hstack((X_test.toarray(), test_price_scaled))

    y_train = train_df['neighbourhood']
    y_test = test_df['neighbourhood']
    return X_train, X_test, y_train, y_test

In [123]:
def classifier_model(df, labels, classifier_, model_id, add_features='n'):
    X_train, X_test, y_train, y_test = split_data(df, labels, add_features)
 
    # Train the classifier
    classifier = classifier_
    classifier.fit(X_train, y_train)
  
    # save model
    dump(classifier, f'saved_models/model_{classifier_}_{model_id}.joblib')

In [124]:
def evaluate_model(df, labels, model_name, add_features='n'):
    X_train, X_test, y_train, y_test = split_data(df, labels, add_features)
    classifier = load(BASE_DIR / 'models/saved_models' / f'{model_name}.joblib')

    y_pred = classifier.predict(X_test)
    print(classification_report(y_test, y_pred, target_names=labels))

    cm = confusion_matrix(y_test, y_pred)
    fig = px.imshow(cm, 
                x=labels, y=labels, 
                text_auto=True,
                color_continuous_scale='Blues'
                )
    fig.update_layout(title='Confusion Matrix',
                    xaxis=dict(title='Predicted Label'),
                    yaxis=dict(title='True Label'))
    fig.show()

In [125]:
labels = ['enskede-årsta-vantör', 'östermalm']
classifier_model(df, labels, MultinomialNB(), '2n')
evaluate_model(df, labels, 'model_MultinomialNB()_2n')

                      precision    recall  f1-score   support

enskede-årsta-vantör       0.77      0.64      0.70       826
           östermalm       0.72      0.83      0.77       926

            accuracy                           0.74      1752
           macro avg       0.74      0.73      0.74      1752
        weighted avg       0.74      0.74      0.74      1752



In [170]:
labels = ['enskede-årsta-vantör', 'hägersten-liljeholmen', 'kungsholmen', 'norrmalm', 'skärholmen', 'södermalm', 'östermalm']
classifier_model(df, labels, MultinomialNB(), '7n')
evaluate_model(df, labels, 'model_MultinomialNB()_7n')

                       precision    recall  f1-score   support

 enskede-årsta-vantör       0.73      0.10      0.18       826
hägersten-liljeholmen       0.67      0.02      0.03       749
          kungsholmen       0.51      0.08      0.14      1587
             norrmalm       0.81      0.08      0.14      1492
           skärholmen       0.00      0.00      0.00       188
            södermalm       0.55      1.00      0.71      6472
            östermalm       1.00      0.00      0.01       926

             accuracy                           0.55     12240
            macro avg       0.61      0.18      0.17     12240
         weighted avg       0.62      0.55      0.42     12240



In [127]:
labels = ['enskede-årsta-vantör', 'östermalm']
classifier_model(df, labels, GradientBoostingClassifier(random_state=42), '2n')

In [128]:
evaluate_model(df, labels, 'model_GradientBoostingClassifier(random_state=42)_2n')

                      precision    recall  f1-score   support

enskede-årsta-vantör       0.69      0.67      0.68       826
           östermalm       0.71      0.73      0.72       926

            accuracy                           0.70      1752
           macro avg       0.70      0.70      0.70      1752
        weighted avg       0.70      0.70      0.70      1752



In [12]:
labels = ['enskede-årsta-vantör', 'östermalm']
classifier_model(df_extended, labels, GradientBoostingClassifier(random_state=42), '2y', add_features='y')

In [13]:
evaluate_model(df_extended, labels, 'model_GradientBoostingClassifier(random_state=42)_2y', add_features='y')

                      precision    recall  f1-score   support

enskede-årsta-vantör       0.92      0.93      0.93       827
           östermalm       0.94      0.93      0.94       924

            accuracy                           0.93      1751
           macro avg       0.93      0.93      0.93      1751
        weighted avg       0.93      0.93      0.93      1751



In [171]:
labels = ['enskede-årsta-vantör', 'hägersten-liljeholmen', 'kungsholmen', 'norrmalm', 'skärholmen', 'södermalm', 'östermalm']
classifier_model(df, labels, GradientBoostingClassifier(random_state=42), '7n')

In [172]:
evaluate_model(df, labels, 'model_GradientBoostingClassifier(random_state=42)_7n')

                       precision    recall  f1-score   support

 enskede-årsta-vantör       0.54      0.21      0.30       826
hägersten-liljeholmen       0.45      0.11      0.17       749
          kungsholmen       0.65      0.13      0.22      1587
             norrmalm       0.73      0.12      0.21      1492
           skärholmen       0.32      0.14      0.19       188
            södermalm       0.58      0.98      0.73      6472
            östermalm       0.80      0.08      0.15       926

             accuracy                           0.58     12240
            macro avg       0.58      0.25      0.28     12240
         weighted avg       0.61      0.58      0.48     12240



In [134]:
labels = ['enskede-årsta-vantör', 'östermalm']
classifier_model(df, labels, LogisticRegression(), '2y')

In [135]:
evaluate_model(df, labels, 'model_LogisticRegression()_2y')

                      precision    recall  f1-score   support

enskede-årsta-vantör       0.73      0.69      0.71       826
           östermalm       0.74      0.78      0.76       926

            accuracy                           0.74      1752
           macro avg       0.74      0.73      0.73      1752
        weighted avg       0.74      0.74      0.74      1752



In [173]:
labels = ['enskede-årsta-vantör', 'hägersten-liljeholmen', 'kungsholmen', 'norrmalm', 'skärholmen', 'södermalm', 'östermalm']
classifier_model(df, labels, LogisticRegression(), '7n')

In [174]:
evaluate_model(df, labels, 'model_LogisticRegression()_7n')

                       precision    recall  f1-score   support

 enskede-årsta-vantör       0.47      0.27      0.34       826
hägersten-liljeholmen       0.33      0.13      0.19       749
          kungsholmen       0.46      0.24      0.31      1587
             norrmalm       0.49      0.17      0.26      1492
           skärholmen       0.95      0.10      0.17       188
            södermalm       0.61      0.94      0.74      6472
            östermalm       0.46      0.07      0.13       926

             accuracy                           0.58     12240
            macro avg       0.54      0.28      0.31     12240
         weighted avg       0.55      0.58      0.51     12240



In [160]:
labels = ['enskede-årsta-vantör', 'hägersten-liljeholmen', 'kungsholmen', 'norrmalm', 'södermalm', 'östermalm']

In [161]:
classifier = load(BASE_DIR / 'models/saved_models' / 'model_GradientBoostingClassifier(random_state=42)_6n.joblib')

In [166]:
df_model = df[df['neighbourhood'].isin(labels) == True]
y = df_model['neighbourhood']
train_df, test_df = train_test_split(df_model, test_size=0.2, random_state=42, stratify=y)
vectorizer = TfidfVectorizer(stop_words='english',
                            ngram_range=(1,2),
                            min_df=10,
                            max_df=0.9
                            )
X_train = vectorizer.fit_transform(train_df['comments'])
X_test = vectorizer.transform(test_df['comments'])

In [167]:
column_names_from_text_features = vectorizer.vocabulary_
rev_dictionary = {v:k for k,v in vectorizer.vocabulary_.items()}
column_names_from_text_features = [v for k,v in rev_dictionary.items()]

In [85]:
#all_columns = column_names_from_text_features + test_df.drop(['comments', 'neighbourhood'], axis=1).columns.tolist()

In [168]:
importances = pd.DataFrame({
    'Feature': column_names_from_text_features,
    'GBR': classifier.feature_importances_
})
importances = importances.sort_values('GBR', ascending=True)

In [169]:
fig = px.bar(importances.nlargest(20, 'GBR'), x='GBR', y='Feature')
fig.update_xaxes(title='F-score')
fig.update_yaxes(tickmode='linear')

In [176]:
labels = ['enskede-årsta-vantör', 'hägersten-liljeholmen', 'kungsholmen', 'norrmalm', 'skärholmen', 'södermalm', 'östermalm']

In [177]:
df_model_1 = df[df['neighbourhood'].isin(labels) == True]
y = df_model_1['neighbourhood']
train_df, test_df = train_test_split(df_model_1, test_size=0.2, random_state=42, stratify=y)
vectorizer = TfidfVectorizer(stop_words='english',
                            ngram_range=(1,2),
                            min_df=10,
                            max_df=0.9
                            )
X_train = vectorizer.fit_transform(train_df['comments'])
X_test = vectorizer.transform(test_df['comments'])
y_train = train_df['neighbourhood']
y_test = test_df['neighbourhood']

In [180]:
cl = GradientBoostingClassifier(random_state=42)
cl.fit(X_train, y_train)
parameters = {
            'n_estimators' : [100 ,300, 500],
            "max_depth": [1, 3, 5],
            "max_features": [1.0, 'sqrt']
            }
gbr_search =GridSearchCV(
        estimator=classifier,
        param_grid=parameters,
        cv=5,
        n_jobs=6)
        
gbr_search.fit(X_train, y_train)
gbr_search.best_params_



In [116]:
pred = cl.predict(X_test)

In [117]:
print(classification_report(y_test, pred, target_names=labels))

                       precision    recall  f1-score   support

 enskede-årsta-vantör       0.87      0.68      0.77       827
hägersten-liljeholmen       0.89      0.62      0.73       749
          kungsholmen       0.94      0.74      0.83      1588
             norrmalm       0.97      0.66      0.79      1491
            södermalm       0.80      0.99      0.89      6471
            östermalm       0.97      0.64      0.77       924

             accuracy                           0.85     12050
            macro avg       0.91      0.72      0.80     12050
         weighted avg       0.86      0.85      0.84     12050

