In [44]:
import pandas as pd
import re
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [3]:
# Load the three JSON Lines datasets (one JSON record per line) into
# DataFrames, then print the users table as a quick sanity check.
users_df, sessions_df, products_df = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (
        "../data/users.jsonl",
        "../data/sessions.jsonl",
        "../data/products.jsonl",
    )
)
print(users_df)

     user_id             name      city                 street
0        102  Monika Forysiak    Poznań      plac Dębowa 11/53
1        103  Kacper Malewicz   Wrocław   aleja Browarna 79/72
2        104    Tomasz Janiuk    Kraków  ulica Cegielniana 318
3        105    Roksana Mućka     Radom        plac Perłowa 48
4        106     Wiktor Jarka  Warszawa            al. Bema 37
..       ...              ...       ...                    ...
195      297    Błażej Pachla    Kraków      ulica Lisia 09/00
196      298     Cezary Jonak    Gdynia    aleja Kołłątaja 110
197      299     Sylwia Karol   Wrocław       al. Podleśna 999
198      300      Bruno Cisoń     Radom   ulica Malinowa 64/08
199      301        Tola Osik  Szczecin    plac Tulipanowa 386

[200 rows x 4 columns]


In [45]:
# Derive a "sex" column from the name: the first space-separated token of
# "name" is taken as the first name, and a first name ending in "a" is
# labelled "female"; everything else is labelled "male".
# NOTE(review): this is purely a spelling heuristic; any male first name that
# happens to end in "a" would be mislabelled - confirm it is good enough.
# Vectorized instead of an iterrows() loop: one pass, no per-row .loc writes.
# na=False makes missing or empty first names fall into the "male" branch
# instead of raising.
first_names = users_df["name"].str.split(" ").str[0]
users_df["sex"] = np.where(first_names.str.endswith("a", na=False), "female", "male")

In [46]:
# Print the users table again to inspect the "sex" column derived from names.
print(users_df)

     user_id             name      city                 street     sex
0        102  Monika Forysiak    Poznań      plac Dębowa 11/53  female
1        103  Kacper Malewicz   Wrocław   aleja Browarna 79/72    male
2        104    Tomasz Janiuk    Kraków  ulica Cegielniana 318    male
3        105    Roksana Mućka     Radom        plac Perłowa 48  female
4        106     Wiktor Jarka  Warszawa            al. Bema 37    male
..       ...              ...       ...                    ...     ...
195      297    Błażej Pachla    Kraków      ulica Lisia 09/00    male
196      298     Cezary Jonak    Gdynia    aleja Kołłątaja 110    male
197      299     Sylwia Karol   Wrocław       al. Podleśna 999  female
198      300      Bruno Cisoń     Radom   ulica Malinowa 64/08    male
199      301        Tola Osik  Szczecin    plac Tulipanowa 386  female

[200 rows x 5 columns]


In [18]:
def extract_color(products_df):
    """Add a ``color`` column parsed from ``optional_attributes``.

    Each ``optional_attributes`` value is converted with ``str()`` and searched
    for the fragment ``'color': '<word>'``; the matched word becomes that row's
    color. Rows without such a fragment get ``None``.

    Args:
        products_df: DataFrame with an ``optional_attributes`` column. It is
            modified in place (a ``color`` column is added or overwritten).

    Returns:
        The same ``products_df`` object, now carrying the ``color`` column.

    Raises:
        KeyError: if ``products_df`` has no ``optional_attributes`` column.
    """
    # Raw string: "\w" inside a normal string literal is an invalid escape
    # sequence. The capture group replaces the former magic slice [10:-1].
    color_pattern = re.compile(r"'color': '(\w+)'")

    def _color_of(attributes):
        """Return the color word found in one attributes value, or None."""
        found = color_pattern.search(str(attributes))
        return found.group(1) if found is not None else None

    # Build the whole column in one assignment instead of per-row .loc writes;
    # this also creates the column when the frame has no rows.
    products_df["color"] = [
        _color_of(attributes) for attributes in products_df["optional_attributes"]
    ]
    return products_df

In [32]:
def encode_and_bind(original_dataframe, feature_to_encode):
    """One-hot encode one column and swap it for its indicator columns.

    The indicator columns are named after the bare category values (no prefix)
    and are appended to the right of the existing columns; the encoded source
    column is then removed. The input frame itself is not modified.

    Args:
        original_dataframe: DataFrame containing ``feature_to_encode``.
        feature_to_encode: Name of the categorical column to expand.

    Returns:
        A new DataFrame with the indicator columns in place of the original
        column.
    """
    indicator_columns = pd.get_dummies(
        original_dataframe[[feature_to_encode]],
        prefix="",
        prefix_sep="",
    )
    # Concatenate first, drop afterwards - same order as before, so any column
    # carrying the encoded feature's name is removed from the combined frame.
    return pd.concat([original_dataframe, indicator_columns], axis=1).drop(
        columns=[feature_to_encode]
    )

In [33]:
def normalize(df, columns_to_norm):
    """Min-max scale the selected columns to the range [0, 1].

    Each selected column is transformed as ``(x - min) / (max - min)``.
    A column whose values are all equal (zero range) is mapped to ``0.0``
    instead of producing NaN from a 0/0 division. Names in
    ``columns_to_norm`` that are not columns of ``df`` are ignored, and all
    other columns are returned unchanged.

    Args:
        df: Input DataFrame; it is not modified.
        columns_to_norm: Column name, or collection of column names, to scale.

    Returns:
        A copy of ``df`` with the selected columns scaled.
    """
    # A bare string would otherwise be matched by substring ("pri" in "price").
    if isinstance(columns_to_norm, str):
        columns_to_norm = [columns_to_norm]

    result = df.copy()
    for feature_name in df.columns:
        if feature_name not in columns_to_norm:
            continue
        min_value = df[feature_name].min()
        value_range = df[feature_name].max() - min_value
        shifted = df[feature_name] - min_value
        if value_range == 0:
            # Constant column: every shifted value is already 0, so skip the
            # division (missing values stay missing).
            result[feature_name] = shifted.astype(float)
        else:
            result[feature_name] = shifted / value_range
    return result

In [52]:
# TODO: monitor prices still need to be verified.
# Build the feature table for model A: drop the columns that are not used as
# features, one-hot encode the category path and min-max scale the numeric
# columns. drop() already returns a new frame, so products_df stays untouched.
products_model_A = products_df.drop(
    columns=["product_name", "brand", "optional_attributes", "weight_kg"]
)
products_model_A = encode_and_bind(products_model_A, 'category_path')
products_model_A = normalize(products_model_A, ['price', 'user_rating_count'])

# NOTE(review): the target is the first column, which appears to be product_id
# (the report lists one class per id, each with support <= 1). With about one
# sample per class the classifier cannot score above 0 - confirm the intended
# target variable.
y = products_model_A.iloc[:, 0].values
X = products_model_A.drop(columns=["product_id"]).values

# Fixed random_state so the train/test split, and therefore the reported
# metrics, are reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

        1002       0.00      0.00      0.00       0.0
        1003       0.00      0.00      0.00       1.0
        1006       0.00      0.00      0.00       0.0
        1007       0.00      0.00      0.00       0.0
        1008       0.00      0.00      0.00       0.0
        1009       0.00      0.00      0.00       1.0
        1010       0.00      0.00      0.00       1.0
        1011       0.00      0.00      0.00       0.0
        1012       0.00      0.00      0.00       1.0
        1014       0.00      0.00      0.00       0.0
        1015       0.00      0.00      0.00       0.0
        1017       0.00      0.00      0.00       1.0
        1018       0.00      0.00      0.00       0.0
        1019       0.00      0.00      0.00       0.0
        1024       0.00      0.00      0.00       0.0
        1027       0.00      0.00      0.00       0.0
        1032       0.00      0.00      0.00       0.0
        1033       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
