In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from time import time
from typing import List
from sklearn.model_selection import train_test_split
import pandas as pd
from typing import List

In [2]:
def target_encode_mean(df: pd.DataFrame, column: str, min_category_size: int, target_column: str) -> (pd.Series, pd.Series):
    """
    Converts a categorical variable to a target encoded variable based on the means of the target variable.
    """
    # NaN cannot be used as a key so we have to convert NaNs to string
    feature_orig = df[column].replace(np.nan, 'NaN')
    counts_distinct = feature_orig.value_counts()
    counts = feature_orig.map(counts_distinct)
    mean_mapping = df.groupby(column).mean()[target_column]
    feature_mapped = feature_orig.map(mean_mapping)
    feature_mapped[counts < min_category_size] = df[target_column].mean()
    return feature_mapped, mean_mapping


def build_features(df: pd.DataFrame, categorical_features: list, target_column: str):
    print('Starting feature engineering.')
    start_time = time()
    # Category must include at least this many samples. If not, then it's encoded to global mean of the target variable.
    min_category_size = 100
    mean_mappings = {}
    #     In our specific dataset, all of the columns are categorical
    features = categorical_features
    X = pd.DataFrame(columns=features)
    for feature in categorical_features:
        X[feature], mean_mappings[feature] = target_encode_mean(df, feature, min_category_size, target_column)
        
    y = df[target_column]

    print(f'Features and target split. Took {time()-start_time:.2f} s.')
    return X, y, mean_mappings

def map_features(df: pd.DataFrame, feature_columns: List[str], mappings: List[pd.Series], target_column: str):
    X = pd.DataFrame(columns=mappings.keys())
    for column in mappings.keys():
        X[column] = df[column].map(mappings[column])
    y = df[target_column]
    return X, y

In [3]:
data = pd.read_csv('data/train.csv')
submission_test_data = pd.read_csv('data/test.csv')

In [14]:
X_features = ["action_recommendation_id", 
              "action_recommendation_type", "action_recommendation_category", 
              "equipment_area", "usage_type", "speed_category",
             "load_category", "floors_category", "equipment_category"]

knn_data = data[data.feedback == 1]
knn_data = data[X_features]

In [21]:
knn_data.speed_category = knn_data.speed_category.astype(str)
knn_data.load_category = knn_data.load_category.astype(str)
knn_data.floors_category = knn_data.floors_category.astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [8]:
y_target = "action_recommendation_type"

train_data, test_data = train_test_split(data, test_size=0.2)
X_train, y_train, feature_mappings_binary = build_features(train_data, X_features, y_target)
# X_test, y_test = map_features(test_data, categorical_features_binary, feature_mappings_binary, 'feedback')

Starting feature engineering.


KeyError: 'action_recommendation_type'

In [27]:
sparse_knn_np = pd.get_dummies(knn_data).values

In [28]:
from scipy import sparse
sparse_knn = sparse.csr_matrix(sparse_knn_np)

In [30]:
from sklearn.neighbors import KNeighborsClassifier

neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(sparse_knn, None)

ValueError: This KNeighborsClassifier estimator requires y to be passed, but the target y is None.