In [1]:
import json

import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
class CategoriesExtractor(BaseEstimator, TransformerMixin):
    """Extract the general and precise category from a JSON string.

    Each value of the ``category`` column is parsed as a JSON object whose
    ``slug`` entry is split on ``"/"`` into ``[general, precise]``.

    By default it will only keep the hardcoded categories defined below
    to avoid having too many dummies; every other value becomes ``misc``.
    """

    # Replacement label for categories outside the lists below
    misc = "misc"
    # General (first slug segment) categories that are kept as-is
    gen_cats = ["music", "film & video", "publishing", "art", "games"]
    # Precise (second slug segment) categories that are kept as-is
    precise_cats = [
        "rock", "fiction", "webseries", "indie rock", "children's books",
        "shorts", "documentary", "video games"
    ]

    @classmethod
    def _get_slug(cls, x, validate=True):
        """Return the slug segments of one JSON-encoded category.

        Parameters
        ----------
        x : str
            JSON document describing the category.
        validate : bool, default True
            When true, the first two segments are replaced by ``cls.misc``
            unless they appear in ``gen_cats`` / ``precise_cats``.

        Returns
        -------
        list of str
            At least two elements: ``[general, precise, ...]``.
        """
        # A missing, null or empty slug is treated like "/" (two empty parts)
        slug = json.loads(x).get("slug") or "/"
        categories = slug.split("/")

        # A slug without "/" (e.g. "music") has no precise part; pad it so
        # that indexing position 1 never raises IndexError.
        categories += [""] * (2 - len(categories))

        # Validate categories to keep only
        # the most common ones
        if validate:
            if categories[0] not in cls.gen_cats:
                categories[0] = cls.misc
            if categories[1] not in cls.precise_cats:
                categories[1] = cls.misc

        return categories

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer unchanged."""
        return self

    def transform(self, X):
        """Build the ``gen_cat`` and ``precise_cat`` columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with a ``category`` column of JSON strings.

        Returns
        -------
        pandas.DataFrame
            Two columns, ``gen_cat`` and ``precise_cat``, indexed like ``X``.
        """
        categories = X["category"]
        # Parse every JSON string only once and reuse both segments
        slugs = [self._get_slug(x) for x in categories]
        return pd.DataFrame(
            {
                "gen_cat": [slug[0] for slug in slugs],
                "precise_cat": [slug[1] for slug in slugs],
            },
            index=categories.index,
        )


In [3]:
class GoalAdjustor(BaseEstimator, TransformerMixin):
    """Adjusts the goal feature to USD.

    The transformer is stateless: the result is the row-wise product of the
    ``goal`` and ``static_usd_rate`` columns.
    """

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer unchanged."""
        return self

    def transform(self, X):
        """Return a one-column frame ``adjusted_goal`` indexed like ``X``."""
        goal_in_usd = X["goal"] * X["static_usd_rate"]
        return pd.DataFrame({"adjusted_goal": goal_in_usd})


In [4]:
class TimeTransformer(BaseEstimator, TransformerMixin):
    """Builds duration features (whole days) computed from timestamps."""

    # Scale applied to the raw timestamps before parsing them.
    # NOTE(review): presumably the inputs are epoch seconds, so the product
    # is in nanoseconds (the default unit of pd.to_datetime) - confirm.
    adj = 1_000_000_000

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer unchanged."""
        return self

    def transform(self, X):
        """Compute the elapsed days between the campaign milestones.

        Returns
        -------
        pandas.DataFrame
            ``launched_to_deadline`` and ``created_to_launched``, each the
            ``.dt.days`` component of the corresponding time difference.
        """
        stamps = {
            column: pd.to_datetime(X[column] * self.adj)
            for column in ("deadline", "created_at", "launched_at")
        }

        campaign_span = stamps["deadline"] - stamps["launched_at"]
        preparation_span = stamps["launched_at"] - stamps["created_at"]

        features = {
            "launched_to_deadline": campaign_span.dt.days,
            "created_to_launched": preparation_span.dt.days,
        }
        return pd.DataFrame(features)


In [5]:
class CountryTransformer(BaseEstimator, TransformerMixin):
    """Transform countries into larger groups to avoid having
    too many dummies.

    Country codes that are not listed in ``countries`` (including missing
    values) are assigned to the ``other`` group instead of becoming NaN.
    """

    # Group used for every country code absent from the table below
    other = 'Other'

    # Country code -> group label
    countries = {
        'US': 'US',
        'CA': 'Canada',
        'GB': 'UK & Ireland',
        'AU': 'Oceania',
        'IE': 'UK & Ireland',
        'SE': 'Europe',
        'CH': "Europe",
        'IT': 'Europe',
        'FR': 'Europe',
        'NZ': 'Oceania',
        'DE': 'Europe',
        'NL': 'Europe',
        'NO': 'Europe',
        'MX': 'Other',
        'ES': 'Europe',
        'DK': 'Europe',
        'BE': 'Europe',
        'AT': 'Europe',
        'HK': 'Other',
        'SG': 'Other',
        'LU': 'Europe'
    }

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer unchanged."""
        return self

    def transform(self, X):
        """Replace each country code by its group label.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with a ``country`` column of country codes.

        Returns
        -------
        pandas.DataFrame
            Single ``country`` column, indexed like ``X``, holding the
            group label of every row.
        """
        codes = X["country"]
        # dict.get with a default keeps unseen codes out of the NaN bucket,
        # which the downstream one-hot encoder would otherwise receive.
        groups = [self.countries.get(code, self.other) for code in codes]
        return pd.DataFrame({"country": groups}, index=codes.index)


In [6]:
class KickstarterModel:
    """Logistic regression bundled with its feature preprocessing.

    Intended call order: ``preprocess_training_data`` then ``fit`` on the
    training frame, and ``preprocess_unseen_data`` then ``predict`` on new
    frames. ``model`` and ``preprocessor`` stay ``None`` until the
    corresponding training step has run.
    """

    # Update parameters here after re-tuning the model
    params = {"penalty": "l1", "C": 1.7, "solver": "liblinear"}

    def __init__(self):
        self.model = None
        self.preprocessor = None

    @staticmethod
    def _make_one_hot():
        """Build a dense one-hot encoder that ignores unknown categories.

        scikit-learn 1.2 renamed the ``sparse`` argument to
        ``sparse_output`` and 1.4 removed the old name, so the new spelling
        is tried first and the old one is the fallback for older releases.
        """
        try:
            return OneHotEncoder(sparse_output=False, handle_unknown="ignore")
        except TypeError:
            return OneHotEncoder(sparse=False, handle_unknown="ignore")

    def preprocess_training_data(self, df):
        """Fit the preprocessing pipeline and transform the training frame.

        Parameters
        ----------
        df : pandas.DataFrame
            Training data including the ``state`` target column.

        Returns
        -------
        tuple
            ``(X_train, y_train)``: the scaled feature matrix and the target
            where ``"failed"`` is 0 and ``"successful"`` is 1.
        """
        # Processor for categories with one-hot encoding
        cat_processor = Pipeline([("extractor", CategoriesExtractor()),
                                  ("one_hot", self._make_one_hot())])

        # Processor for countries with one-hot encoding.
        # The step name spelling is kept because it is part of the
        # pipeline's parameter keys.
        country_processor = Pipeline([("transfomer", CountryTransformer()),
                                      ("one_hot", self._make_one_hot())])

        # First level of column specific transformations
        col_transformer = ColumnTransformer([
            ("goal", GoalAdjustor(), ["goal", "static_usd_rate"]),
            ("categories", cat_processor, ["category"]),
            ("disable_communication", "passthrough", ["disable_communication"]),
            ("time", TimeTransformer(),
             ["deadline", "created_at", "launched_at"]),
            ("countries", country_processor, ["country"])
        ])

        # Add a scaling stage
        self.preprocessor = Pipeline([("col_transformer", col_transformer),
                                      ("scaler", StandardScaler())])

        # Return X_train and y_train
        X_train = self.preprocessor.fit_transform(df.drop("state", axis=1))
        y_train = df.state.map({"failed": 0, "successful": 1})

        return X_train, y_train

    def fit(self, X, y):
        """Train a fresh ``LogisticRegression`` built from ``params``."""
        self.model = LogisticRegression(**self.params)
        self.model.fit(X, y)

    def preprocess_unseen_data(self, df):
        """Apply the already fitted preprocessor to a new frame."""
        X_test = self.preprocessor.transform(df)
        return X_test

    def predict(self, X):
        """Return the fitted model's predictions for ``X``."""
        return self.model.predict(X)
