In [209]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline

In [210]:
# Read the raw movie table from disk.
movies = pd.read_csv("mymoviedb.csv", lineterminator="\n")

# Remove films from the future (rows whose vote count is zero), then discard
# the columns that are not used as model inputs. New frames are produced
# instead of mutating in place so the cell can be re-run safely.
movies = (
    movies
    .drop(index=movies.index[movies["Vote_Count"] == 0])
    .drop(columns=["Overview", "Popularity", "Vote_Count", "Poster_Url"])
)

# Separate the regression target from the feature columns.
y = movies["Vote_Average"]
X = movies.drop(columns="Vote_Average")

In [211]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows as a test set; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
)

In [212]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks a configured set of columns out of its input.

    Parameters
    ----------
    attribute_names
        Column label(s) used to index the input inside ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured column label(s)."""
        wanted = self.attribute_names
        selected = X[wanted]
        return selected

In [213]:
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in every column with that column's most frequent value.

    After ``fit`` the learned values are available as ``most_frequent_``, a
    Series indexed by column name.
    """

    def fit(self, X, y=None):
        """Record the most frequent non-null value of each column of ``X``.

        A column without any non-null value has no most frequent value; it is
        recorded as NaN (instead of raising ``IndexError``) and is left
        untouched by ``transform``.

        Returns
        -------
        self
        """
        modes = []
        for column in X.columns:
            counts = X[column].value_counts()  # sorted by descending frequency
            modes.append(counts.index[0] if len(counts) else np.nan)
        self.most_frequent_ = pd.Series(modes, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing values replaced per column."""
        # Drop NaN placeholders so columns without a learned value are skipped.
        fill_values = self.most_frequent_.dropna()
        return X.fillna(fill_values)


In [214]:
from sklearn.preprocessing import OneHotEncoder

import inspect

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Use whichever keyword this installation
# accepts so the encoder returns a dense array on old and new versions alike.
_dense_output_kw = (
    "sparse_output"
    if "sparse_output" in inspect.signature(OneHotEncoder).parameters
    else "sparse"
)

# Original language -> one-hot columns; categories unseen during fit are
# encoded as all zeros (handle_unknown='ignore').
simple_cat_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["Original_Language"])),
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", OneHotEncoder(handle_unknown='ignore', **{_dense_output_kw: False})),
    ])

# Fit on the training frame and display the encoded matrix.
simple_cat_pipeline.fit_transform(X_train)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [215]:
from datetime import datetime

class DateEncoder(BaseEstimator, TransformerMixin):
    """Turn ``YYYY-MM-DD`` date strings into numeric year and month features."""

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Encode every column of ``X`` as a (year, month) pair of columns.

        Each value must be a string in ``%Y-%m-%d`` format; anything else makes
        ``datetime.strptime`` raise.

        Returns
        -------
        numpy.ndarray
            One row per input row and two columns (year, month) per input
            column, in input-column order.
        """
        cols = []
        for col_name in X.columns:
            # Parse each string once and reuse the result for both features.
            parsed = X[col_name].map(lambda text: datetime.strptime(text, "%Y-%m-%d"))
            cols.append(parsed.map(lambda moment: moment.year))
            cols.append(parsed.map(lambda moment: moment.month))
        # The stacked feature Series form rows; transpose so they become columns.
        return np.c_[cols].T


In [216]:
# Release date -> imputed -> year and month features.
date_pipeline = Pipeline(steps=[
    ("select_date", DataFrameSelector(["Release_Date"])),
    ("imputer", MostFrequentImputer()),
    ("date_encoder", DateEncoder()),
])

# Fit on the training frame and display the encoded matrix.
date_pipeline.fit_transform(X_train)

array([[2015,    1],
       [2005,    9],
       [1992,    4],
       ...,
       [1983,    7],
       [2015,    7],
       [2007,    3]], dtype=int64)

In [217]:
class GenreEncoder(BaseEstimator, TransformerMixin):
    """Multi-hot encode columns holding delimiter-separated genre lists.

    After ``fit``, ``genres`` maps each column name to the sorted list of
    distinct genre names seen in that column. Sorting gives the output columns
    a stable order; iterating a ``set`` of strings would not, because its order
    can differ between interpreter sessions.
    """

    def fit(self, X, y=None, delimeter=","):
        """Collect the distinct genre names found in every column of ``X``.

        Parameters
        ----------
        X : DataFrame whose cells are strings of genre names joined by
            ``delimeter``.
        y : ignored.
        delimeter : separator between genre names inside a cell (the
            misspelled parameter name is kept for backward compatibility).

        Returns
        -------
        self
        """
        # Remember the separator so transform splits cells the same way.
        self.delimeter_ = delimeter
        self.genres = {}
        for col in X.columns:
            found = set()
            for cell in X[col]:
                found.update(part.strip() for part in cell.split(delimeter))
            self.genres[col] = sorted(found)
        return self

    def transform(self, X, y=None):
        """Return a 0/1 matrix with one column per (input column, genre) pair.

        A cell counts as containing a genre only when the genre equals one of
        its stripped, delimiter-separated entries. Substring or regex matches
        do not count.
        """
        delimiter = getattr(self, "delimeter_", ",")
        encoded = []
        for col, curr_genres in self.genres.items():
            # Split every cell once and reuse the token sets for all genres.
            tokens = X[col].map(
                lambda cell: {part.strip() for part in cell.split(delimiter)}
            )
            for genre in curr_genres:
                encoded.append(
                    tokens.map(lambda present, wanted=genre: wanted in present).astype(int)
                )
        # The stacked indicator Series form rows; transpose so they become columns.
        return np.c_[encoded].T

In [218]:
# Genre list -> imputed -> one indicator column per genre.
genre_pipeline = Pipeline(steps=[
    ("select_genre", DataFrameSelector(["Genre"])),
    ("imputer", MostFrequentImputer()),
    ("genre_encoder", GenreEncoder()),
])

# Fit on the training frame and display the encoded matrix.
genre_pipeline.fit_transform(X_train)

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 1, ..., 0, 0, 0]])

In [219]:
import re

class TitleEncoder(BaseEstimator, TransformerMixin):
    """Derive simple numeric features from columns of title strings."""

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Encode every column of ``X`` as four numeric features.

        For each input column, in order: whether the text contains a digit
        (0/1), the number of whitespace-separated words, whether it contains a
        colon (0/1) and whether it contains a hyphen (0/1).

        Returns
        -------
        numpy.ndarray
            One row per input row and four columns per input column.
        """
        encoded = []
        for col in X.columns:
            # Use the column being iterated rather than a hard-coded "Title",
            # so the features follow whichever columns were selected.
            titles = X[col]
            encoded.append(titles.map(lambda x: bool(re.search(r'\d', x))).astype(int))  # digits in the title
            encoded.append(titles.map(lambda x: len(x.split())))  # number of words in the title
            encoded.append(titles.str.contains(":", regex=False).astype(int))  # colon in the title
            encoded.append(titles.str.contains("-", regex=False).astype(int))  # hyphen in the title

        # The stacked feature Series form rows; transpose so they become columns.
        return np.c_[encoded].T

In [220]:
# Title -> imputed -> numeric title features.
title_pipeline = Pipeline(steps=[
    ("select_title", DataFrameSelector(["Title"])),
    ("imputer", MostFrequentImputer()),
    ("title_encoder", TitleEncoder()),
])

# Fit on the training frame and display the encoded matrix.
title_pipeline.fit_transform(X_train)

array([[0, 2, 0, 0],
       [0, 4, 0, 0],
       [0, 1, 0, 0],
       ...,
       [0, 3, 0, 0],
       [0, 5, 1, 0],
       [0, 5, 1, 0]], dtype=int64)

In [221]:
from sklearn.pipeline import FeatureUnion
# Concatenate the outputs of the four feature pipelines side by side.
preprocess_pipeline = FeatureUnion([
    ("simple_cat_pipeline", simple_cat_pipeline),
    ("date_pipeline", date_pipeline),
    ("genre_pipeline", genre_pipeline),
    ("title_pipeline", title_pipeline),
])

In [225]:
# Fit the union explicitly rather than relying on the sub-pipelines having been
# fitted as a side effect of earlier cells, then show the feature-matrix shape.
preprocess_pipeline.fit_transform(X_train).shape

(8754, 66)