In [114]:
#### from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from datetime import datetime
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.neighbors import KNeighborsRegressor

import numpy as np
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

def fit_and_print_scores(pipeline, X_train, test_X, y_train, test_y):
    pipeline.fit(X_train, y_train)
    regression_name = pipeline.named_steps["regressor"].__class__.__name__
    feature_selection_name = pipeline.named_steps["feature_selection"].__class__.__name__
    test_predictions = pipeline.predict(test_X)
    mae = mean_absolute_error(test_y, test_predictions)

    r2 = r2_score(test_y, test_predictions)
    
    feature_selection_step = pipeline.named_steps['feature_selection']

    # Get the selected features using the get_support() method
    selected_features = feature_selection_step.get_support()

    # Print the selected features
    print("Top 3 features from ",feature_selection_name,":")
    count = 0
    for feature, selected in zip(X_train.columns, selected_features):
        if selected:
            if feature == "Initial release date":
                print("-  Weekday")
            else:
                print("- ", feature)
            count += 1
            if count == 3:
                break
    
    print("\nScores for ", feature_selection_name, ":")
    print("Mean Absolute Error: " , mae)
    print("R^2 score:", r2)
    print("________________________________________________________")


def transform_to_weekday(value):
    date_format = "%B %d, %Y"
    date_object = datetime.strptime(value, date_format)
    return date_object.weekday()

def transform_to_quarter(value):
    # Perform your transformation logic here
    date_format = "%B %d, %Y"
    date_object = datetime.strptime(value, date_format)
    return (date_object.month - 1) // 3 + 1

def preprocess_multicat(dataFrame, columns, separator): 
    mlb = MultiLabelBinarizer() # Not possible on a pipeline https://github.com/scikit-learn/scikit-learn/issues/11309
    for cat in columns:
        dataFrame[cat] = dataFrame[cat].str.split(separator).apply(lambda x: [str(i).strip() for i in x])
        encoded_features = mlb.fit_transform(dataFrame[cat])
        encoded_df = pd.DataFrame(encoded_features, columns=mlb.classes_)
        dataFrame = pd.concat([dataFrame, encoded_df], axis=1)
    return dataFrame


# Custom transformer to extract release quarter
    # For some godforsaken reason, it did not recognize the release data as a Datetime object, so I had to apply a transform 
def extract_release_quarter(df):
    df['Release Quarter'] = df['Initial release date'].apply(transform_to_quarter)
    return df

# Custom transformer to extract release day of week
def extract_release_day_of_week(df):
    # Could not get rid of the original column, so I replaced the values with the weekday column's values
    df['Initial release date'] = df['Initial release date'].apply(transform_to_weekday)
    return df

def drop_column(df, column_to_drop):
    return df.drop(column_to_drop)

# Define the pipeline

categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ] 
)

pipeline = Pipeline(
    steps=[
    ('release_quarter', FunctionTransformer(extract_release_quarter)),
    ('release_day_of_week', FunctionTransformer(extract_release_day_of_week)),
    #('drop_column', FunctionTransformer(drop_column, kw_args={'column_to_drop': 'Initial release date'}))
    
])

# Define the column transformer for applying transformations to specific columns
preprocessor = ColumnTransformer([
    ('release_date_transformer', pipeline, ['Initial release date']),
    ('series_encoder', categorical_transformer, ["Series"])
])

regr_forest = Pipeline(
    steps=[("preprocessor", preprocessor), 
           ("feature_selection", 'passthrough'),# placeholder for when we loop 3 different methods into it.
           ("regressor", KNeighborsRegressor(n_neighbors=4))]
)
df = pd.read_csv('best-selling video games of all time.csv')  
df = df.drop(df.index[:5]) # The top 5 are the outliers of the outliers, remove them.
df = df.reset_index(drop=True) # and reset the index in case we loop with index later on

categorical_features = [ "Platform(s)","Developer(s)",  "Publisher(s)"] # not suited for OneHotEncoder
df = preprocess_multicat(df, categorical_features, '/')

X = pd.concat([df['Initial release date'], df.iloc[:, 8:], df['Series']], axis=1)

y = df['Sales']

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=10)

print("________________________PIPELINE RESULTS________________________")
for feature_selection_method in [SelectKBest(k=4),SelectFromModel(GradientBoostingRegressor()),  RFE(estimator=LogisticRegression()) ]:
    regr_forest.set_params(feature_selection=feature_selection_method)                                                                                              
    fit_and_print_scores(regr_forest, train_X, test_X, train_y, test_y)

    

________________________PIPELINE RESULTS________________________
Top 3 features from  SelectKBest :
-  Xbox 360
-  Bandai Namco Studios
-  Blizzard Entertainment

Scores for  SelectKBest :
Mean Absolute Error:  5607777.777777778
R^2 score: -0.011072460068845436
________________________________________________________
Top 3 features from  SelectFromModel :
-  Weekday
-  Game Boy
-  Nintendo Switch

Scores for  SelectFromModel :
Mean Absolute Error:  6647313.055555556
R^2 score: 0.03636138588046178
________________________________________________________
Top 3 features from  RFE :
-  Weekday
-  Game Boy
-  Multi-platform

Scores for  RFE :
Mean Absolute Error:  6485090.833333333
R^2 score: -0.09201565280949309
________________________________________________________
