In [None]:
%run ./imp.ipynb

In [None]:
%run ./score.ipynb

In [None]:
%run ./plot.ipynb

In [115]:
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from typing import List
import xgboost as xgb


class Classification(EM_impute):
    """Fit an XGBoost classifier and a majority-class baseline on one table.

    The instance keeps the input table, the feature column names and the
    target column name, splits the table into train/test partitions and
    stores the fitted model, its predictions and its class probabilities as
    attributes.

    ``EM_impute`` is not defined in this file (presumably it comes from the
    ``imp`` notebook run earlier); the only thing this class relies on is
    that its ``__init__`` accepts ``array``.
    """

    def __init__(self, data, x_cols: List[str], y_col: str, array = None):
        """Store the dataset description and initialise the parent class.

        Args:
            data: Table indexable by column name and by list of column names
                (a pandas ``DataFrame`` in the usage shown in this file).
            x_cols: Names of the feature columns. They must match the column
                names *after* ``_parse_columns`` has removed square brackets.
            y_col: Name of the target column.
            array: Forwarded unchanged to ``EM_impute.__init__``.
        """
        self.data = data
        self.x_cols = x_cols
        self.y_col = y_col
        super().__init__(array)

    @staticmethod
    def _save_data(data, name: str):
        """Write ``data`` as a CSV file to ``./data/<name>.csv``.

        Args:
            data: Anything accepted by the ``pd.DataFrame`` constructor.
            name: File name without extension.
        """
        # Local import keeps this block self-contained; the file's import
        # cell does not bring in pathlib.
        from pathlib import Path

        # to_csv does not create missing directories, so make sure the
        # target directory exists before writing.
        Path("./data").mkdir(parents=True, exist_ok=True)
        pd.DataFrame(data).to_csv(f"./data/{name}.csv")

    @staticmethod
    def _parse_columns(df, cols = None):
        """Return ``df`` with ``[`` and ``]`` removed from its column names.

        Presumably needed because XGBoost rejects feature names containing
        square brackets -- confirm against the XGBoost version in use.

        Args:
            df: DataFrame whose columns are renamed (a new frame is returned).
            cols: Optional rename mapping to extend. When given, it is
                updated in place with the computed ``old -> new`` pairs, as
                before. When omitted, a fresh mapping is used for every call
                so that nothing leaks from one call to the next.

        Returns:
            The result of ``df.rename(columns=...)`` with the full mapping.
        """
        if cols is None:
            cols = {}

        for col in list(df.columns.values):
            cols[col] = col.replace("[", "").replace("]", "")

        return df.rename(columns=cols)

    def _get_baseline_clf(self):
        """Fit a most-frequent-class baseline and print its test accuracy.

        Re-splits the data, stores the baseline's test-set predictions in
        ``self.dummy`` and prints the value of ``score`` on the test split.
        """
        self._split_data()

        dummy_clf = DummyClassifier(strategy="most_frequent")
        dummy_clf.fit(self.X_train, self.y_train)

        self.dummy = dummy_clf.predict(self.X_test)

        print(dummy_clf.score(self.X_test, self.y_test))

    def _split_data(self):
        """Split ``self.data`` into train/test partitions (33% test).

        Sets ``X_train``, ``X_test``, ``y_train`` and ``y_test``. The random
        state is fixed, so repeated calls on the same data give the same
        partitions.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.data[self.x_cols],
            self.data[self.y_col],
            test_size=0.33,
            random_state=42,
        )

    def _xgb_fit(self):
        """Clean the column names, split the data and fit an XGBoost model.

        Replaces ``self.data`` with the bracket-free version and stores the
        fitted classifier in ``self.fitted_clf``.
        """
        self.data = self._parse_columns(self.data)
        self._split_data()

        xgb_classifier = xgb.XGBClassifier()
        self.fitted_clf = xgb_classifier.fit(self.X_train, self.y_train)

    def _classify_xgboost(self):
        """Fit the XGBoost model and return its predictions on the test split.

        The predictions are also stored in ``self.xgb_predictions``.
        """
        self._xgb_fit()
        self.xgb_predictions = self.fitted_clf.predict(self.X_test)

        return self.xgb_predictions

    def _probas_xgboost(self):
        """Return the fitted model's class probabilities on the test split.

        The probabilities are also stored in ``self.xgb_probas``. If no model
        has been fitted yet, one is fitted first instead of failing with an
        ``AttributeError`` on ``self.fitted_clf``.
        """
        if not hasattr(self, "fitted_clf"):
            self._xgb_fit()

        self.xgb_probas = self.fitted_clf.predict_proba(self.X_test)

        return self.xgb_probas

    def _classify(self):
        """Run the full pipeline and persist the three result arrays.

        Order matters: the XGBoost step runs first because it strips the
        brackets from the column names, which the baseline's split then
        relies on.

        Returns:
            Tuple ``(y_test, dummy, xgb_predictions, xgb_probas)``.
        """
        self._classify_xgboost()
        self._get_baseline_clf()
        self._probas_xgboost()

        self._save_data(self.dummy, "dummy")
        # NOTE(review): the two file names below look swapped (probabilities
        # go to "xgb.csv", hard predictions to "xgb_proba.csv"). Left as-is
        # because other notebooks may read these files by name -- confirm
        # before renaming.
        self._save_data(self.xgb_probas, "xgb")
        self._save_data(self.xgb_predictions, "xgb_proba")

        return self.y_test, self.dummy, self.xgb_predictions, self.xgb_probas

# Disabled usage example: it is wrapped in a bare string literal, so nothing
# in it is executed. It reads ./ai4i2020.csv, keeps four feature columns plus
# the 'Machine failure' label and runs Classification._classify().
# The x_cols are given without square brackets on purpose: _classify() strips
# the brackets from the DataFrame's column names before the data is split.
'''    
df = pd.read_csv("./ai4i2020.csv")
df = df[['Air temperature [K]', 'Machine failure', 'Torque [Nm]', 'Tool wear [min]', 'Rotational speed [rpm]']]

c = Classification(
                    data = df,
                    x_cols = ['Air temperature K', 'Torque Nm', 'Tool wear min', 'Rotational speed rpm'],
                    y_col = 'Machine failure'  
                )

y, dummy, x, xgb_probas = c._classify()
'''


0.9693939393939394
