In [130]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
from sklearn.metrics import r2_score
from sklearn import preprocessing

In [190]:
# Sample request payload for `v`, parsed from JSON text into a dict.
# Keys read by `v`: "dataset_name", "y_column_name", "label_fields",
# "preprocessors" and "models" ("test_size" is optional and absent here).
# For each preprocessor, `v` reads "name", "selected", and the "name"/"value"
# pair of every entry in "values" (passed on as keyword arguments).
# "Label", "type" and "range" are not read anywhere in this notebook --
# presumably UI metadata ([min, max, step]); confirm against the front end.
body = json.loads('''{
   "dataset_name":"fish",
   "y_column_name":"Weight",
   "label_fields": ["Species"],
   "preprocessors":[
      {
         "name":"Variance Threshold",
         "selected":true,
         "values":[
            {
               "Label":"Threshold",
               "name": "threshold",
               "type":"range",
               "range":[
                  0,
                  1,
                  0.01
               ],
               "value":0.01
            }
         ]
      }, {
          "name": "Correlation",
          "selected": true,
          "values": [
                    {
                       "Label":"Threshold",
                       "name": "threshold",
                       "type":"range",
                       "range":[
                          0,
                          1,
                          0.01
                       ],
                       "value":0.9999
                    }
                 ]
      }
   ],
   "models": [
       {
           "name": "Linear Regression"
       }
   ]
}''')

In [191]:
class CoorelationModel:
    """Feature selector that drops columns strongly correlated with an earlier column.

    For every pair of columns in the fitting data, the later column (in column
    order) is marked for removal when the absolute pairwise correlation from
    ``DataFrame.corr()`` is strictly greater than ``threshold``.

    Attributes:
        col_corr: Set of column names marked for removal by the most recent ``fit``.
        threshold: Absolute-correlation cut-off used by ``fit``.
    """

    def __init__(self, fit_dataset: pd.DataFrame = None, threshold=0.1):
        """Create the selector and optionally fit it straight away.

        Args:
            fit_dataset: Data to fit on. ``None`` or an empty frame skips fitting.
            threshold: Absolute-correlation cut-off (strict ``>`` comparison).
        """
        self.col_corr = set()
        self.threshold = threshold
        # `None` replaces the shared `pd.DataFrame()` default instance; an empty
        # frame is still accepted and still means "do not fit yet".
        if fit_dataset is not None and not fit_dataset.empty:
            self.fit(fit_dataset)

    def fit(self, fit_dataset):
        """Recompute the set of columns to drop from ``fit_dataset``.

        The set is rebuilt from scratch on every call, so refitting on new data
        does not keep columns flagged by a previous fit.

        Returns:
            self, to allow call chaining.
        """
        corr_matrix = fit_dataset.corr().abs()
        columns = corr_matrix.columns
        # Only the lower triangle (j < i) is inspected, so of two correlated
        # columns the one appearing later in column order is the one dropped.
        # NaN correlations compare False and therefore never flag a column.
        self.col_corr = {
            columns[i]
            for i in range(len(columns))
            for j in range(i)
            if corr_matrix.iloc[i, j] > self.threshold
        }
        return self

    def transform(self, df: pd.DataFrame):
        """Return a copy of ``df`` without the columns flagged by ``fit``.

        Raises:
            KeyError: If a flagged column is not present in ``df``.
        """
        return df.drop(columns=list(self.col_corr))

class VarianceThModel:
    """Thin DataFrame-preserving wrapper around scikit-learn's ``VarianceThreshold``.

    Attributes:
        threshold: Threshold handed to ``VarianceThreshold``.
        thresold: Legacy misspelled alias of ``threshold``, kept for existing readers.
        var_thres: The underlying ``VarianceThreshold`` instance.
        col_corr: Unused by this class; kept so the attribute set is unchanged.
    """

    def __init__(self, fit_dataset: pd.DataFrame = None, threshold=0.1):
        """Create the selector and optionally fit it straight away.

        Args:
            fit_dataset: Data to fit on. ``None`` or an empty frame skips fitting.
            threshold: Value passed to ``VarianceThreshold``.
        """
        self.col_corr = set()
        self.threshold = threshold
        self.thresold = threshold  # legacy spelling retained for backward compatibility
        self.var_thres = VarianceThreshold(threshold)

        if fit_dataset is not None and not fit_dataset.empty:
            self.fit(fit_dataset)

    def fit(self, fit_dataset):
        """Fit the underlying selector on ``fit_dataset`` and return self."""
        self.var_thres.fit(fit_dataset)
        return self

    def transform(self, df: pd.DataFrame):
        """Return ``df`` restricted to the columns kept by the fitted selector.

        The result keeps the surviving column names and the original row index
        of ``df``, so it stays aligned with any target Series that shares that
        index (for example after ``train_test_split``).
        """
        kept_columns = df.columns[self.var_thres.get_support()]
        return pd.DataFrame(
            self.var_thres.transform(df),
            columns=kept_columns,
            index=df.index,
        )

In [192]:

def get_linear_regression_score(xt, xs, yt, ys):
    """Fit a linear regression on the training split and score it on the test split.

    Args:
        xt: Training features.
        xs: Test features.
        yt: Training target.
        ys: Test target.

    Returns:
        A 3-tuple ``(r2, model, [])``: the R^2 of the test predictions against
        ``ys``, the fitted ``LinearRegression`` instance, and an empty list
        (the predictions themselves are not returned).
    """
    model = LinearRegression().fit(xt, yt)
    r2 = r2_score(ys, model.predict(xs))
    return r2, model, []


def selection_variance_threshold(dataset: pd.DataFrame, threshold=0.2):
    """Build a ``VarianceThreshold`` selector with ``threshold``, fit it on ``dataset`` and return it."""
    return VarianceThreshold(threshold).fit(dataset)

def v(body):
    """Compare model scores with and without the selected feature-selection steps.

    Reads ``../server/files/{dataset_name}.feather``, label-encodes the columns
    listed in ``label_fields``, standardises every column, splits into train and
    test sets (``random_state=1``), applies each selected preprocessor in order,
    and scores each requested model on both the filtered and unfiltered features.

    Args:
        body: Mapping with the optional keys ``dataset_name``, ``label_fields``,
            ``test_size`` (default 0.2), ``y_column_name`` (default ``"Y"``),
            ``preprocessors`` and ``models``. Each preprocessor needs ``name``,
            ``selected`` and ``values``; each entry of ``values`` contributes one
            ``name=value`` keyword argument to the selector's constructor.

    Returns:
        A JSON string. On success, a list with one ``{"score", "scoreo"}`` object
        per recognised model (``score`` after feature selection, ``scoreo`` on the
        original features). If the dataset cannot be read or a required column is
        missing, an object with ``error``, ``message`` and ``message-2``.

    Raises:
        KeyError: If a selected preprocessor has an unknown ``name`` or lacks a
            required key.
    """
    dataset_name = body.get("dataset_name", "#")
    label_fields = body.get("label_fields", [])
    test_size = body.get("test_size", 0.2)
    y_column = body.get("y_column_name", "Y")
    preprocessors = body.get("preprocessors", [])
    comparison_models = body.get("models", [])
    print(comparison_models)

    try:
        df = pd.read_feather(f'../server/files/{dataset_name}.feather')
    except Exception as e:
        return json.dumps({'error': True, 'message': "Select a dataset name", 'message-2': str(e)})

    # Report bad column names through the same JSON error shape instead of
    # letting an uncaught KeyError escape further down.
    missing_columns = [c for c in [*label_fields, y_column] if c not in df.columns]
    if missing_columns:
        return json.dumps({
            'error': True,
            'message': "Select valid column names",
            'message-2': f"Missing columns: {missing_columns}",
        })

    for label_field in label_fields:
        df[label_field] = preprocessing.LabelEncoder().fit_transform(df[label_field])

    print(df.nunique())
    # NOTE(review): the scaler is fitted on the full frame (target included)
    # before the train/test split, so test rows influence the scaling, and every
    # column has unit variance before 'Variance Threshold' runs. Left unchanged
    # to preserve current results -- confirm whether this ordering is intended.
    df = pd.DataFrame(preprocessing.StandardScaler().fit_transform(df), columns=df.columns)

    xt, xs, yt, ys = train_test_split(
        df.drop(labels=[y_column], axis=1),
        df[y_column],
        test_size=test_size,
        random_state=1,
    )
    # Untouched copies, used to score the model without feature selection.
    xto, xso = xt.copy(), xs.copy()

    selectors = {
        'Variance Threshold': VarianceThModel,
        'Correlation': CoorelationModel,
    }
    for preprocessor in preprocessors:
        if not preprocessor['selected']:
            continue
        name = preprocessor['name']
        if name not in selectors:
            raise KeyError(f"Unknown preprocessor {name!r}; expected one of {sorted(selectors)}")
        params = {value['name']: value['value'] for value in preprocessor['values']}
        # Selectors are fitted on the training features only, then applied to both splits.
        fs = selectors[name](xt, **params)
        print(name)
        print("PP VT XT:", len(xt.columns))
        print("PP VT XTO:", len(xto.columns))
        xt, xs = fs.transform(xt), fs.transform(xs)
        print("PP VT XT:", len(xt.columns))
        print("PP VT XTO:", len(xto.columns), end='-'*50)

    scores = []
    for cm in comparison_models:
        # Models with any other name are ignored.
        if cm['name'] == 'Linear Regression':
            score, _, _ = get_linear_regression_score(xt, xs, yt, ys)
            scoreo, _, _ = get_linear_regression_score(xto, xso, yt, ys)
            scores.append({'score': score, 'scoreo': scoreo})

    return json.dumps(scores)
# Run the comparison on the sample request; the returned JSON string is the cell's output.
v(body)

[{'name': 'Linear Regression'}]
Species      7
Weight     101
Length1    116
Length2     93
Length3    124
Height     154
Width      152
dtype: int64
Variance Threshold
PP VT XT: 6
PP VT XTO: 6
PP VT XT: 6
PP VT XTO: 6--------------------------------------------------Correlation
PP VT XT: 6
PP VT XTO: 6
PP VT XT: 6
PP VT XTO: 6--------------------------------------------------

'[{"score": 0.8856778285607882, "scoreo": 0.8856778285607882}]'

In [50]:
# NOTE(review): `data` is not assigned anywhere in the visible notebook, and the
# recorded output of this cell is a NameError. Remove the cell or replace it
# with the intended variable before sharing.
data

NameError: name 'data' is not defined

In [124]:
# Load the fish CSV from the server's files directory (path is relative to the
# notebook's working directory) and preview it. This `df` is the frame the
# later `df.to_feather(...)` cell writes out.
df = pd.read_csv("../server/files/fish.csv")
df.head()

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.5200,4.0200
1,Bream,290.0,24.0,26.3,31.2,12.4800,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.7300,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.4440,5.1340
...,...,...,...,...,...,...,...
154,Smelt,12.2,11.5,12.2,13.4,2.0904,1.3936
155,Smelt,13.4,11.7,12.4,13.5,2.4300,1.2690
156,Smelt,12.2,12.1,13.0,13.8,2.2770,1.2558
157,Smelt,19.7,13.2,14.3,15.2,2.8728,2.0672


In [126]:
# Write the loaded frame as the feather file that `v` reads when
# dataset_name == "fish" (`../server/files/{dataset_name}.feather`).
# NOTE(review): this cell sits after the `v(body)` cell, so on a fresh
# Restart & Run All the feather file must already exist -- consider moving the
# CSV -> feather conversion above the pipeline.
df.to_feather("../server/files/fish.feather")