In [None]:
!pip install pandas
!pip install numpy
!pip install scikit-learn
!pip install openai



In [None]:
import ast
import os

import numpy as np
import openai
import pandas as pd

# Never embed the secret key in the notebook: it ends up in version control and
# in every shared copy. Export OPENAI_API_KEY in the environment that launches
# Jupyter instead. Any key that has ever been committed should be revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
"""
MODELS REIMPLEMENT

Labelled Classification:
  Linear SVC
  SGD Classifier
  Kernel Approximation
  KNeighbors Classifier
  SVC
  Ensemble Classifiers

Regression:
  Ridge Regression
  SGD Regressor
  LASSO
  ElasticNet
  SVR
  Ensemble Regressors
"""

'\nMODELS REIMPLEMENT\n\nLabelled Classification:\n  Linear SVC\n  SGD Classifier\n  Kernel Approximation\n  KNeighbors Classifier\n  SVC\n  Ensemble Classifiers\n\nRegression:\n  Ridge Regression\n  SGD Regressor\n  LASSO\n  ElasticNet\n  SVR\n  Ensemble Regressors\n'

In [None]:
import json
import os
import pickle
import uuid

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDClassifier, Ridge
from sklearn.metrics import accuracy_score, mean_squared_error as mse
from sklearn.model_selection import train_test_split as tt_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import PowerTransformer, LabelEncoder
from sklearn.svm import LinearSVC, SVC, SVR


class DAML:
  """Pick, train and rank scikit-learn models for a task described in plain English.

  Constructing an instance runs the whole pipeline: resolve the task and the
  target column (through the OpenAI chat API unless ``forceTask`` is given),
  clean a private copy of the data, split it 60/20/20 into
  train/validation/test, train the candidate models, sort them best-first and
  pickle the winner under ``./models``.

  Attributes set by the constructor:
    df: cleaned (NaN-free, label-encoded, power-transformed) copy of the input.
    task: task string; ``model()`` acts on 'classification (labelled)' and
      'regression' and trains nothing for any other value.
    target_column: name of the column being predicted.
    models: list of ``(fitted_model, test_score)`` tuples, best first
      (accuracy descending for classification, MSE ascending for regression).
    model_path: path of the pickled best model, or None if nothing was trained.
    label_encoders: ``{column: fitted LabelEncoder}`` for encoded columns.
    transformer: PowerTransformer fitted on the feature columns (or None).
  """

  def __init__(self, df, task, verbose = 0, forceTask=None):
    """Run the full pipeline.

    Args:
      df: pandas DataFrame holding the features and the target column. It is
        copied, so the caller's frame is never modified.
      task: free-text description of what to predict; only used when
        ``forceTask`` is None.
      verbose: 1 prints the resolved task/target; any truthy value prints
        search progress.
      forceTask: optional ``(task, target_column)`` tuple that bypasses the
        language-model call.

    Raises:
      ValueError: if the resolved target column is not a column of ``df``.
    """
    # Work on a copy: clean() drops rows and rewrites columns, and the same
    # frame is commonly passed to several DAML instances.
    self.df = df.copy()
    self.verbose = verbose
    self.models = []
    self.model_path = None
    self.label_encoders = {}
    self.transformer = None

    if forceTask is None:
      self.task, self.target_column = self.process_NLP_task(task)
      if verbose == 1:
        print("TASK:", self.task)
        print("TARGET COLUMN:", self.target_column)
    else:
      self.task, self.target_column = forceTask

    # The language model can hallucinate a column; fail early and clearly.
    if self.target_column not in self.df.columns:
      raise ValueError(
        f"Target column {self.target_column!r} is not one of the dataset columns: "
        f"{list(self.df.columns)}"
      )

    self.clean()

    # Split data into 60% training, 20% validation, 20% test.
    # The same split is shared by every model for a fair comparison.
    X = self.df.drop(self.target_column, axis=1)
    y = self.df[self.target_column]

    X_temp, self.X_test, y_temp, self.y_test = tt_split(X, y, test_size=0.2, random_state=42)
    # 0.25 of the remaining 80% == 20% of the full data.
    self.X_train, self.X_val, self.y_train, self.y_val = tt_split(X_temp, y_temp, test_size=0.25, random_state=42)

    self.model()

    # Best model first: highest accuracy, or lowest MSE for regression.
    reverse_order = self.task != 'regression'
    self.models.sort(key=lambda x: x[1], reverse=reverse_order)

    if len(self.models) > 0:
      self.model_path = self._save_best_model()

  def _save_best_model(self):
    """Pickle ``self.models[0][0]`` under ./models and return the file path."""
    os.makedirs("./models", exist_ok=True)
    path = f"./models/{str(uuid.uuid4())}-model.sav"
    with open(path, 'wb') as handle:
      pickle.dump(self.models[0][0], handle)
    return path

  @staticmethod
  def _parse_reply(reply):
    """Parse the model's reply, accepting strict JSON or a Python dict literal."""
    try:
      return json.loads(reply)
    except ValueError:
      # Replies often use single quotes, which JSON rejects.
      return ast.literal_eval(reply)

  def process_NLP_task(self, task, processor="open-ai"):
    """Resolve a free-text request into ``(task, target_column)``.

    Args:
      task: the user's description of what they want to predict.
      processor: "open-ai" queries gpt-3.5-turbo; any other value returns a
        fixed stub answer without a network call.

    Returns:
      Tuple ``(task, target_column)`` taken from the 'task' and 'targetColumn'
      keys of the reply.

    Raises:
      ValueError / SyntaxError: if the reply is neither JSON nor a Python literal.
      KeyError: if the reply lacks 'task' or 'targetColumn'.
    """
    column_string = ', '.join(str(column) for column in self.df.columns)
    query = f"{task}. The columns for my dataset are {column_string}"
    # The allowed task names must match the strings model() dispatches on.
    message = "You are a natural language processor that can talk exclusively in JSON. You are tasked with analysing a query for a dataset. For example 'I want to classify more results based on an income of >=50k usd. The columns for my dataset are 'age','profession','nationality','income'' would return {'task': 'classification (labelled)', 'targetColumn': 'income'}. Target column must exist within the given set of columns, task must be one of 'classification (labelled)' or 'regression'. Your task is: " + query

    if processor == "open-ai":
      messages = [{"role":"user", "content": message}]
      chat = openai.ChatCompletion.create(model='gpt-3.5-turbo', messages = messages)
      reply = chat.choices[0].message.content
      response = self._parse_reply(reply)
      return (response['task'], response['targetColumn'])
    else:
      return ("classification (labelled)", "price_range")

  def clean(self):
    """Prepare ``self.df`` for modelling.

    Steps: drop rows containing NaN, label-encode every non-numeric column
    (the target included), then apply a Yeo-Johnson power transform to every
    feature column. The fitted encoders and transformer are kept on the
    instance so new data can be preprocessed the same way.
    """
    # .copy() so the column assignments below act on an independent frame.
    self.df = self.df.dropna(axis=0).copy()

    # Encode categorical data. Booleans count as numeric and are left alone.
    numeric_cols = set(self.df.select_dtypes(include=["number", "bool"]).columns)
    self.label_encoders = {}
    for col_name in self.df.columns:
      if col_name in numeric_cols:
        continue
      le = LabelEncoder()
      self.df[col_name] = le.fit_transform(self.df[col_name])
      self.label_encoders[col_name] = le

    # Normalise the features; the target is left untouched.
    X_cols = [col for col in self.df.columns if col != self.target_column]
    self.transformer = None
    if X_cols and len(self.df) > 0:
      # PowerTransformer fits each feature independently, so one fit over all
      # columns gives the same values as fitting column by column.
      self.transformer = PowerTransformer(method='yeo-johnson')
      transformed = self.transformer.fit_transform(self.df[X_cols].to_numpy(dtype=float))
      for idx, col in enumerate(X_cols):
        self.df[col] = transformed[:, idx]

  def model(self, big_dataset_size=100000):
    """Train the candidate models appropriate for ``self.task``.

    Args:
      big_dataset_size: row count at or above which the dataset is treated as
        large and only the SGD classifier is trained.

    Any task other than 'classification (labelled)' or 'regression' trains
    nothing, leaving ``self.models`` empty.
    """
    num_records = self.df.shape[0]
    if self.task == 'classification (labelled)':
      print('classification (labelled)')
      if num_records < big_dataset_size:
        print("< 100k")
        self.init_linearSVC()
        self.init_knn()
        self.init_SVC()
        self.init_RandomForestClassifier()
      else:
        print("> 100k")
        self.init_SGDClassifier()
        # TODO: kernel approximation is planned but not implemented.
    elif self.task == 'regression':
      print('regression')
      if num_records < big_dataset_size:
        print("< 100k")
        self.init_SVR(kernel='linear')
        self.init_SVR(kernel='rbf')
        self.init_RandomForestRegressor()
        # TODO: Ridge is planned but not implemented.
      else:
        # TODO: no large-dataset regressors yet; nothing is trained here.
        print("> 100k")

  def _register(self, model, metric):
    """Score an already fitted ``model`` on the test split and record it."""
    score = metric(self.y_test, model.predict(self.X_test))
    self.models.append((model, score))

  def _fit_and_register(self, model, metric):
    """Fit ``model`` on the training split, then score and record it."""
    model.fit(self.X_train, self.y_train)
    self._register(model, metric)

  def _stepwise_search(self, build, metric, minimise, improvement_threshold,
                       not_improved_number, describe, max_param=None):
    """Greedy search over an odd-valued hyper-parameter (1, 3, 5, ...).

    Each candidate is fitted on the training split and scored on the
    validation split. The search stops after ``not_improved_number``
    consecutive candidates fail to beat the best score by
    ``improvement_threshold``, or once the parameter exceeds ``max_param``.

    Returns:
      The best fitted model, or None if no candidate was tried.
    """
    best_model = None
    best_score = float('inf') if minimise else 0
    stalled = 0
    param = 1

    while stalled < not_improved_number and (max_param is None or param <= max_param):
      stalled += 1
      if self.verbose:
        print(describe(param, best_score))
      candidate = build(param)
      candidate.fit(self.X_train, self.y_train)
      score = metric(self.y_val, candidate.predict(self.X_val))
      if minimise:
        improved = score + improvement_threshold < best_score
      else:
        improved = score > best_score + improvement_threshold
      # Always keep the first candidate so a weak first score cannot leave
      # the search without a model.
      if best_model is None or improved:
        best_model = candidate
        best_score = score
        stalled = 0
      param += 2

    return best_model

  def init_linearSVC(self):
    """Train a LinearSVC and record its test accuracy."""
    self._fit_and_register(LinearSVC(random_state=0, max_iter=2000), accuracy_score)

  def init_knn(self, improvement_threshold=0.03, not_improved_number=2):
    """Search odd ``n_neighbors`` values and record the best KNN's test accuracy.

    Args:
      improvement_threshold: minimum validation-accuracy gain that counts as
        an improvement.
      not_improved_number: consecutive non-improving candidates that end the search.
    """
    best_model = self._stepwise_search(
      build=lambda k: KNeighborsClassifier(n_neighbors=k),
      metric=accuracy_score,
      minimise=False,
      improvement_threshold=improvement_threshold,
      not_improved_number=not_improved_number,
      describe=lambda k, best: f"KNN: trying {k} neighbors, currently at acc {best}",
      # Prediction fails if k is larger than the number of fitted samples.
      max_param=len(self.X_train),
    )
    if best_model is not None:
      self._register(best_model, accuracy_score)

  def init_SVC(self):
    """Train an SVC (default RBF kernel) and record its test accuracy."""
    self._fit_and_register(SVC(gamma='auto', max_iter=2000), accuracy_score)

  def init_RandomForestClassifier(self, improvement_threshold=0.03, not_improved_number=2):
    """Search odd ``max_depth`` values and record the best forest's test accuracy.

    Args:
      improvement_threshold: minimum validation-accuracy gain that counts as
        an improvement.
      not_improved_number: consecutive non-improving candidates that end the search.
    """
    best_model = self._stepwise_search(
      build=lambda depth: RandomForestClassifier(max_depth=depth, random_state=0),
      metric=accuracy_score,
      minimise=False,
      improvement_threshold=improvement_threshold,
      not_improved_number=not_improved_number,
      describe=lambda depth, best: f"RFC: trying depth {depth}, currently  at acc {best}",
    )
    if best_model is not None:
      # Record the best model, not the last candidate tried.
      self._register(best_model, accuracy_score)

  def init_SGDClassifier(self):
    """Train an SGDClassifier and record its test accuracy."""
    # Seeded like the other estimators so runs are reproducible.
    self._fit_and_register(SGDClassifier(max_iter=2000, random_state=0), accuracy_score)

  def init_SVR(self, kernel='linear'):
    """Train an SVR with the given ``kernel`` and record its test MSE."""
    self._fit_and_register(SVR(kernel=kernel), mse)

  def init_RandomForestRegressor(self, improvement_threshold=0.5, not_improved_number=2):
    """Search odd ``max_depth`` values and record the best forest's test MSE.

    Args:
      improvement_threshold: minimum validation-MSE reduction that counts as
        an improvement.
      not_improved_number: consecutive non-improving candidates that end the search.
    """
    best_model = self._stepwise_search(
      # Seeded like the classifier so the search is reproducible.
      build=lambda depth: RandomForestRegressor(max_depth=depth, criterion="squared_error", random_state=0),
      metric=mse,
      minimise=True,
      improvement_threshold=improvement_threshold,
      not_improved_number=not_improved_number,
      describe=lambda depth, best: f"RFR: trying depth {depth}, currently  at acc {best}",
    )
    if best_model is not None:
      self._register(best_model, mse)


In [None]:
# Load the first example dataset and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="train.csv")
df.head(n=5)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [None]:
# Let the language model infer the task and target column, then list every
# trained model with its test score (best first).
daml = DAML(df, "I want to classify future data based off its price range", verbose = 1)
for estimator, score in daml.models:
  print(estimator, score)

{'task': 'classification (labelled)', 'targetColumn': 'price_range'}
classification (labelled) price_range
classification (labelled)
price_range
classification (labelled)
< 100k
KNN: trying 1 neighbors, currently at acc 0
KNN: trying 3 neighbors, currently at acc 0.4675
KNN: trying 5 neighbors, currently at acc 0.4675
KNN: trying 7 neighbors, currently at acc 0.5275
KNN: trying 9 neighbors, currently at acc 0.5275
KNN: trying 11 neighbors, currently at acc 0.5275
KNN: trying 13 neighbors, currently at acc 0.585
KNN: trying 15 neighbors, currently at acc 0.585
KNN: trying 17 neighbors, currently at acc 0.585
KNN: trying 19 neighbors, currently at acc 0.585
KNN: trying 21 neighbors, currently at acc 0.585
RFC: trying depth 1, currently  at acc 0
RFC: trying depth 3, currently  at acc 0.7075
RFC: trying depth 5, currently  at acc 0.7925
RFC: trying depth 7, currently  at acc 0.825
RFC: trying depth 9, currently  at acc 0.825
RFC: trying depth 11, currently  at acc 0.8575
RFC: trying depth

In [None]:
# Same dataset, but the request names the target column explicitly.
daml = DAML(df, "I want to classify future data based off the price_range column", verbose = 1)
for estimator, score in daml.models:
  print(estimator, score)

In [None]:
# Load the possum dataset and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="possum.csv")
df.head(n=5)

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,1,1,Vic,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,2,1,Vic,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,3,1,Vic,f,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,4,1,Vic,f,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,5,1,Vic,f,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0


In [None]:
# Regression example: list every trained model with its test score (best first).
daml = DAML(df, "I want to predict future data based off the totlngth column", verbose=1)
for estimator, score in daml.models:
  print(estimator, score)

{'task': 'regression', 'targetColumn': 'totlngth'}
regression totlngth
regression
totlngth
regression
< 100k
RFR: trying depth 1, currently  at acc inf
RFR: trying depth 3, currently  at acc 8.876018109563896
RFR: trying depth 5, currently  at acc 7.417759489493173
RFR: trying depth 7, currently  at acc 6.434191499155776
RFR: trying depth 9, currently  at acc 6.434191499155776
RFR: trying depth 11, currently  at acc 6.434191499155776
RFR: trying depth 13, currently  at acc 6.434191499155776
RFR: trying depth 15, currently  at acc 6.434191499155776
SVR(kernel='linear') 5.76065704708543
RandomForestRegressor(max_depth=5) 6.271237210594025
SVR() 7.5232322604040025


In [None]:
# Load the heart-failure clinical records and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='heart_failure_clinical_records_dataset.csv')
df.head(n=5)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [None]:
# The request does not name a column; the language model has to pick the target.
daml = DAML(df, "I want to predict whether a patient died or not", verbose=1)
for estimator, score in daml.models:
  print(estimator, score)

TASK: classification (labelled)
TARGET COLUMN: DEATH_EVENT
classification (labelled)
< 100k
KNN: trying 1 neighbors, currently at acc 0
KNN: trying 3 neighbors, currently at acc 0.7
KNN: trying 5 neighbors, currently at acc 0.8
KNN: trying 7 neighbors, currently at acc 0.8333333333333334
KNN: trying 9 neighbors, currently at acc 0.8333333333333334
RFC: trying depth 1, currently  at acc 0
RFC: trying depth 3, currently  at acc 0.8666666666666667
RFC: trying depth 5, currently  at acc 0.9
RFC: trying depth 7, currently  at acc 0.9
LinearSVC(max_iter=2000, random_state=0) 0.7833333333333333
SVC(gamma='auto', max_iter=2000) 0.7333333333333333
RandomForestClassifier(max_depth=7, random_state=0) 0.7166666666666667
KNeighborsClassifier() 0.7


In [None]:
# Reload a saved model. The context manager closes the file handle, which the
# bare open() call leaked.
# SECURITY: unpickling executes arbitrary code; only load files you created.
# NOTE(review): DAML writes models to ./models/<uuid>-model.sav, so confirm
# that 'model.sav' is the file intended here.
with open('model.sav', 'rb') as model_file:
  m = pickle.load(model_file)