# 1. Classification

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import  train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.model_selection import KFold, RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# load dataset function
def load_dataset(filename):
    #load data from csv file
    df_data= pd.read_csv(filename, header= None)
    #convert df to numpy array
    dataset= df_data.values
    # split dataset into input (X) and output (y)
    X = dataset[:, :-1]
    y = dataset[:, -1]
    # format X to str
    X= X.astype(str)
    
    return X, y

In [3]:
# prepare input function
def prepare_input(X_train, X_test):
    oe = OrdinalEncoder()
    oe.fit(X_train)
    X_train_enc= oe.transform(X_train)
    X_test_enc= oe.transform(X_test)
    
    return X_train_enc, X_test_enc

In [4]:
# prepare output function
def prepare_output(y_train, y_test):
    le = LabelEncoder()
    le.fit(y_train)
    y_train_enc= le.transform(y_train)
    y_test_enc= le.transform(y_test)
    
    return y_train_enc, y_test_enc

In [5]:
# load dataset
X, y= load_dataset('breast-cancer.csv')
print(X.shape)

(286, 9)


In [6]:
# split X,y into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state= 1)

In [7]:
# prepare input data
X_train_enc, X_test_enc= prepare_input(X_train, X_test)

In [8]:
# prepare output data
y_train_enc, y_test_enc= prepare_output(y_train, y_test)

### 1.1 DecisionTreeClassifier

In [9]:
# declare obj of DT model
clf= DecisionTreeClassifier()

# fit model on train set
clf.fit(X_train_enc, y_train_enc)

DecisionTreeClassifier()

In [10]:
#Evaluate model

# predict on test data
y_pred= clf.predict(X_test_enc)

#model accuracy
acc= metrics.accuracy_score(y_test_enc, y_pred)
print(f'Model acc: {acc}')
#model precision
pres= metrics.precision_score(y_test_enc, y_pred)
print(f'Model pre: {pres}')
#model recall
rec= metrics.recall_score(y_test_enc, y_pred)
print(f'Model rec: {rec}')

Model acc: 0.686046511627907
Model pre: 0.6470588235294118
Model rec: 0.34375


In [11]:
# use cross validation
# declare obj of DT model
clf= DecisionTreeClassifier()

cv= KFold(n_splits= 3, shuffle= True, random_state= 1)

# encode data
oe= OrdinalEncoder()
X_enc= oe.fit_transform(X)
le = LabelEncoder()
y_enc= le.fit_transform(y)

# evaluate model
result= cross_val_score(clf, X_enc, y_enc, cv=cv, scoring= 'accuracy')
print(result.mean())

0.6536549707602339


In [12]:
# use repeat cross validation

# declare obj of DT model
clf= DecisionTreeClassifier()

# define the model evaluation precedure
cv= RepeatedStratifiedKFold(n_splits= 5, n_repeats= 10, random_state=1)

# encode data
oe= OrdinalEncoder()
X_enc= oe.fit_transform(X)
le = LabelEncoder()
y_enc= le.fit_transform(y)

# evaluate model
scores= cross_val_score(clf, X_enc, y_enc, cv=cv, scoring= 'accuracy', n_jobs= -1)
print(scores.mean())

0.6499758015728978


### 1.2 KNeighborsClassifier

In [16]:
# Train model
# declare obj of KC model
neigh = KNeighborsClassifier()

# fit model on train set
neigh.fit(X_train_enc, y_train_enc)

KNeighborsClassifier()

In [18]:
# Evaluate model

# predict on test data
y_pred= neigh.predict(X_test_enc)

#model accuracy
acc= metrics.accuracy_score(y_test_enc, y_pred)
print(f'Model acc: {acc}')
#model precision
pres= metrics.precision_score(y_test_enc, y_pred)
print(f'Model pre: {pres}')
#model recall
rec= metrics.recall_score(y_test_enc, y_pred)
print(f'Model rec: {rec}')

Model acc: 0.6627906976744186
Model pre: 0.6363636363636364
Model rec: 0.21875


In [19]:
# use cross validation
# declare obj of KC model
neigh = KNeighborsClassifier()

cv= KFold(n_splits= 3, shuffle= True, random_state= 1)

# encode data
oe= OrdinalEncoder()
X_enc= oe.fit_transform(X)
le = LabelEncoder()
y_enc= le.fit_transform(y)

# evaluate model
result= cross_val_score(neigh, X_enc, y_enc, cv=cv, scoring= 'accuracy')
print(result.mean())

0.6678362573099416


# 2. Regression

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

### 2.1 DecisionTreeRegressor

In [21]:
# load dataset function
def load_dataset_scale(filename):
    #load data from csv file
    df_data= pd.read_csv(filename, header= None)
    #convert df to numpy array
    dataset= df_data.values
    # split dataset into input (X) and output (y)
    X = dataset[:, :-1]
    y = dataset[:, -1]
    
    scaler_s = StandardScaler()

    X= scaler_s.fit_transform(X)
    
    return X, y

In [23]:
# load data
X, y= load_dataset_scale('housing.csv')
print(X.shape)

(506, 13)


In [24]:
# split X,y into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state= 1)

In [25]:
# declare model obj
Dtr_ml= DecisionTreeRegressor()

# fit model on train set
Dtr_ml.fit(X_train, y_train)

DecisionTreeRegressor()

In [26]:
# Evaluate model

# predict on test data
y_pred= Dtr_ml.predict(X_test)

#model mse
mse= mean_squared_error(y_test, y_pred)
print(f'Model mse: {mse}')
#model mae
mae= mean_absolute_error(y_test, y_pred)
print(f'Model mae: {mae}')

Model mse: 16.558092105263157
Model mae: 2.8743421052631577


### 2.2 RandomForestRegressor

In [27]:
# load dataset function
def load_dataset_scale(filename):
    #load data from csv file
    df_data= pd.read_csv(filename, header= None)
    #convert df to numpy array
    dataset= df_data.values
    # split dataset into input (X) and output (y)
    X = dataset[:, :-1]
    y = dataset[:, -1]
    
    scaler_s = StandardScaler()

    X= scaler_s.fit_transform(X)
    
    return X, y

In [28]:
# load data
X, y= load_dataset_scale('housing.csv')
print(X.shape)

(506, 13)


In [29]:
# split X,y into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state= 1)

In [30]:
# declare model obj
Rf_ml= RandomForestRegressor()

# fit model on train set
Rf_ml.fit(X_train, y_train)

RandomForestRegressor()

In [31]:
# Evaluate model

# predict on test data
y_pred= Rf_ml.predict(X_test)

#model mse
mse= mean_squared_error(y_test, y_pred)
print(f'Model mse: {mse}')
#model mae
mae= mean_absolute_error(y_test, y_pred)
print(f'Model mae: {mae}')

Model mse: 7.898784223684213
Model mae: 2.151539473684211
