# 0.0 import data

In [1]:
import pandas as pd
import numpy as np
import sklearn

from sklearn.model_selection      import train_test_split
from sklearn.preprocessing        import StandardScaler
from sklearn.neighbors            import KNeighborsClassifier
from sklearn.tree                 import DecisionTreeClassifier
from sklearn.ensemble             import RandomForestClassifier
from sklearn.linear_model         import LogisticRegression
from sklearn.metrics              import accuracy_score, precision_score, recall_score, f1_score

# 1.0 load dataset

In [2]:
X_train = pd.read_csv('../projetoaluno/dataclass/X_training.csv')
y_train = pd.read_csv('../projetoaluno/dataclass/y_training.csv')

X_val = pd.read_csv('../projetoaluno/dataclass/X_validation.csv')
y_val = pd.read_csv('../projetoaluno/dataclass/y_validation.csv')

X_test = pd.read_csv('../projetoaluno/dataclass/X_test.csv')
y_test = pd.read_csv('../projetoaluno/dataclass/y_test.csv')

# 2.0 params definition

In [7]:
params = {
    'KNN': {'n_neighbors': 5},
    'Decision Tree': {'max_depth': 100},
    'Random Forest': {'n_estimators': 100, 'max_depth': 100},
    'Logistic Regression': {'C': 1.0, 'solver': 'liblinear', 'max_iter': 1000}
}

# 3.0 model training and prediciton

## 3.1 train dataset

In [4]:
results = []

# KNN
knn = KNeighborsClassifier(n_neighbors=params['KNN']['n_neighbors'])
knn.fit(X_train, y_train)
y_pred = knn.predict(X_train)
results.append({
    'Model': 'KNN',
    'Accuracy': accuracy_score(y_train, y_pred),
    'Precision': precision_score(y_train, y_pred),
    'Recall': recall_score(y_train, y_pred),
    'F1-Score': f1_score(y_train, y_pred)
})

# Decision Tree
dt = DecisionTreeClassifier(max_depth=params['Decision Tree']['max_depth'])
dt.fit(X_train, y_train)
y_pred = dt.predict(X_train)
results.append({
    'Model': 'Decision Tree',
    'Accuracy': accuracy_score(y_train, y_pred),
    'Precision': precision_score(y_train, y_pred),
    'Recall': recall_score(y_train, y_pred),
    'F1-Score': f1_score(y_train, y_pred)
})

# Random Forest
rf = RandomForestClassifier(n_estimators=params['Random Forest']['n_estimators'],
                             max_depth=params['Random Forest']['max_depth'])
rf.fit(X_train, y_train)
y_pred = rf.predict(X_train)
results.append({
    'Model': 'Random Forest',
    'Accuracy': accuracy_score(y_train, y_pred),
    'Precision': precision_score(y_train, y_pred),
    'Recall': recall_score(y_train, y_pred),
    'F1-Score': f1_score(y_train, y_pred)
})

# Logistic Regression
lr = LogisticRegression(C=params['Logistic Regression']['C'],
                         solver=params['Logistic Regression']['solver'],
                         max_iter=params['Logistic Regression']['max_iter'])

lr.fit(X_train, y_train)
y_pred = lr.predict(X_train)
results.append({
    'Model': 'Logistic Regression',
    'Accuracy': accuracy_score(y_train, y_pred),
    'Precision': precision_score(y_train, y_pred),
    'Recall': recall_score(y_train, y_pred),
    'F1-Score': f1_score(y_train, y_pred)
})


  return self._fit(X, y)
  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)


In [5]:
data_results = pd.DataFrame(results)

In [6]:
data_results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,KNN,0.781562,0.755893,0.732563,0.744046
1,Decision Tree,1.0,1.0,1.0,1.0
2,Random Forest,1.0,1.0,1.0,1.0
3,Logistic Regression,0.793132,0.729216,0.831424,0.776973


## 3.2 test dataset

In [8]:
results = []

# KNN
knn = KNeighborsClassifier(n_neighbors=params['KNN']['n_neighbors'])
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
results.append({
    'Model': 'KNN',
    'Accuracy': accuracy_score(y_test, y_pred), 
    'Precision': precision_score(y_test, y_pred),  
    'Recall': recall_score(y_test, y_pred),  
    'F1-Score': f1_score(y_test, y_pred)  
})

# Decision Tree
dt = DecisionTreeClassifier(max_depth=params['Decision Tree']['max_depth'])
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)  
results.append({
    'Model': 'Decision Tree',
    'Accuracy': accuracy_score(y_test, y_pred),  
    'Precision': precision_score(y_test, y_pred),  
    'Recall': recall_score(y_test, y_pred),  
    'F1-Score': f1_score(y_test, y_pred)  
})

# Random Forest
rf = RandomForestClassifier(n_estimators=params['Random Forest']['n_estimators'],
                             max_depth=params['Random Forest']['max_depth'])
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)  
results.append({
    'Model': 'Random Forest',
    'Accuracy': accuracy_score(y_test, y_pred),  
    'Precision': precision_score(y_test, y_pred),  
    'Recall': recall_score(y_test, y_pred),  
    'F1-Score': f1_score(y_test, y_pred) 
})

# Logistic Regression
lr = LogisticRegression(C=params['Logistic Regression']['C'],
                         solver=params['Logistic Regression']['solver'],
                         max_iter=params['Logistic Regression']['max_iter'])
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test) 
results.append({
    'Model': 'Logistic Regression',
    'Accuracy': accuracy_score(y_test, y_pred),  
    'Precision': precision_score(y_test, y_pred),  
    'Recall': recall_score(y_test, y_pred),  
    'F1-Score': f1_score(y_test, y_pred)
})

  return self._fit(X, y)
  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)


In [9]:
data_results_test = pd.DataFrame(results)

In [10]:
data_results_test

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,KNN,0.670529,0.630816,0.601232,0.615669
1,Decision Tree,0.945043,0.935823,0.939199,0.937508
2,Random Forest,0.963426,0.972001,0.943863,0.957725
3,Logistic Regression,0.794655,0.733586,0.835636,0.781292


## 3.3 validation dataset

In [52]:
results = []

# KNN
knn = KNeighborsClassifier(n_neighbors=params['KNN']['n_neighbors'])
knn.fit(X_train, y_train)
y_pred = knn.predict(X_val)
results.append({
    'Model': 'KNN',
    'Accuracy': accuracy_score(y_val, y_pred),
    'Precision': precision_score(y_val, y_pred),
    'Recall': recall_score(y_val, y_pred),
    'F1-Score': f1_score(y_val, y_pred)
})

# Decision Tree
dt = DecisionTreeClassifier(max_depth=params['Decision Tree']['max_depth'])
dt.fit(X_train, y_train)
y_pred = dt.predict(X_val)
results.append({
    'Model': 'Decision Tree',
    'Accuracy': accuracy_score(y_val, y_pred),
    'Precision': precision_score(y_val, y_pred),
    'Recall': recall_score(y_val, y_pred),
    'F1-Score': f1_score(y_val, y_pred)
})

# Random Forest
rf = RandomForestClassifier(n_estimators=params['Random Forest']['n_estimators'],
                             max_depth=params['Random Forest']['max_depth'])
rf.fit(X_train, y_train)
y_pred = rf.predict(X_val)
results.append({
    'Model': 'Random Forest',
    'Accuracy': accuracy_score(y_val, y_pred),
    'Precision': precision_score(y_val, y_pred),
    'Recall': recall_score(y_val, y_pred),
    'F1-Score': f1_score(y_val, y_pred)
})

# Logistic Regression
lr = LogisticRegression(C=params['Logistic Regression']['C'],
                         solver=params['Logistic Regression']['solver'],
                         max_iter=params['Logistic Regression']['max_iter'])
lr.fit(X_train, y_train)
y_pred = lr.predict(X_val)
results.append({
    'Model': 'Logistic Regression',
    'Accuracy': accuracy_score(y_val, y_pred),
    'Precision': precision_score(y_val, y_pred),
    'Recall': recall_score(y_val, y_pred),
    'F1-Score': f1_score(y_val, y_pred)
})


In [53]:
data_results_val = pd.DataFrame(results)

In [54]:
data_results_val

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,KNN,0.675665,0.631775,0.603163,0.617138
1,Decision Tree,0.906689,0.906531,0.874898,0.890434
2,Random Forest,0.924611,0.918522,0.906452,0.912447
3,Logistic Regression,0.794202,0.731068,0.830722,0.777716
