# Lab Models for Regression and Classification

In [70]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

## Regression models

## Classification Models

In [2]:
dfB = pd.read_csv("../data/dfIPSA_clean.csv")
dfB.head(5)

Unnamed: 0,NOME,FAZ,TAL,dosismad,semsmad,edad,cortes,vejez,sacarosa,mes,periodo,TCH,lluvias,grupo_tenencia,pct_diatrea,sacarosa_nivel,TCH_nivel
0,AMAIME SILCA,81291,40,0.8,8.3,12.3,4,2.4,14.0,12,202012,112,137,3,6.2,Alto,Bajo
1,AMAIME SILCA,81291,41,0.8,6.3,11.2,2,2.3,13.0,3,201903,157,0,3,3.5,Medio,Alto
2,AMAIME SILCA,81291,41,0.6,7.9,12.2,3,1.8,13.3,3,202003,167,68,3,4.3,Alto,Alto
3,AMAIME SILCA,81291,43,0.8,6.6,13.1,1,2.5,13.4,3,201903,156,0,3,3.5,Alto,Alto
4,AMAIME SILCA,81291,43,0.6,8.1,12.2,2,2.1,14.0,3,202003,151,68,3,4.3,Alto,Medio


In [None]:
targetSac = dfB['sacarosa_nivel']
dfB.drop('sacarosa_nivel', axis=1, inplace=True)
targetSac.value_counts()

sacarosa_nivel
Bajo     761
Alto     716
Medio    710
Name: count, dtype: int64

In [7]:
targetTCH = dfB['TCH_nivel']
dfB.drop('TCH_nivel', axis=1, inplace=True)
targetTCH.value_counts()

TCH_nivel
Bajo     754
Alto     728
Medio    705
Name: count, dtype: int64

In [11]:
dfB.drop(['sacarosa', 'TCH'], axis=1, inplace=True) #Take out the original predictor continous variables

In [15]:
dfB.drop(['NOME', 'FAZ', 'TAL'], axis=1, inplace=True) #Drop the identifier columns

In [16]:
dfB.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2187 entries, 0 to 2186
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   dosismad        2187 non-null   float64
 1   semsmad         2187 non-null   float64
 2   edad            2187 non-null   float64
 3   cortes          2187 non-null   int64  
 4   vejez           2187 non-null   float64
 5   mes             2187 non-null   int64  
 6   periodo         2187 non-null   int64  
 7   lluvias         2187 non-null   int64  
 8   grupo_tenencia  2187 non-null   int64  
 9   pct_diatrea     2187 non-null   float64
dtypes: float64(5), int64(5)
memory usage: 171.0 KB


In [18]:
from sklearn.linear_model import LogisticRegression

### Logistic Regression

In [82]:
#Variables for the Sucrose models
X_trainSac, X_testSac, y_trainSac, y_testSac = train_test_split(dfB, targetSac, test_size=0.20, random_state=42)
scalerSac = StandardScaler()
X_trainSac = scalerSac.fit_transform(X_trainSac)
X_testSac = scalerSac.transform(X_testSac)

In [84]:
#models for the TCH models
X_trainTCH, X_testTCH, y_trainTCH, y_testTCH = train_test_split(dfB, targetTCH, test_size=0.20, random_state=42)
scalerTCH = StandardScaler()
X_trainTCH = scalerTCH.fit_transform(X_trainTCH)
X_testTCH = scalerTCH.transform(X_testTCH)

#### Sucrose Levels

In [83]:
solvers = ['lbfgs', 'newton-cg', 'saga']
C_values = [0.01, 0.1, 1, 10]

for solver in solvers:
    for c in C_values:
        print(f"\n--- Solver: {solver}, C: {c} ---")
        logModel = LogisticRegression(
            solver=solver,
            C=c,
            penalty='l2',
            max_iter=500,
            random_state=42
        )
        
        logModel.fit(X_trainSac, y_trainSac)

        y_pred = logModel.predict(X_testSac)

        print("Accuracy: ", accuracy_score(y_testSac, y_pred))
        print("Precision: ", precision_score(y_testSac, y_pred, average='macro'))
        print("Recall: ", recall_score(y_testSac, y_pred, average='macro'))
        print("F1 Score: ", f1_score(y_testSac, y_pred, average='macro'))
        print("Kappa Score: ", cohen_kappa_score(y_testSac, y_pred))


--- Solver: lbfgs, C: 0.01 ---
Accuracy:  0.4041095890410959
Precision:  0.39250799955864507
Recall:  0.39933265205004337
F1 Score:  0.39463325504780117
Kappa Score:  0.103219429539678

--- Solver: lbfgs, C: 0.1 ---
Accuracy:  0.4018264840182648
Precision:  0.38920491273432445
Recall:  0.39673913043478254
F1 Score:  0.39150920245398774
Kappa Score:  0.09927474804558722

--- Solver: lbfgs, C: 1 ---
Accuracy:  0.3995433789954338
Precision:  0.38729468852001186
Recall:  0.3946023782980304
F1 Score:  0.3896087445091429
Kappa Score:  0.09596459010218039

--- Solver: lbfgs, C: 10 ---
Accuracy:  0.3995433789954338
Precision:  0.38729468852001186
Recall:  0.3946023782980304
F1 Score:  0.3896087445091429
Kappa Score:  0.09596459010218039

--- Solver: newton-cg, C: 0.01 ---
Accuracy:  0.4041095890410959
Precision:  0.39250799955864507
Recall:  0.39933265205004337
F1 Score:  0.39463325504780117
Kappa Score:  0.103219429539678

--- Solver: newton-cg, C: 0.1 ---
Accuracy:  0.4018264840182648
Preci

#### TCH Levels

In [85]:
solvers = ['lbfgs', 'newton-cg', 'saga']
C_values = [0.01, 0.1, 1, 10]

for solver in solvers:
    for c in C_values:
        print(f"\n--- Solver: {solver}, C: {c} ---")
        logModel = LogisticRegression(
            solver=solver,
            C=c,
            penalty='l2',
            max_iter=500,
            random_state=42
        )
        
        logModel.fit(X_trainTCH, y_trainTCH)

        y_pred = logModel.predict(X_testTCH)

        print("Accuracy: ", accuracy_score(y_testTCH, y_pred))
        print("Precision: ", precision_score(y_testTCH, y_pred, average='macro'))
        print("Recall: ", recall_score(y_testTCH, y_pred, average='macro'))
        print("F1 Score: ", f1_score(y_testTCH, y_pred, average='macro'))
        print("Kappa Score: ", cohen_kappa_score(y_testTCH, y_pred))


--- Solver: lbfgs, C: 0.01 ---
Accuracy:  0.4634703196347032
Precision:  0.44908297720797724
Recall:  0.45113916960479744
F1 Score:  0.4436151653643914
Kappa Score:  0.18769828116861587

--- Solver: lbfgs, C: 0.1 ---
Accuracy:  0.4726027397260274
Precision:  0.4586486352581505
Recall:  0.4603805294048822
F1 Score:  0.45390946502057616
Kappa Score:  0.20170109356014587

--- Solver: lbfgs, C: 1 ---
Accuracy:  0.4680365296803653
Precision:  0.4541567088050926
Recall:  0.4557508997752526
F1 Score:  0.4499782610893723
Kappa Score:  0.195005363791254

--- Solver: lbfgs, C: 10 ---
Accuracy:  0.4680365296803653
Precision:  0.4541567088050926
Recall:  0.4557508997752526
F1 Score:  0.4499782610893723
Kappa Score:  0.195005363791254

--- Solver: newton-cg, C: 0.01 ---
Accuracy:  0.4634703196347032
Precision:  0.44908297720797724
Recall:  0.45113916960479744
F1 Score:  0.4436151653643914
Kappa Score:  0.18769828116861587

--- Solver: newton-cg, C: 0.1 ---
Accuracy:  0.4726027397260274
Precision: 

# De arriba solo elegi 1 de TCH y otro de Sac los que tengan el mejor f1 primero (más cercano a 2), luego kappa (más alto) y luego el resto. Pero creo que se repiten (Pero no estaba seguro del solver). Esos los metes al cross_val_score (metes esos parámetros)

In [87]:
#Este para SAC
logModel = LogisticRegression(solver=solver,C=c, penalty='l2', max_iter=500, random_state=42)
scores = cross_val_score(logModel, X_trainSac, y_trainSac, cv=5, scoring='f1_macro')
print("Cross validation F1 Score: ", scores.mean())
print("Cross validation F1 Score std: ", scores.std())

Cross validation F1 Score:  0.42512681187248447
Cross validation F1 Score std:  0.0324527737073013


In [88]:
#Este para TCH
logModel = LogisticRegression(solver=solver,C=c, penalty='l2', max_iter=500, random_state=42)
scores = cross_val_score(logModel, X_trainTCH, y_trainTCH, cv=5, scoring='f1_macro')
print("Cross validation F1 Score: ", scores.mean())
print("Cross validation F1 Score std: ", scores.std())

Cross validation F1 Score:  0.44794642518309136
Cross validation F1 Score std:  0.016276443405243802


### K-nearest Neighbors

In [72]:
from sklearn.neighbors import KNeighborsClassifier

#### Sucrose Levels

In [91]:
k_values = [3, 5, 7, 9, 10, 15, 20, 30, 100]

for k in k_values:
    print(f"\n--- KNN with k = {k} ---")

    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_trainSac, y_trainSac)
    y_pred = knn.predict(X_testSac)

    print("Accuracy:     ", accuracy_score(y_testSac, y_pred))
    print("Precision:    ", precision_score(y_testSac, y_pred, average='macro'))
    print("Recall:       ", recall_score(y_testSac, y_pred, average='macro'))
    print("F1 Score:     ", f1_score(y_testSac, y_pred, average='macro'))
    print("Kappa Score:  ", cohen_kappa_score(y_testSac, y_pred))


--- KNN with k = 3 ---
Accuracy:      0.4634703196347032
Precision:     0.45446097138250136
Recall:        0.4561810974854454
F1 Score:      0.45043433718059184
Kappa Score:   0.1897699900817078

--- KNN with k = 5 ---
Accuracy:      0.4589041095890411
Precision:     0.4358543540250055
Recall:        0.45117985878855443
F1 Score:      0.4382828993666432
Kappa Score:   0.1826814058956916

--- KNN with k = 7 ---
Accuracy:      0.4680365296803653
Precision:     0.4530504176944626
Recall:        0.46284683512944386
F1 Score:      0.4564039345945314
Kappa Score:   0.19916191911106929

--- KNN with k = 9 ---
Accuracy:      0.4657534246575342
Precision:     0.45330925169805814
Recall:        0.46098878979313757
F1 Score:      0.4544398934554699
Kappa Score:   0.19470111257778622

--- KNN with k = 10 ---
Accuracy:      0.4634703196347032
Precision:     0.4468281960209632
Recall:        0.45890623064536107
F1 Score:      0.44989816061800586
Kappa Score:   0.19171692423671305

--- KNN with k = 

#### TCH Levels

In [92]:
k_values = [3, 5, 7, 9, 10, 15, 20, 30, 100]

for k in k_values:
    print(f"\n--- KNN with k = {k} ---")

    knnModel = KNeighborsClassifier(n_neighbors=k)
    knnModel.fit(X_trainTCH, y_trainTCH)
    y_pred = knn.predict(X_testTCH)

    print("Accuracy:     ", accuracy_score(y_testTCH, y_pred))
    print("Precision:    ", precision_score(y_testTCH, y_pred, average='macro'))
    print("Recall:       ", recall_score(y_testTCH, y_pred, average='macro'))
    print("F1 Score:     ", f1_score(y_testTCH, y_pred, average='macro'))
    print("Kappa Score:  ", cohen_kappa_score(y_testTCH, y_pred))


--- KNN with k = 3 ---
Accuracy:      0.3242009132420091
Precision:     0.3297296791475126
Recall:        0.3226429904844728
F1 Score:      0.323478816121275
Kappa Score:   -0.022710599594538028

--- KNN with k = 5 ---
Accuracy:      0.3242009132420091
Precision:     0.3297296791475126
Recall:        0.3226429904844728
F1 Score:      0.323478816121275
Kappa Score:   -0.022710599594538028

--- KNN with k = 7 ---
Accuracy:      0.3242009132420091
Precision:     0.3297296791475126
Recall:        0.3226429904844728
F1 Score:      0.323478816121275
Kappa Score:   -0.022710599594538028

--- KNN with k = 9 ---
Accuracy:      0.3242009132420091
Precision:     0.3297296791475126
Recall:        0.3226429904844728
F1 Score:      0.323478816121275
Kappa Score:   -0.022710599594538028

--- KNN with k = 10 ---
Accuracy:      0.3242009132420091
Precision:     0.3297296791475126
Recall:        0.3226429904844728
F1 Score:      0.323478816121275
Kappa Score:   -0.022710599594538028

--- KNN with k = 1

# Aqui haces lo mismo con estos dos grupitos

In [93]:
knnModel = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knnModel, X_trainSac, y_trainSac, cv=5, scoring='f1_macro')
print("Cross validation F1 Score: ", scores.mean())
print("Cross validation F1 Score std: ", scores.std())

Cross validation F1 Score:  0.47000238037595005
Cross validation F1 Score std:  0.038573153299882805


In [94]:
knnModel = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knnModel, X_trainTCH, y_trainTCH, cv=5, scoring='f1_macro')
print("Cross validation F1 Score: ", scores.mean())
print("Cross validation F1 Score std: ", scores.std())

Cross validation F1 Score:  0.4341828231769007
Cross validation F1 Score std:  0.014579155806458952


## Random Forest

#### Sucrose Levels

In [95]:
from sklearn.ensemble import RandomForestClassifier

In [98]:
n_estimators = [5, 10, 25, 50, 100, 150, 200]

for n in n_estimators:
    rfModel = RandomForestClassifier(n_estimators=n, random_state=42)
    rfModel.fit(X_trainSac, y_trainSac)
    y_pred = rfModel.predict(X_testSac)
    
    print(f"\n--- Random Forest with {n} trees ---")
    print("Accuracy:     ", accuracy_score(y_testSac, y_pred))
    print("Precision:    ", precision_score(y_testSac, y_pred, average='macro'))
    print("Recall:       ", recall_score(y_testSac, y_pred, average='macro'))
    print("F1 Score:     ", f1_score(y_testSac, y_pred, average='macro'))
    print("Kappa Score:  ", cohen_kappa_score(y_testSac, y_pred))


--- Random Forest with 5 trees ---
Accuracy:      0.4840182648401826
Precision:     0.4656637078599175
Recall:        0.4787718320327016
F1 Score:      0.469509538928852
Kappa Score:   0.223550451807229

--- Random Forest with 10 trees ---
Accuracy:      0.5091324200913242
Precision:     0.4949972135817353
Recall:        0.504304471695776
F1 Score:      0.4977508840333864
Kappa Score:   0.26203686289260864

--- Random Forest with 25 trees ---
Accuracy:      0.5365296803652968
Precision:     0.5254373497569792
Recall:        0.532608695652174
F1 Score:      0.5279016555290307
Kappa Score:   0.3032582631999625

--- Random Forest with 50 trees ---
Accuracy:      0.547945205479452
Precision:     0.5326462750197246
Recall:        0.5435247120029728
F1 Score:      0.5349238249011514
Kappa Score:   0.3200677392040643

--- Random Forest with 100 trees ---
Accuracy:      0.541095890410959
Precision:     0.5287583365192643
Recall:        0.5371376811594203
F1 Score:      0.5309039918414918
Kapp

#### TCH LEvels

In [99]:
for n in n_estimators:
    rfModel = RandomForestClassifier(n_estimators=n, random_state=42)
    rfModel.fit(X_trainTCH, y_trainTCH)
    y_pred = rfModel.predict(X_testTCH)
    
    print(f"\n--- Random Forest with {n} trees ---")
    print("Accuracy:     ", accuracy_score(y_testTCH, y_pred))
    print("Precision:    ", precision_score(y_testTCH, y_pred, average='macro'))
    print("Recall:       ", recall_score(y_testTCH, y_pred, average='macro'))
    print("F1 Score:     ", f1_score(y_testTCH, y_pred, average='macro'))
    print("Kappa Score:  ", cohen_kappa_score(y_testTCH, y_pred))


--- Random Forest with 5 trees ---
Accuracy:      0.45662100456621
Precision:     0.4353052240317878
Recall:        0.4406789002921541
F1 Score:      0.4352431688048126
Kappa Score:   0.1725484593037102

--- Random Forest with 10 trees ---
Accuracy:      0.4771689497716895
Precision:     0.4661650518793376
Recall:        0.46653259535172875
F1 Score:      0.4663222658232729
Kappa Score:   0.210959809312534

--- Random Forest with 25 trees ---
Accuracy:      0.5136986301369864
Precision:     0.5085226222914613
Recall:        0.5047643121456126
F1 Score:      0.5062296082448873
Kappa Score:   0.26775973439866896

--- Random Forest with 50 trees ---
Accuracy:      0.5228310502283106
Precision:     0.5173407169780782
Recall:        0.5146522353368503
F1 Score:      0.5156191545080434
Kappa Score:   0.28196721311475403

--- Random Forest with 100 trees ---
Accuracy:      0.5228310502283106
Precision:     0.5226425982893657
Recall:        0.5184063194695474
F1 Score:      0.5191294503808672

# El ultimo de esto

In [100]:
rfModel = RandomForestClassifier(n_estimators=n, random_state=42)
scores = cross_val_score(rfModel, X_trainSac, y_trainSac, cv=5, scoring='f1_macro')
print("Cross validation F1 Score: ", scores.mean())
print("Cross validation F1 Score std: ", scores.std())

Cross validation F1 Score:  0.5470370357867519
Cross validation F1 Score std:  0.01854838768123602


In [101]:
rfModel = RandomForestClassifier(n_estimators=n, random_state=42)
scores = cross_val_score(knnModel, X_trainTCH, y_trainTCH, cv=5, scoring='f1_macro')
print("Cross validation F1 Score: ", scores.mean())
print("Cross validation F1 Score std: ", scores.std())

Cross validation F1 Score:  0.4341828231769007
Cross validation F1 Score std:  0.014579155806458952
