# THESE ALGORITHMS ARE USED TO HELP CONVERT DATASETS FOR PROCESSING BY OPF OR OTHER ALGORITHMS

In [None]:
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

#### FORMAT DATAFRAME TO .TXT

In [None]:
def format_df_to_txt(df, out_txt, target_group, generate_labels, n_features):
    """Write a DataFrame to a space-separated text file preceded by a header line.

    The first line of the file is ``"<n_samples> <n_classes> <n_features>"``.
    Each following line is one row of the frame, written positionally as
    ``"<int(column 0)> <int(column 1)> <remaining columns joined by spaces>"``.
    The frame must therefore already be laid out as (sample id, class column,
    features...) once the optional "Label" column has been inserted.
    The file has no trailing newline after the last row.

    Args:
        df: Source DataFrame. It is left untouched; all changes are made on a
            copy.
        out_txt: Path of the text file to create (overwritten if it exists).
        target_group: Name of the class column. Its distinct values are
            replaced by consecutive integers starting at 1.
        generate_labels: When true and the frame has no "Label" column, a
            "Label" column numbering the rows from 0 is inserted as the first
            column.
        n_features: Feature count written verbatim into the header; it is not
            checked against the frame.
    """
    # Work on a copy so the caller's class column is not overwritten.
    frame = df.copy()

    if generate_labels and "Label" not in frame:
        frame.insert(0, "Label", range(len(frame)))
        frame = frame.reset_index(drop=True)

    groups = frame.groupby(target_group)
    frame[target_group] = groups.ngroup().astype(int) + 1

    # Every row of the frame is written, so the header must report len(frame)
    # (the previous "len(df) - 1" under-reported the sample count by one).
    n_samples = len(frame)
    n_classes = groups.ngroups

    row_lines = [
        f"{int(row[0])} {int(row[1])} {' '.join(map(str, row[2:]))}"
        for row in frame.values.tolist()
    ]

    with open(out_txt, 'w') as txt_file:
        txt_file.write(f"{n_samples} {n_classes} {n_features}\n")
        txt_file.write("\n".join(row_lines))

    print(f"Dataset successfully converted, with {n_samples} samples, {n_classes} classes and {n_features} features!")

#### LOAD DATASET

In [None]:
# Load US_honey_dataset_updated.csv (looked up relative to the working directory) into a DataFrame.
df = pd.read_csv("US_honey_dataset_updated.csv")

In [None]:
df

Unnamed: 0,state,colonies_number,yield_per_colony,production,stocks,average_price,value_of_production,year
0,Alabama,16000,58,928000,28000,62.00,575000,1995
1,Arizona,52000,79,4108000,986000,68.00,2793000,1995
2,Arkansas,50000,60,3000000,900000,64.00,1920000,1995
3,California,420000,93,39060000,4687000,60.00,23436000,1995
4,Colorado,45000,60,2700000,1404000,68.00,1836000,1995
...,...,...,...,...,...,...,...,...
1110,Virginia,6000,40,79000,79000,8.23,1975000,2021
1111,Washington,96000,32,1206000,1206000,2.52,7741000,2021
1112,WestVirginia,6000,43,136000,136000,4.80,1238000,2021
1113,Wisconsin,42000,47,750000,750000,2.81,5547000,2021


In [None]:
# Export the honey frame to honey.txt: "state" is the class column, no "Label"
# column is generated, and the header declares 7 features.
format_df_to_txt(
    df,
    "honey.txt",
    target_group="state",
    generate_labels=False,
    n_features=7,
)

Dataset successfully converted, with 1115 samples, 44 classes and 7 features!


#### FORMAT .TXT TO DATAFRAME


In [None]:
def convert_txt_to_df(txt_file):
    """Read a whitespace-separated numeric text file into a DataFrame.

    The first line of the file is treated as a header and skipped. Every
    other non-blank line is split on whitespace and each token is parsed as a
    float. Columns are named ``Column_1`` .. ``Column_N``, where N is the
    number of values on the first data line.

    Args:
        txt_file: Path of the text file to read.

    Returns:
        A DataFrame with one row per data line, or an empty DataFrame when
        the file holds no data lines (empty file or header only).

    Raises:
        ValueError: If a token cannot be parsed as a float.
    """
    with open(txt_file, 'r') as handle:
        # Skip the header line; the default keeps an empty file from raising
        # StopIteration.
        next(handle, None)
        # Blank lines (e.g. a trailing newline) are ignored so they do not
        # turn into all-NaN rows.
        rows = [
            [float(token) for token in line.split()]
            for line in handle
            if line.strip()
        ]

    if not rows:
        return pd.DataFrame()

    columns = [f'Column_{i}' for i in range(1, len(rows[0]) + 1)]
    return pd.DataFrame(rows, columns=columns)

## USAGE EXAMPLES

##### LOAD DATAFRAMES

In [None]:
# Load the five example datasets. convert_txt_to_df skips each file's first
# line and names the columns Column_1 .. Column_N.
# NOTE(review): Column_1 is presumably the sample id and Column_2 the class
# label (the layout format_df_to_txt writes) — confirm. The metric helpers
# below keep every non-target column as a feature, so Column_1 is fed to the
# classifiers as well.
boat = convert_txt_to_df("boat.txt")
cone = convert_txt_to_df("cone.txt")
data1 = convert_txt_to_df("data1.txt")
data2 = convert_txt_to_df("data2.txt")
data3 = convert_txt_to_df("data3.txt")

#### SVM MODEL

In [None]:
def calc_svc_metrics(df, target_col, *, n_runs=10, test_size=0.2):
    """Print repeated hold-out metrics for a polynomial-kernel SVC.

    For each seed in ``range(n_runs)`` the data is split with
    ``train_test_split(..., random_state=seed)``, standardised with a
    ``StandardScaler`` fitted on the training split, and an
    ``SVC(kernel='poly', C=0.1, random_state=42)`` is trained and evaluated.
    The mean accuracy, the population standard deviation of the accuracies
    and the mean weighted F1 score are printed; nothing is returned.

    Every column other than ``target_col`` is used as a feature, including
    any identifier column present in ``df``.

    Args:
        df: DataFrame holding the features and the target column.
        target_col: Name of the column to predict.
        n_runs: Number of random splits to evaluate (default 10).
        test_size: Fraction of rows held out for testing in each split
            (default 0.2).

    Raises:
        ValueError: If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    X = df.drop(target_col, axis=1)
    y = df[target_col]
    accuracies = []
    f1_scores = []

    for seed in range(n_runs):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=seed
        )

        # Fit the scaler on the training split only, then reuse it on the test split.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        svm_classifier = SVC(kernel='poly', C=0.1, random_state=42)
        svm_classifier.fit(X_train, y_train)
        y_pred = svm_classifier.predict(X_test)

        accuracies.append(accuracy_score(y_test, y_pred))
        f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

    mean_acc = np.mean(accuracies)
    # Population standard deviation (ddof=0), matching the previous manual formula.
    standard_deviation = np.std(accuracies)
    mean_f1 = np.mean(f1_scores)

    print(f'Mean Acc: {mean_acc:.5f}')
    print(f'Standard deviation: {standard_deviation:.5f}')
    print(f'Mean F1: {mean_f1:.5f}')

##### CALCULATE SVC METRICS

In [None]:
calc_svc_metrics(boat, "Column_2")

Mean Acc: 0.77000
Standard deviation: 0.10296
Mean F1: 0.76727


In [None]:
calc_svc_metrics(cone, "Column_2")

Mean Acc: 0.84000
Standard deviation: 0.02550
Mean F1: 0.81653


In [None]:
calc_svc_metrics(data1, "Column_2")

Mean Acc: 0.95404
Standard deviation: 0.01081
Mean F1: 0.95373


In [None]:
calc_svc_metrics(data2, "Column_2")

Mean Acc: 0.93158
Standard deviation: 0.02982
Mean F1: 0.93121


In [None]:
calc_svc_metrics(data3, "Column_2")

Mean Acc: 0.91618
Standard deviation: 0.02794
Mean F1: 0.91415


#### NAIVE BAYES MODEL

In [None]:
def calc_bayes_metrics(df, target_col, *, n_runs=10, test_size=0.2):
    """Print repeated hold-out metrics for a Gaussian naive Bayes classifier.

    For each seed in ``range(n_runs)`` the data is split with
    ``train_test_split(..., random_state=seed)``, standardised with a
    ``StandardScaler`` fitted on the training split, and a ``GaussianNB``
    with default settings is trained and evaluated. The mean accuracy, the
    population standard deviation of the accuracies and the mean weighted F1
    score are printed; nothing is returned.

    Every column other than ``target_col`` is used as a feature, including
    any identifier column present in ``df``.

    Args:
        df: DataFrame holding the features and the target column.
        target_col: Name of the column to predict.
        n_runs: Number of random splits to evaluate (default 10).
        test_size: Fraction of rows held out for testing in each split
            (default 0.2).

    Raises:
        ValueError: If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    X = df.drop(target_col, axis=1)
    y = df[target_col]
    accuracies = []
    f1_scores = []

    for seed in range(n_runs):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=seed
        )

        # Fit the scaler on the training split only, then reuse it on the test split.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        bayes_classifier = GaussianNB()
        bayes_classifier.fit(X_train, y_train)
        y_pred = bayes_classifier.predict(X_test)

        accuracies.append(accuracy_score(y_test, y_pred))
        f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

    mean_acc = np.mean(accuracies)
    # Population standard deviation (ddof=0), matching the previous manual formula.
    standard_deviation = np.std(accuracies)
    mean_f1 = np.mean(f1_scores)

    print(f'Mean Acc: {mean_acc:.5f}')
    print(f'Standard deviation: {standard_deviation:.5f}')
    print(f'Mean F1: {mean_f1:.5f}')

##### CALCULATE NAIVE BAYES METRICS

In [None]:
calc_bayes_metrics(boat, "Column_2")

Mean Acc: 1.00000
Standard deviation: 0.00000
Mean F1: 1.00000


In [None]:
calc_bayes_metrics(cone, "Column_2")

Mean Acc: 0.98500
Standard deviation: 0.01458
Mean F1: 0.98499


In [None]:
calc_bayes_metrics(data1, "Column_2")

Mean Acc: 0.92632
Standard deviation: 0.01438
Mean F1: 0.92639


In [None]:
calc_bayes_metrics(data2, "Column_2")

Mean Acc: 0.98070
Standard deviation: 0.01457
Mean F1: 0.98066


In [None]:
calc_bayes_metrics(data3, "Column_2")

Mean Acc: 0.99559
Standard deviation: 0.00674
Mean F1: 0.99541


#### KNN MODEL

In [None]:
def calc_knn_metrics(df, target_col, *, n_runs=10, test_size=0.2):
    """Print repeated hold-out metrics for a k-nearest-neighbours classifier.

    For each seed in ``range(n_runs)`` the data is split with
    ``train_test_split(..., random_state=seed)``, standardised with a
    ``StandardScaler`` fitted on the training split, and a
    ``KNeighborsClassifier`` with default settings is trained and evaluated.
    The mean accuracy, the population standard deviation of the accuracies
    and the mean weighted F1 score are printed; nothing is returned.

    Every column other than ``target_col`` is used as a feature, including
    any identifier column present in ``df``.

    Args:
        df: DataFrame holding the features and the target column.
        target_col: Name of the column to predict.
        n_runs: Number of random splits to evaluate (default 10).
        test_size: Fraction of rows held out for testing in each split
            (default 0.2).

    Raises:
        ValueError: If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    X = df.drop(target_col, axis=1)
    y = df[target_col]
    accuracies = []
    f1_scores = []

    for seed in range(n_runs):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=seed
        )

        # Fit the scaler on the training split only, then reuse it on the test split.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        knn_classifier = KNeighborsClassifier()
        knn_classifier.fit(X_train, y_train)
        y_pred = knn_classifier.predict(X_test)

        accuracies.append(accuracy_score(y_test, y_pred))
        f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

    mean_acc = np.mean(accuracies)
    # Population standard deviation (ddof=0), matching the previous manual formula.
    standard_deviation = np.std(accuracies)
    mean_f1 = np.mean(f1_scores)

    print(f'Mean Acc: {mean_acc:.5f}')
    print(f'Standard deviation: {standard_deviation:.5f}')
    print(f'Mean F1: {mean_f1:.5f}')

In [None]:
calc_knn_metrics(boat, "Column_2")

Mean Acc: 0.99000
Standard deviation: 0.02000
Mean F1: 0.99013


In [None]:
calc_knn_metrics(cone, "Column_2")

Mean Acc: 0.97875
Standard deviation: 0.02240
Mean F1: 0.97905


In [None]:
calc_knn_metrics(data1, "Column_2")

Mean Acc: 0.99368
Standard deviation: 0.00409
Mean F1: 0.99369


In [None]:
calc_knn_metrics(data2, "Column_2")

Mean Acc: 0.98947
Standard deviation: 0.00859
Mean F1: 0.98947


In [None]:
calc_knn_metrics(data3, "Column_2")

Mean Acc: 0.99118
Standard deviation: 0.00720
Mean F1: 0.99097
