In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
from sklearn.tree import DecisionTreeClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
#from catboost import CatBoostClassifier

In [17]:
import os
import joblib

In [2]:
# Load the data
train_data_path = '../data/clean/train_data.csv'
test_data_path = '../data/clean/test_data.csv'
train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv(test_data_path)

In [3]:
# Assuming 'reference' is the label column and all other columns are features
label_col = 'reference'

In [4]:
# Define features and labels
X_train = train_df.drop(columns=[label_col])
y_train = train_df[label_col]
X_test = test_df.drop(columns=[label_col])
y_test = test_df[label_col]

In [5]:
X_test, X_validate, y_test, y_validate = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [12]:
# Define parameter grids for each classifier
param_grid_dt = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

param_grid_rf = {
    'n_estimators': [10, 20, 50],
    'max_depth': [3, 6],
    'min_samples_split': [2, 6],
    'min_samples_leaf': [1, 2]
}

param_grid_lr = {
    'C': [0.1, 1, 5],
    'solver': ['liblinear', 'saga']
}

In [14]:
# Initialize classifiers
classifiers = {
    'Decision Tree': DecisionTreeClassifier(),
    #'Random Forest': RandomForestClassifier(),
    #'Logistic Regression': LogisticRegression(),
}

In [20]:
# Train and evaluate each classifier with progress tracking
results = {}
for name, clf in tqdm(classifiers.items(), desc="Training classifiers"):
    print(f"Training {name}...")
    clf.fit(X_train, y_train)
    print(f"Finished training {name}. Predicting...")
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    results[name] = {
        'accuracy': accuracy,
        'report': report
    }
    print(f"Results for {name}:")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)
    print("="*60)
    # Define the model save path without the model filename
   
    model_save_path = '../../models'
    os.makedirs(model_save_path, exist_ok=True)

    # Save the model locally
    model_filename = os.path.join(model_save_path, 'asd.joblib')
    joblib.dump(clf, model_filename)

Training classifiers:   0%|          | 0/1 [00:00<?, ?it/s]

Training Decision Tree...


Training classifiers: 100%|██████████| 1/1 [00:01<00:00,  1.84s/it]

Finished training Decision Tree. Predicting...
Results for Decision Tree:
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

 113786128087       1.00      1.00      1.00         1
 185490581724       1.00      1.00      1.00         1
 250499494341       1.00      1.00      1.00         1
 280058063756       1.00      1.00      1.00         1
 286714244682       1.00      1.00      1.00         2
 328221939283       1.00      1.00      1.00         1
 537697808478       1.00      1.00      1.00         1
 743662015790       1.00      1.00      1.00         1
 826199067422       1.00      1.00      1.00         1
1049262191674       1.00      1.00      1.00         1
1306058720853       1.00      1.00      1.00         2
1336506238357       1.00      1.00      1.00         3
1533904580570       1.00      1.00      1.00         1
1580256897801       1.00      1.00      1.00         1
1770545475005       1.00      1.00      1.00         1
17833187




In [6]:
# Best hyperparameters
best_params = {
    'max_depth': 5,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'random_state': 42
}


In [7]:
# Initialize and train the DecisionTreeClassifier with the best parameters
best_dt = DecisionTreeClassifier(**best_params)
best_dt.fit(X_train, y_train)



In [10]:
    y_pred = best_dt.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
print (accuracy)
print (report)

0.024193548387096774
               precision    recall  f1-score   support

   5413450640       0.00      0.00      0.00         0
  19018368446       0.00      0.00      0.00         0
 113786128087       0.00      0.00      0.00         1
 185490581724       0.00      0.00      0.00         1
 250499494341       0.00      0.00      0.00         1
 280058063756       0.00      0.00      0.00         1
 286714244682       0.00      0.00      0.00         2
 328221939283       0.00      0.00      0.00         1
 537697808478       0.00      0.00      0.00         1
 743662015790       0.00      0.00      0.00         1
 826199067422       0.00      0.00      0.00         1
1049262191674       0.00      0.00      0.00         1
1306058720853       0.00      0.00      0.00         2
1336506238357       0.00      0.00      0.00         3
1533904580570       0.00      0.00      0.00         1
1580256897801       0.00      0.00      0.00         1
1770545475005       0.00      0.00      0.0

In [8]:
# Evaluate the model on test set
accuracy = accuracy_score(y_test, y_pred)
correct_predictions = sum(y_test == y_pred)
incorrect_predictions = sum(y_test != y_pred)

In [9]:
print(accuracy)
print(correct_predictions)
print(incorrect_predictions)

0.024193548387096774
3
121
