Make model

Overview with AI studio Auto-model
- Using Cleaned data to make a overview

![alt text](ImageResource/AI-overview.png)

From overview, select:
- general linear model
- Decision Tree
- Gradient Boost Trees

Generalize Linear model

In [65]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Load and encode data
original_data = pd.read_csv('CleanedData.csv')
encode = original_data.apply(LabelEncoder().fit_transform)
data = encode.sample(100)  # Random sampling

# Split features and target
X = data.drop('class', axis=1)
Y = data['class']

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.5, random_state=42)

# Define pipelines
pipeline_RFC = Pipeline([
    ('scaler', StandardScaler()),  # Scaling features
    ('classifier', RandomForestClassifier(random_state=42))
])

pipeline_DT = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

pipeline_LR = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegressionCV(cv=5, random_state=42, max_iter=1000))
])

# Train models
pipeline_RFC.fit(X_train, y_train)
pipeline_DT.fit(X_train, y_train)
pipeline_LR.fit(X_train, y_train)

# Make predictions
preds_RFC = pipeline_RFC.predict(X_test)
preds_DT = pipeline_DT.predict(X_test)
preds_LR = pipeline_LR.predict(X_test)

# Evaluate models using classification metrics
def evaluate_model(y_test, preds, model_name):
    print(f"--- {model_name} ---")
    print(f"Accuracy: {accuracy_score(y_test, preds)}")
    print(f"Precision: {precision_score(y_test, preds, average='weighted')}")
    print(f"Recall: {recall_score(y_test, preds, average='weighted')}")
    print(f"F1 Score: {f1_score(y_test, preds, average='weighted')}")
    print(classification_report(y_test, preds))

# Evaluate each model
evaluate_model(y_test, preds_DT, "Decision Tree")
evaluate_model(y_test, preds_RFC, "Random Forest")
evaluate_model(y_test, preds_LR, "Logistic Regression")


--- Decision Tree ---
Accuracy: 0.96
Precision: 0.9628571428571427
Recall: 0.96
F1 Score: 0.9598711755233496
              precision    recall  f1-score   support

           0       1.00      0.92      0.96        24
           1       0.93      1.00      0.96        26

    accuracy                           0.96        50
   macro avg       0.96      0.96      0.96        50
weighted avg       0.96      0.96      0.96        50

--- Random Forest ---
Accuracy: 0.92
Precision: 0.9229487179487179
Recall: 0.92
F1 Score: 0.92
              precision    recall  f1-score   support

           0       0.88      0.96      0.92        24
           1       0.96      0.88      0.92        26

    accuracy                           0.92        50
   macro avg       0.92      0.92      0.92        50
weighted avg       0.92      0.92      0.92        50

--- Logistic Regression ---
Accuracy: 0.88
Precision: 0.8820779220779221
Recall: 0.88
F1 Score: 0.8796135265700482
              precision    

In case of data greater than 1000, rmse and r2 will nearly to be 0.00

In [66]:
import pickle

with open('model_LR.pkl', 'wb') as f:
    pickle.dump(pipeline_LR, f)

In [81]:
# Get column names
original_data = pd.read_csv('CleanedData.csv')
headers = original_data.head()


# Get number of columns (attributes)
for hd in headers:
    print(f"Header: {hd}, [{original_data[hd].unique()}, {encode[hd].unique()}]")

Header: class, [['poisonous' 'edible'], [1 0]]
Header: cap-shape, [['convex' 'bell' 'sunken' 'flat' 'knobbed' 'conical'], [2 0 5 3 4 1]]
Header: cap-surface, [['smooth' 'scaly' 'fibrous' 'grooves'], [3 2 0 1]]
Header: cap-color, [['brown' 'yellow' 'white' 'gray' 'red' 'pink' 'buff' 'purple' 'cinnamon'
 'green'], [0 9 8 3 7 5 1 6 2 4]]
Header: bruises, [['bruises' 'no'], [0 1]]
Header: odor, [['pungent' 'almond' 'anise' 'none' 'foul' 'creosote' 'fishy' 'spicy'
 'musty'], [7 0 1 6 4 2 3 8 5]]
Header: gill-spacing, [['close' 'crowded'], [0 1]]
Header: gill-size, [['narrow' 'broad'], [1 0]]
Header: gill-color, [['black' 'brown' 'gray' 'pink' 'white' 'chocolate' 'purple' 'red' 'buff'
 'green' 'yellow' 'orange'], [ 0  1  4  7 10  3  8  9  2  5 11  6]]
Header: stalk-shape, [['enlarging' 'tapering'], [0 1]]
Header: stalk-root, [['equal' 'club' 'bulbous' 'rooted' 'missing'], [2 1 0 4 3]]
Header: stalk-surface-above-ring, [['smooth' 'fibrous' 'silky' 'scaly'], [3 0 2 1]]
Header: stalk-surface-be