### Import required libraries

In [None]:
import pickle
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from PyQt5.QtWidgets import QFileDialog, QApplication
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder


# Qt allows only one QApplication per process. Reuse the existing instance so
# that re-running this cell in the same kernel does not construct a second one.
app = QApplication.instance()
if app is None:
    app = QApplication([])

### File Paths and Hyperparameters

In [None]:
# Specify training and test file paths
train_file = QFileDialog.getOpenFileName(caption="Select Training File", filter="CSV files (*.csv)")[0]
# getOpenFileName returns an empty path when the dialog is cancelled; stop here
# with a clear message instead of failing later inside pd.read_csv.
if not train_file:
    raise FileNotFoundError("No training file selected")

test_file = QFileDialog.getOpenFileName(caption="Select Test File", filter="CSV files (*.csv)")[0]
if not test_file:
    raise FileNotFoundError("No test file selected")

# Output locations for the pickled model and the test-set predictions
model_file = 'cf_model.pkl'
output_file = 'output.csv'

### Import training data

In [None]:
# Load the selected training CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=train_file)

In [None]:
# Preview the first five rows of the training data
df.head(n=5)

### Preprocessing

In [None]:
# Fit the encoder on the species names and keep the resulting codes as a new
# column next to the original labels.
label_encoder = LabelEncoder()
species_codes = label_encoder.fit_transform(df['Species'])
df['Species_labels'] = species_codes

# Persist the fitted encoder so the same mapping can be reloaded later
with open('label_encoder_Species.pkl', 'wb') as encoder_out:
    pickle.Pickler(encoder_out).dump(label_encoder)

### Select columns for training

In [None]:
# Feature matrix: the four sepal/petal measurement columns
x_train = df.loc[:, ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
# Target vector: the encoded species labels
y_train = df.loc[:, 'Species_labels']

### Train the model

In [None]:
# Random forest hyperparameters.
# max_features uses 'sqrt' rather than 'auto': for classifiers 'auto' meant
# sqrt(n_features), and scikit-learn deprecated it in 1.1 and removed it in 1.3,
# where passing it raises an error. 'sqrt' selects the same behaviour on every
# version.
model = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=True,
    random_state=42,  # fixed seed for reproducible fits
    class_weight=None,
)
model.fit(x_train, y_train)

In [None]:
# Predict on the training features and store the result as a categorical
# column (categorical so the plots treat it as discrete classes).
# Note: these are predictions on the same rows the model was fitted on.
df['prediction'] = pd.Series(model.predict(x_train), index=df.index).astype("category")

In [None]:
# Preview the first five rows, now including the encoded label and prediction
df.head(n=5)

### Plots (training data)

In [None]:
# Reduce the training features to two principal components for plotting
pca = PCA(n_components=2).fit_transform(x_train)

# One point per row, positioned by the two components and coloured by the
# predicted class.
first_pc, second_pc = pca.T
fig = px.scatter(x=first_pc, y=second_pc, color=df['prediction'])
fig.show()

In [None]:
# Number of training rows assigned to each predicted label
anm = df['prediction'].value_counts()
fig2 = px.bar(
    x=anm.index,
    y=anm.tolist(),
    labels={'x': 'Predicted Label', 'y': 'Number of data points'},
)
fig2.show()

In [None]:
# Confusion matrix of the training predictions. Rows are true labels and
# columns are predicted labels, both in the order the labels first appear in
# y_train.
class_label = y_train.unique()
conf_mx = confusion_matrix(y_train, df['prediction'], labels=class_label)
df_cm = pd.DataFrame(data=conf_mx, index=class_label, columns=class_label)

# Annotated heatmap of the integer counts
ax = sns.heatmap(df_cm, annot=True, fmt='d', cmap="Blues")
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
plt.show()

### Predict on test dataset

In [None]:
# Load the selected test CSV into a DataFrame
df_test = pd.read_csv(filepath_or_buffer=test_file)

In [None]:
# Reload the pickled encoder so the test labels are mapped with the stored
# species-to-code mapping.
# NOTE: unpickling can execute arbitrary code; only load a pickle file that
# comes from a trusted source (here, the file written earlier in this notebook).
with open('label_encoder_Species.pkl', 'rb') as encoder_in:
    label_encoder = pickle.Unpickler(encoder_in).load()

# The test CSV must contain a 'Species' column for this step
df_test['Species_labels'] = label_encoder.transform(df_test['Species'])

In [None]:
# Feature matrix for the test set: the same four columns used for training
x_test = df_test.loc[:, ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
# Encoded true labels of the test set
y_test = df_test.loc[:, 'Species_labels']

In [None]:
# Predict on the test features and store the result as a categorical column
# (categorical so the plots treat it as discrete classes).
df_test['prediction'] = pd.Series(model.predict(x_test), index=df_test.index).astype("category")

In [None]:
# Preview the first five rows of the test data with labels and predictions
df_test.head(n=5)

### Plots (test data)

In [None]:
# Reduce the test features to two principal components for plotting.
# NOTE(review): this PCA is fitted on x_test itself, so its axes are not the
# same projection as the one used in the training-data plot.
pca_test = PCA(n_components=2).fit_transform(x_test)

# One point per row, positioned by the two components and coloured by the
# predicted class.
first_pc_test, second_pc_test = pca_test.T
fig3 = px.scatter(x=first_pc_test, y=second_pc_test, color=df_test['prediction'])
fig3.show()

In [None]:
# Number of test rows assigned to each predicted label
anm_test = df_test['prediction'].value_counts()
fig4 = px.bar(
    x=anm_test.index,
    y=anm_test.tolist(),
    labels={'x': 'Predicted Label', 'y': 'Number of data points'},
)
fig4.show()

In [None]:
# Confusion matrix of the test predictions. The label set is taken from
# y_train, so the matrix covers every class seen in training even if some are
# absent from the test set. Rows are true labels, columns are predicted labels.
class_label = y_train.unique()
conf_mx = confusion_matrix(y_test, df_test['prediction'], labels=class_label)
df_cm = pd.DataFrame(data=conf_mx, index=class_label, columns=class_label)

# Annotated heatmap of the integer counts
ax = sns.heatmap(df_cm, annot=True, fmt='d', cmap="Blues")
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
plt.show()

### Save output and model

In [None]:
# Write the test data with its encoded labels and predictions to CSV.
# The DataFrame index is written as well (to_csv defaults to index=True), so
# the file starts with an unnamed index column.
df_test.to_csv(path_or_buf=output_file)

In [None]:
# Serialize the fitted classifier to model_file with pickle
with open(model_file, 'wb') as model_out:
    pickle.Pickler(model_out).dump(model)