In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the iris dataset from the project-relative data folder and preview it.
df = pd.read_csv(filepath_or_buffer='data/iris.csv')
df.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# Class balance check: number of rows per species label.
df.loc[:, ['species']].value_counts()

species        
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64

In [4]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides library warnings that may point at real problems.
warnings.filterwarnings(action='ignore')

In [5]:
# One-hot encode the `species` label into one indicator column per class.
ohencoder = OneHotEncoder()
ohe = ohencoder.fit_transform(df[['species']]).toarray()

# Reuse df's index so the column-wise concat below aligns row-for-row even if
# df does not carry a default RangeIndex.
encoded_df = pd.DataFrame(
    ohe,
    columns=ohencoder.get_feature_names_out(),
    index=df.index,
)

# Drop the raw label by name and build a new frame instead of mutating in place,
# so re-running this cell always gives the same result.
df_ohe = pd.concat([df.drop(columns=['species']), encoded_df], axis=1)
df_ohe.head()


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species_Iris-setosa,species_Iris-versicolor,species_Iris-virginica
0,5.1,3.5,1.4,0.2,1.0,0.0,0.0
1,4.9,3.0,1.4,0.2,1.0,0.0,0.0
2,4.7,3.2,1.3,0.2,1.0,0.0,0.0
3,4.6,3.1,1.5,0.2,1.0,0.0,0.0
4,5.0,3.6,1.4,0.2,1.0,0.0,0.0


In [6]:
# Integer-encode the `species` label into a single `species_types` column.
lencode = LabelEncoder()

# LabelEncoder works on a 1-D label vector, so pass the Series (df['species'])
# rather than a one-column DataFrame (df[['species']]).
le = lencode.fit_transform(df['species'])

# Reuse df's index so the column-wise concat below aligns row-for-row.
label_encoded_df = pd.DataFrame(le, columns=['species_types'], index=df.index)

# Drop the raw label by name and build a new frame instead of mutating in place.
# The encoded target ends up as the last column, which the later
# positional feature/target split relies on.
df_le = pd.concat([df.drop(columns=['species']), label_encoded_df], axis=1)
df_le

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species_types
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [7]:
# Positional split: every column except the last is a feature, the last column is the target.
x, y = df_le.iloc[:, :-1], df_le.iloc[:, -1]

In [8]:
# Preview the feature matrix.
x.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [9]:
# Preview the encoded target.
y.head(n=5)

0    0
1    0
2    0
3    0
4    0
Name: species_types, dtype: int32

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)


In [11]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [12]:
# Numeric preprocessing: fill missing values with the column median, then standardize.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Apply the numeric pipeline to every column of the feature frame `x`.
preprocessor = ColumnTransformer(
    transformers=[('num_pipeline', num_pipeline, x.columns)],
)

In [13]:
# Fit the preprocessing on the training split only, then reuse the fitted
# transformer on the test split.
x_train_preprocessed = preprocessor.fit_transform(X=x_train)
x_test_preprocessed = preprocessor.transform(X=x_test)

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [15]:
# Sanity check: (rows, features) of the preprocessed train and test matrices.
(x_train_preprocessed.shape, x_test_preprocessed.shape)

((105, 4), (45, 4))

In [16]:
# Hyper-parameter search space sampled by the randomized search.
parameters = dict(
    n_estimators=list(range(10, 51, 10)),
    criterion=['gini', 'entropy'],
    min_samples_leaf=[1, 2, 4],
)

In [17]:
# Randomized hyper-parameter search over a random forest:
# 5 sampled candidates, each scored with 5-fold cross-validation, using all cores.
# Both the forest and the candidate sampling are seeded so that a
# Restart & Run All reproduces the same best_params_ and predictions.
rcv_model = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    parameters,
    n_iter=5,
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

In [18]:
# Run the search on the preprocessed training data and show the winning combination.
rcv_model.fit(X=x_train_preprocessed, y=y_train)
rcv_model.best_params_

Fitting 5 folds for each of 5 candidates, totalling 25 fits


{'n_estimators': 30, 'min_samples_leaf': 4, 'criterion': 'gini'}

In [19]:
# Predict encoded class labels for the held-out test split.
y_pred = rcv_model.predict(X=x_test_preprocessed)

In [20]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [21]:
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[19,  0,  0],
       [ 0, 13,  0],
       [ 0,  0, 13]], dtype=int64)

In [22]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

1.0

In [23]:
# classification_report returns a pre-formatted multi-line string; print it so the
# table renders with real line breaks instead of an escaped one-line repr.
print(classification_report(y_test, y_pred))

'              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00        19\n           1       1.00      1.00      1.00        13\n           2       1.00      1.00      1.00        13\n\n    accuracy                           1.00        45\n   macro avg       1.00      1.00      1.00        45\nweighted avg       1.00      1.00      1.00        45\n'

In [24]:
import pickle

# Directory that receives the serialized artifacts.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# relative to the project (as the data path is) so the notebook runs elsewhere.
MODEL_DIR = 'C:/Users/Rager/Desktop/Programming/IRIS Project/models'

# Persist the fitted search object and the fitted preprocessor. The `with` blocks
# make sure each file handle is flushed and closed, even if pickling fails.
with open(f'{MODEL_DIR}/random_forest_model.pkl', 'wb') as model_file:
    pickle.dump(rcv_model, model_file)

with open(f'{MODEL_DIR}/preprocessor.pkl', 'wb') as preprocessor_file:
    pickle.dump(preprocessor, preprocessor_file)

In [25]:
# Display the predicted encoded labels for the test split.
y_pred

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0,
       0])

In [26]:
# Display the true encoded labels for the test split (indexed by original row number).
y_test

73     1
18     0
118    2
78     1
76     1
31     0
64     1
141    2
68     1
82     1
110    2
12     0
36     0
9      0
19     0
56     1
104    2
69     1
55     1
132    2
29     0
127    2
26     0
128    2
131    2
145    2
108    2
143    2
45     0
30     0
22     0
15     0
65     1
11     0
42     0
146    2
51     1
27     0
4      0
32     0
142    2
85     1
86     1
16     0
10     0
Name: species_types, dtype: int32

In [27]:
# Per-class predicted probabilities for each test row (one column per encoded class).
rcv_model.predict_proba(X=x_test_preprocessed)

array([[0.        , 0.95361953, 0.04638047],
       [0.93518519, 0.06481481, 0.        ],
       [0.        , 0.        , 1.        ],
       [0.        , 0.90534392, 0.09465608],
       [0.        , 0.73217773, 0.26782227],
       [0.99444444, 0.00555556, 0.        ],
       [0.00740741, 0.99259259, 0.        ],
       [0.        , 0.03596561, 0.96403439],
       [0.        , 0.86605099, 0.13394901],
       [0.00740741, 0.97744108, 0.01515152],
       [0.        , 0.03596561, 0.96403439],
       [1.        , 0.        , 0.        ],
       [0.93518519, 0.06481481, 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.02777778, 0.79826239, 0.17395984],
       [0.        , 0.        , 1.        ],
       [0.00740741, 0.98653199, 0.00606061],
       [0.        , 0.96018519, 0.03981481],
       [0.        , 0.        , 1.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 0.2033646 , 0.7966354 ],
       [1.

In [36]:
# Predict the class of a single raw measurement.
# The model was trained on preprocessed (imputed + scaled) features, so a raw
# sample has to go through the same fitted `preprocessor` first. The preprocessor
# selects columns by name, so the sample is wrapped in a DataFrame that carries
# the training column names.
# NOTE(review): the recorded ValueError (dtype size mismatch) may instead come
# from a model object restored from a pickle under a different library
# version - confirm that `rcv_model` here is the one fitted in this session.
sample = pd.DataFrame([[5.1, 3.5, 1.4, 0.2]], columns=x.columns)
result = rcv_model.predict(preprocessor.transform(sample))


ValueError: When changing to a smaller dtype, its size must be a divisor of the size of original dtype