In [6]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier #random forest model
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression #logistic regression model


In [7]:
# Load the Iris dataset from the working directory and take a first look.
df = pd.read_csv('Iris.csv')
print(df.head())  # first five rows
print(df.tail())  # last five rows
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the cell output.
df.info()  # shows the data types and non-null counts of each column

    Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0  1.0            5.1           3.5            1.4           0.2  Iris-setosa
1  2.0            4.9           3.0            1.4           0.2  Iris-setosa
2  3.0            4.7           3.2            NaN           0.2  Iris-setosa
3  4.0            4.6           3.1            1.5           0.2  Iris-setosa
4  5.0            5.0           3.6            1.4           0.2  Iris-setosa
        Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  \
145  146.0            6.7           3.0            5.2           2.3   
146  147.0            6.3           2.5            5.0           1.9   
147  148.0            6.5           3.0            5.2           2.0   
148  149.0            6.2           3.4            5.4           2.3   
149  150.0            5.9           3.0            5.1           1.8   

            Species  
145  Iris-virginica  
146  Iris-virginica  
147  Iris-virginica  
148  Iris-v

In [8]:
# Statistical summary (count, mean, std, min, quartiles, max) of the numeric columns.
numeric_summary = df.describe()
print(numeric_summary)

               Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
count  144.000000     143.000000    143.000000     144.000000    147.000000
mean    77.263889       5.871329      3.037762       3.810417      1.193878
std     43.280026       0.835702      0.423899       1.749474      0.770094
min      1.000000       4.300000      2.000000       1.100000      0.100000
25%     40.750000       5.100000      2.800000       1.600000      0.300000
50%     78.500000       5.800000      3.000000       4.400000      1.300000
75%    114.250000       6.400000      3.300000       5.100000      1.800000
max    150.000000       7.900000      4.400000       6.900000      2.500000


In [9]:
# Print, in order: (rows, columns), the column labels, and the dtype of each column.
for attr in ("shape", "columns", "dtypes"):
    print(getattr(df, attr))

(150, 6)
Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')
Id               float64
SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species           object
dtype: object


In [10]:
# Number of missing values per column (isna is the same method as isnull).
print(df.isna().sum())

Id               6
SepalLengthCm    7
SepalWidthCm     7
PetalLengthCm    6
PetalWidthCm     3
Species          0
dtype: int64


In [11]:
# Impute every column that has missing values with one summary statistic of
# that same column: mean for the two length columns, median for the two width
# columns, and the mode for Id.
# NOTE(review): the statistics are computed on the full dataset, i.e. before
# the train/test split performed later in the notebook -- confirm that this is
# acceptable for the evaluation being reported.
fill_values = {
    'SepalLengthCm': df['SepalLengthCm'].mean(),
    'PetalLengthCm': df['PetalLengthCm'].mean(),
    'SepalWidthCm': df['SepalWidthCm'].median(),
    'PetalWidthCm': df['PetalWidthCm'].median(),
    # mode() returns a Series (several values can tie); [0] takes its first entry.
    # NOTE(review): Id looks like a row identifier, so a mode fill has no real
    # meaning; it only clears the null count. The column is dropped before
    # modelling.
    'Id': df['Id'].mode()[0],
}

for column, value in fill_values.items():
    df[column] = df[column].fillna(value)


In [12]:
# Re-count missing values per column to confirm that imputation left none.
print(df.isna().sum())

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64


In [13]:
# Distinct class labels present in the target column.
species_labels = df["Species"].unique()
print(species_labels)

['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']


In [14]:
# Turn the string class labels into integer codes for the classifiers.
# Resulting mapping:
#   Iris-setosa → 0, Iris-versicolor → 1, Iris-virginica → 2
le = LabelEncoder()
le.fit(df["Species"])             # learn the set of classes
y = le.transform(df["Species"])   # integer-encoded target vector


In [15]:
# Remove the "Id" column so it is not used as a feature.
# errors="ignore" keeps this cell idempotent: because the result is assigned
# back to `df`, re-running the cell used to raise KeyError once "Id" was gone.
df = df.drop(columns="Id", errors="ignore")
print(df.head())

# Feature matrix: every remaining column except the target.
x = df.drop(columns="Species")

   SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0            5.1           3.5       1.400000           0.2  Iris-setosa
1            4.9           3.0       1.400000           0.2  Iris-setosa
2            4.7           3.2       3.810417           0.2  Iris-setosa
3            4.6           3.1       1.500000           0.2  Iris-setosa
4            5.0           3.6       1.400000           0.2  Iris-setosa


In [16]:
# Split features and encoded labels into training and test sets.
split_options = dict(
    test_size=0.2,     # hold out 20% for testing, train on the remaining 80%
    random_state=42,   # fixed seed so the split is reproducible
    stratify=y,        # stratify on the class labels (important for classification)
)
X_train, X_test, y_train, y_test = train_test_split(x, y, **split_options)

# Sanity check on the resulting sizes.
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)


(120, 4) (30, 4)
(120,) (30,)


In [17]:
# Random forest: 100 trees, fixed seed for reproducibility.
# fit() returns the estimator itself, so construction and training can be chained.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Random Forest Accuracy:", accuracy)

# Confusion matrix of true labels (y_test) against predictions (y_pred).
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Per-feature importance as reported by the forest, most important first.
feature_importance = pd.Series(rf.feature_importances_, index=x.columns)
feature_importance = feature_importance.sort_values(ascending=False)
print(feature_importance)

Random Forest Accuracy: 0.9666666666666667
[[10  0  0]
 [ 0  9  1]
 [ 0  0 10]]
PetalWidthCm     0.483349
PetalLengthCm    0.382708
SepalLengthCm    0.109175
SepalWidthCm     0.024769
dtype: float64


In [18]:
# Logistic regression baseline trained on the same split.
# max_iter is set generously high so the solver has room to converge;
# random_state is fixed for reproducibility.
reg = LogisticRegression(max_iter=10000, random_state=0)
reg.fit(X_train, y_train)

# Score on the held-out test split (this overwrites the earlier y_pred).
y_pred = reg.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))

Logistic Regression Accuracy: 0.9666666666666667


In [19]:
import pickle

# Persist the fitted label encoder and both trained models, one pickle file
# per object, written to the current working directory.
artifacts = {
    "label_encoder.pkl": le,
    "random_forest_model.pkl": rf,
    "logistic_regression_model.pkl": reg,
}

for filename, obj in artifacts.items():
    with open(filename, "wb") as file:
        pickle.dump(obj, file)

In [20]:
# Read the three pickled artifacts back from disk, in the order they are
# unpacked below. Only unpickle files you trust: pickle.load can execute
# arbitrary code contained in the file.
artifact_files = (
    "label_encoder.pkl",
    "random_forest_model.pkl",
    "logistic_regression_model.pkl",
)
loaded_objects = []
for filename in artifact_files:
    with open(filename, "rb") as file:
        loaded_objects.append(pickle.load(file))
le_loaded, rf_loaded, reg_loaded = loaded_objects

# Re-score both reloaded models on the test split.
y_pred_loaded = rf_loaded.predict(X_test)
rf_loaded_accuracy = accuracy_score(y_test, y_pred_loaded)
print("Loaded Model Accuracy(RFM):", rf_loaded_accuracy)

y_pred_loaded_reg = reg_loaded.predict(X_test)
reg_loaded_accuracy = accuracy_score(y_test, y_pred_loaded_reg)
print("Loaded Model Accuracy(LRM):", reg_loaded_accuracy)

Loaded Model Accuracy(RFM): 0.9666666666666667
Loaded Model Accuracy(LRM): 0.9666666666666667


In [None]:
#getting user input for predictions
sepal_length = float(input("Enter sepal length: "))
sepal_width  = float(input("Enter sepal width: "))
petal_length = float(input("Enter petal length: "))
petal_width  = float(input("Enter petal width: "))

user_input = [[
    sepal_length,
    sepal_width,
    petal_length,
    petal_width
]]

#prediction using loaded random forest model
pred_encoded = rf_loaded.predict(user_input)
pred_label = le.inverse_transform(pred_encoded)

print("\n--- User Input Summary ---")
print(f"Sepal Length : {sepal_length}")
print(f"Sepal Width  : {sepal_width}")
print(f"Petal Length : {petal_length}")
print(f"Petal Width  : {petal_width}")
print("--------------------------\n")
print("Random Forest Model Prediction:", pred_label)



--- User Input Summary ---
Sepal Length : 4.0
Sepal Width  : 4.0
Petal Length : 4.0
Petal Width  : 4.0
--------------------------

Random Forest Model Prediction: ['Iris-virginica']




In [22]:
# Column labels of the training feature matrix, in model input order.
train_columns = X_train.columns
print(train_columns)



Index(['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'], dtype='object')
