In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Read the Iris measurements from the CSV file into a DataFrame.
df = pd.read_csv('IRIS.csv')

# Quick look at the raw data before any cleaning: column names, the distinct
# class labels and how many rows each has, per-column NaN counts, and five
# randomly drawn rows.
print("Columns:", df.columns.to_list())
species_col = df['species']
print("Unique species:", species_col.unique())
print("Species count:\n", species_col.value_counts())
print("Missing values per column:\n", df.isna().sum())
print("Sample records:\n", df.sample(n=5))

# Handle missing values by replacing them with the column mean.
# Series.fillna returns a NEW Series and leaves the original untouched, so the
# result has to be assigned back to the column; otherwise df keeps its NaNs
# and the check below still reports missing values.
for col in ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']:
    # Series.mean skips NaN by default, so this is the mean of the observed values.
    mean_val = df[col].mean()
    df[col] = df[col].fillna(mean_val)
    print(f"Mean of '{col}': {mean_val}")

# Confirm no missing values remain in any column.
print("\nMissing values after imputation:\n", df.isnull().sum())

# Overall profile of the frame: descriptive statistics for every column
# (numeric and non-numeric alike), the dtype of each column, and the
# (rows, columns) dimensions.
description = df.describe(include='all')
print("\nDataset description:\n", description)
column_dtypes = df.dtypes
print("Data types:\n", column_dtypes)
print(f"Dataset shape: {df.shape}")

# Variable types summary: print a human-readable type label for each column.
# The table is checked top to bottom with the same `dtype == '<name>'`
# comparison for each entry; a dtype that matches none of them is reported
# as "Unknown".
_DTYPE_LABELS = (
    ('object', "Character (String)"),
    ('int64', "Integer"),
    ('float64', "Numeric"),
    ('bool', "Logical (Boolean)"),
)

print("\nSummary of Variables:")
for column_name, column_dtype in df.dtypes.items():
    label = next(
        (text for dtype_name, text in _DTYPE_LABELS if column_dtype == dtype_name),
        "Unknown",
    )
    print(f"{column_name}: {label}")

# Label encoding for the species column.
# The encoder's integer codes are shifted by one (the `+ 1` below) and stored
# in a copy of the frame, so the original df keeps its string labels.
le = LabelEncoder()
species_codes = le.fit_transform(df['species']) + 1

df_encoded = df.copy()
df_encoded['species'] = species_codes
print("\nEncoded dataset sample:\n", df_encoded.sample(n=5))


Columns: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
Unique species: ['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']
Species count:
 species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64
Missing values per column:
 sepal_length    11
sepal_width      2
petal_length    11
petal_width      0
species          0
dtype: int64
Sample records:
      sepal_length  sepal_width  petal_length  petal_width          species
90            5.5          2.6           4.4          1.2  Iris-versicolor
104           6.5          3.0           5.8          2.2   Iris-virginica
18            5.7          3.8           1.7          0.3      Iris-setosa
26            5.0          3.4           1.6          0.4      Iris-setosa
75            6.6          3.0           4.4          1.4  Iris-versicolor
Mean of 'sepal_length': 5.848201438848921
Mean of 'sepal_width': 3.048648648648649
Mean of 'petal_length': 3.7762589928057553
Mean 