In [15]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder


In [3]:
df = pd.read_csv('iris.csv')

In [4]:
# Inspect first few rows of the dataset
print(df.head())

   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa


In [5]:
# Check for missing values
print(df.isnull().sum())

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64


In [6]:
# Get initial statistics
print(df.describe())


               Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
count  150.000000     150.000000    150.000000     150.000000    150.000000
mean    75.500000       5.843333      3.054000       3.758667      1.198667
std     43.445368       0.828066      0.433594       1.764420      0.763161
min      1.000000       4.300000      2.000000       1.000000      0.100000
25%     38.250000       5.100000      2.800000       1.600000      0.300000
50%     75.500000       5.800000      3.000000       4.350000      1.300000
75%    112.750000       6.400000      3.300000       5.100000      1.800000
max    150.000000       7.900000      4.400000       6.900000      2.500000


In [7]:
# Check dimensions of the dataframe
print(df.shape)  # This will give you the number of rows and columns

(150, 6)


In [10]:
# Step 5: Data Formatting and Data Normalization
# Check data types of variables
print("Variable Types:")
print(df.dtypes)

Variable Types:
Id                 int64
SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species           object
dtype: object


In [13]:
# Data Type Conversion
# Assuming 'species' is a categorical variable
# Convert 'species' from object to category data type
df['Species'] = df['Species'].astype('category')

In [16]:
# Step 6: Turning Categorical Variables into Quantitative Variables
# Using Label Encoding for 'species' column
label_encoder = LabelEncoder()
df['species_code'] = label_encoder.fit_transform(df['species'])


In [17]:
print(df.head())

   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species  \
0   1            5.1           3.5            1.4           0.2  Iris-setosa   
1   2            4.9           3.0            1.4           0.2  Iris-setosa   
2   3            4.7           3.2            1.3           0.2  Iris-setosa   
3   4            4.6           3.1            1.5           0.2  Iris-setosa   
4   5            5.0           3.6            1.4           0.2  Iris-setosa   

       species  species_code  
0  Iris-setosa             0  
1  Iris-setosa             0  
2  Iris-setosa             0  
3  Iris-setosa             0  
4  Iris-setosa             0  
