**PROBLEM STATEMENT:**
Perform the following operations using Python on any open source dataset (e.g., data.csv)
1. Import all the required Python Libraries.
2. Locate an open source data from the web (e.g., https://www.kaggle.com). Provide a clear description of
the data and its source (i.e., URL of the web site).
3. Load the Dataset into pandas dataframe.
4. Data Preprocessing: check for missing values in the data using pandas isnull(), describe() function to get
some initial statistics. Provide variable descriptions. Types of variables etc. Check the dimensions of the
data frame.
5. Data Formatting and Data Normalization: Summarize the types of variables by checking the data types
(i.e., character, numeric, integer, factor, and logical) of the variables in the data set. If variables are not in
the correct data type, apply proper type conversions.
6. Turn categorical variables into quantitative variables in Python.

In [134]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

read the iris database

In [180]:
df=pd.read_csv("iris.csv")

In [136]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [137]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [138]:
df.tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


In [139]:
df.shape

(150, 5)

In [140]:
df.size

750

In [141]:
df.dtypes

sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
species          object
dtype: object

In [142]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [143]:
df.columns.values

array(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'], dtype=object)

In [144]:
df.columns.values.tolist()

['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']

In [145]:
df.iloc[:,1:2]

Unnamed: 0,sepal_width
0,3.5
1,3.0
2,3.2
3,3.1
4,3.6
...,...
145,3.0
146,2.5
147,3.0
148,3.4


In [146]:
df.iloc[:8,:]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa


In [147]:
df.iloc[1:3,1:3]

Unnamed: 0,sepal_width,petal_length
1,3.0,1.4
2,3.2,1.3


In [148]:
df.iloc[1:10,1:4]

Unnamed: 0,sepal_width,petal_length,petal_width
1,3.0,1.4,0.2
2,3.2,1.3,0.2
3,3.1,1.5,0.2
4,3.6,1.4,0.2
5,3.9,1.7,0.4
6,3.4,1.4,0.3
7,3.4,1.5,0.2
8,2.9,1.4,0.2
9,3.1,1.5,0.1


In [149]:
df.loc[:5]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa


In [150]:
df.iloc[5:6,3:5]

Unnamed: 0,petal_width,species
5,0.4,setosa


In [151]:
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [152]:
df.isnull()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
145,False,False,False,False,False
146,False,False,False,False,False
147,False,False,False,False,False
148,False,False,False,False,False


In [153]:
df.isnull().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [154]:
df.notnull()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,True,True,True,True,True
1,True,True,True,True,True
2,True,True,True,True,True
3,True,True,True,True,True
4,True,True,True,True,True
...,...,...,...,...,...
145,True,True,True,True,True
146,True,True,True,True,True
147,True,True,True,True,True
148,True,True,True,True,True


In [155]:
df.notnull().sum()

sepal_length    150
sepal_width     150
petal_length    150
petal_width     150
species         150
dtype: int64

In [156]:
df.fillna(0)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [157]:
df.mean()

  df.mean()


sepal_length    5.843333
sepal_width     3.054000
petal_length    3.758667
petal_width     1.198667
dtype: float64

In [158]:
df.mode()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.0,3.0,1.5,0.2,setosa
1,,,,,versicolor
2,,,,,virginica


In [159]:
df.median()

  df.median()


sepal_length    5.80
sepal_width     3.00
petal_length    4.35
petal_width     1.30
dtype: float64

In [160]:
df.rename(columns={"sepal_length":'sp'})

Unnamed: 0,sp,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [161]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [162]:
df['sepal_length']=df['sepal_length'].astype('int')

In [163]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5,3.5,1.4,0.2,setosa
1,4,3.0,1.4,0.2,setosa
2,4,3.2,1.3,0.2,setosa
3,4,3.1,1.5,0.2,setosa
4,5,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6,3.0,5.2,2.3,virginica
146,6,2.5,5.0,1.9,virginica
147,6,3.0,5.2,2.0,virginica
148,6,3.4,5.4,2.3,virginica


In [164]:
df.dtypes

sepal_length      int32
sepal_width     float64
petal_length    float64
petal_width     float64
species          object
dtype: object

In [165]:
df['species']=df['species'].str.upper()

In [166]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5,3.5,1.4,0.2,SETOSA
1,4,3.0,1.4,0.2,SETOSA
2,4,3.2,1.3,0.2,SETOSA
3,4,3.1,1.5,0.2,SETOSA
4,5,3.6,1.4,0.2,SETOSA
...,...,...,...,...,...
145,6,3.0,5.2,2.3,VIRGINICA
146,6,2.5,5.0,1.9,VIRGINICA
147,6,3.0,5.2,2.0,VIRGINICA
148,6,3.4,5.4,2.3,VIRGINICA


In [167]:
df.drop_duplicates(inplace=True)

In [168]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5,3.5,1.4,0.2,SETOSA
1,4,3.0,1.4,0.2,SETOSA
2,4,3.2,1.3,0.2,SETOSA
3,4,3.1,1.5,0.2,SETOSA
4,5,3.6,1.4,0.2,SETOSA
...,...,...,...,...,...
145,6,3.0,5.2,2.3,VIRGINICA
146,6,2.5,5.0,1.9,VIRGINICA
147,6,3.0,5.2,2.0,VIRGINICA
148,6,3.4,5.4,2.3,VIRGINICA


In [169]:
Scaler=MinMaxScaler()

In [170]:
df[['sepal_length','sepal_width']]=Scaler.fit_transform(df[['sepal_length','sepal_width']])

In [171]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,0.333333,0.625000,1.4,0.2,SETOSA
1,0.000000,0.416667,1.4,0.2,SETOSA
2,0.000000,0.500000,1.3,0.2,SETOSA
3,0.000000,0.458333,1.5,0.2,SETOSA
4,0.333333,0.666667,1.4,0.2,SETOSA
...,...,...,...,...,...
145,0.666667,0.416667,5.2,2.3,VIRGINICA
146,0.666667,0.208333,5.0,1.9,VIRGINICA
147,0.666667,0.416667,5.2,2.0,VIRGINICA
148,0.666667,0.583333,5.4,2.3,VIRGINICA


In [172]:
Scaler=StandardScaler()

In [173]:
df[['petal_length','petal_width']]=Scaler.fit_transform(df[['petal_length','petal_width']])

In [174]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,0.333333,0.625000,-1.384549,-1.363246,SETOSA
1,0.000000,0.416667,-1.384549,-1.363246,SETOSA
2,0.000000,0.500000,-1.441736,-1.363246,SETOSA
3,0.000000,0.458333,-1.327363,-1.363246,SETOSA
4,0.333333,0.666667,-1.384549,-1.363246,SETOSA
...,...,...,...,...,...
145,0.666667,0.416667,0.788525,1.423046,VIRGINICA
146,0.666667,0.208333,0.674152,0.892323,VIRGINICA
147,0.666667,0.416667,0.788525,1.025004,VIRGINICA
148,0.666667,0.583333,0.902897,1.423046,VIRGINICA


In [175]:
df['species'].replace(['SETOSA', 'VERSICOLOR ','VIRGINICA'],
                        [0, 1, 2])

0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: species, Length: 142, dtype: object

In [176]:
label_encoder = LabelEncoder()

df['species_label'] = label_encoder.fit_transform(df['species'])

In [177]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_label
0,0.333333,0.625000,-1.384549,-1.363246,SETOSA,0
1,0.000000,0.416667,-1.384549,-1.363246,SETOSA,0
2,0.000000,0.500000,-1.441736,-1.363246,SETOSA,0
3,0.000000,0.458333,-1.327363,-1.363246,SETOSA,0
4,0.333333,0.666667,-1.384549,-1.363246,SETOSA,0
...,...,...,...,...,...,...
145,0.666667,0.416667,0.788525,1.423046,VIRGINICA,2
146,0.666667,0.208333,0.674152,0.892323,VIRGINICA,2
147,0.666667,0.416667,0.788525,1.025004,VIRGINICA,2
148,0.666667,0.583333,0.902897,1.423046,VIRGINICA,2


In [178]:
import pandas as pd

# Sample data
data = {'species': ['SETOSA', 'VERSICOLOR ','VIRGINICA']}

# Create a DataFrame
df = pd.DataFrame(data)

# Apply One-Hot Encoding
df_encoded = pd.get_dummies(df['species'])

print(df_encoded)

   SETOSA  VERSICOLOR   VIRGINICA
0       1            0          0
1       0            1          0
2       0            0          1
