In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = ':https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F19%2F420%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240906%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240906T045310Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D1688d0edabbc77eebda93d407717dff860c4798837052b92b2ec3edee7e9fd582a1298d4cc5e00086aa0a098b6cf18e873ba04a3ef9340156a554ab6e7036c1fdba421634e3f481029f6874b3e308f3caa8e0643cd678b76e401b1fcef00237a3febb84115b1a8c8cb86c976630c5bbabee17ac90fca313418af6b9651f9965c40d632e6229f6194734520b5cbeaa9aadd08ab3b32f396488d64dd2a21ce7d9235b4a2a09dfec68608e63e13c4336278cb572a344fc6cfb30f11300d8950bf438379500c2da202757762327ba22a2c75242a8143d3837f5b28558ae533535e4ce309ad55e48b25a1eceb21aeade9b59d8059b6124ad34db7d9ce3b51d9503489'

# Locations that mirror the Kaggle directory layout inside this environment.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

# Start from a clean input directory: unmount (errors silenced), delete, recreate.
# The leading "!" is IPython shell syntax, so this cell only runs inside a notebook.
!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Expose the directories as ../input and ../working; an already existing
# link is left in place.
try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source and unpack it below KAGGLE_INPUT_PATH.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each mapping entry is "<target directory>:<percent-encoded download URL>".
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            # The Content-Length header may be absent; fall back to 0 so the
            # progress bar stays empty instead of int(None) raising TypeError.
            expected_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                # Clamp to the 50-character bar in case more bytes arrive than announced.
                done = min(50, int(50 * dl / expected_bytes)) if expected_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by name.
            tfile.flush()
            # NOTE(review): extractall trusts the member paths stored in the
            # archive; confirm the download source is trusted.
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a distinct name so the imported `tarfile` module is not shadowed.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        # HTTPError is handled before its base class OSError.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


**Loading the Iris dataset from Scikit-learn**

In [None]:
# import load_iris function from datasets module
from sklearn.datasets import load_iris

**Data as table**

A basic table is a two-dimensional grid of data, in which the rows represent individual elements of the dataset, and the columns represent quantities related to each of these elements. In general, we will refer to the rows of the matrix as samples, and the number of rows as n_samples, and to the columns of the matrix as features, and the number of columns as n_features.

**Features matrix** - This table layout makes clear that the information can be thought of as a two-dimensional numerical array or matrix,  called  the features matrix with shape [n_samples, n_features]

**Target array.**- In addition to the feature matrix X, we also generally work with a label or target array, which by convention we will usually call y. The target array is usually one dimensional, with length n_samples, and is generally contained in a NumPy array or Pandas Series.

In [None]:
# Save the "bunch" object containing the iris dataset and its attributes,
# then show the object's type
iris = load_iris()
type(iris)

In [None]:
# Print the iris feature matrix, then show its shape.
# Each row represents one flower and each column one of its length/width measurements.
print (iris.data)
iris.data.shape

**Machine Learning Terminology**

1.  Each row is  an **observation** (also known as : sample, example, instance, record)

2. Each column is a **feature** (also known as: Predictor, attribute, Independent Variable, input, regressor, Covariate)

In [None]:
# Print the names of the four features (the columns of iris.data)
print (iris.feature_names)

In [None]:
# Print the integer code representing the species of each observation
print (iris.target)

In [None]:
# Print the encoding scheme for species: 0 = setosa, 1 = versicolor, 2 = virginica
print (iris.target_names)

Each value we are predicting is the **response** (also known as: target, outcome, label, dependent variable)

**Classification** is supervised learning in which the response is categorical

**Regression** is supervised learning in which the response is ordered and continuous

**Requirements for working with data in scikit-learn**

1) Features  and response are **separate objects**

2) Features and response should be **numeric**

3)Features and response should be **NumPy arrays**

4)Features and response should have **specific shapes**

In [None]:
# Check the types of the features and response.
# The objects themselves are passed (a quoted name would always report `str`),
# and both results are printed because a cell only echoes its last expression.
print(type(iris.data))
print(type(iris.target))

In [None]:
# Check the shape of the features:
# first dimension = rows (number of observations), second dimension = columns (number of features)
iris.data.shape

In [None]:
# Check the shape of the response (single dimension matching the number of observations)
iris.target.shape

**1. Scatter Plot with Iris Dataset**

In [None]:
# Prepare an empty accumulator list (featuresAll) and pull all four
# feature columns out of the dataset.
featuresAll = list()
features = iris.data[:, list(range(4))]
features.shape

In [None]:
# Extract the values for targets.
# The response is kept one-dimensional (one entry per observation); a reshape
# whose result is not assigned would leave the array unchanged, so none is made.
targets = iris.target
targets.shape

In [None]:
# Collapse every observation into a one-element list holding the sum of its
# four feature values, and add those lists to featuresAll.
featuresAll.extend(
    [row[0] + row[1] + row[2] + row[3]] for row in features
)
print (featuresAll)


In [None]:
# Plotting the Scatter plot
import matplotlib.pyplot as plt
# The default figure size has to be configured before the figure is created,
# otherwise it only affects later figures.
plt.rcParams['figure.figsize'] = [10,8]
plt.scatter(featuresAll, targets, color='red', alpha =1.0)
plt.title('Iris Dataset scatter Plot')
plt.xlabel('Features')
plt.ylabel('Targets')
plt.show()


**1a) Scatter Plot with Iris Dataset (Relationship between Sepal Length and Sepal Width) # Method 1**

In [None]:
# Finding the relationship between Sepal Length and Sepal Width.
# featuresAll holds the x values (sepal length), targets the y values (sepal width).
featuresAll = [feature[0] for feature in features]  # Sepal length
targets = [feature[1] for feature in features]  # Sepal width

groups = ('Iris-setosa','Iris-versicolor','Iris-virginica')
colors = ('blue', 'green','red')
# NOTE(review): assumes the 150 rows are stored as three consecutive blocks of
# 50 samples, one block per species — confirm against iris.target.
data = ((featuresAll[:50], targets[:50]), (featuresAll[50:100], targets[50:100]),
        (featuresAll[100:150], targets[100:150]))

for item, color, group in zip(data,colors,groups):
    x, y = item
    # The label makes the species identifiable in the legend.
    plt.scatter(x, y, color=color, alpha=1, label=group)
plt.title('Iris Dataset scatter Plot')
plt.xlabel('sepal length')
plt.ylabel('Sepal width')
plt.legend()
plt.show()


**1b) Scatter Plot with Iris Dataset (Relationship between Petal Length and Petal Width) # Method 1**

In [None]:
# Finding the relationship between Petal Length and Petal Width.
# featuresAll holds the x values (petal length), targets the y values (petal width).
featuresAll = [feature[2] for feature in features]  # Petal length
targets = [feature[3] for feature in features]  # Petal width

groups = ('Iris-setosa','Iris-versicolor','Iris-virginica')
colors = ('blue', 'green','red')
# NOTE(review): assumes the 150 rows are stored as three consecutive blocks of
# 50 samples, one block per species — confirm against iris.target.
data = ((featuresAll[:50], targets[:50]), (featuresAll[50:100], targets[50:100]),
        (featuresAll[100:150], targets[100:150]))

for item, color, group in zip(data,colors,groups):
    x0, y0 = item
    # The label makes the species identifiable in the legend.
    plt.scatter(x0, y0, color=color, alpha=1, label=group)
plt.title('Iris Dataset scatter Plot')
plt.xlabel('Petal length')
plt.ylabel('Petal width')
plt.legend()
plt.show()

**2. K-Nearest Neighbours (KNN) Algorithm**

**sklearn.neighbors** provides functionality for unsupervised and supervised neighbors-based learning methods. **Supervised neighbors-based learning** comes in two flavors: classification for data with discrete labels, and regression for data with continuous labels. **Unsupervised nearest neighbors** is the foundation of many other learning methods, notably manifold learning and spectral clustering.

Despite its simplicity, nearest neighbors has been successful in a large number of classification and regression problems, including handwritten digits or satellite image scenes. Being a non-parametric method, it is often successful in classification situations where the decision boundary is very irregular.

In [None]:
import pandas as pd
iris = load_iris()
# Build the table in one go: one column per feature name, followed by the
# integer class label in a 'CLASS' column; then preview the first rows.
ir = pd.DataFrame(iris.data, columns=iris.feature_names)
ir = ir.assign(CLASS=iris.target)
ir.head()

The classes in **sklearn.neighbors** can handle either Numpy arrays or scipy.sparse matrices as input. For dense matrices, a large number of possible distance metrics are supported.

In [None]:
from sklearn.neighbors import NearestNeighbors
# n_neighbors=5: neighbor queries return the 5 nearest samples by default.
# The value is passed by keyword because current scikit-learn releases accept
# the constructor arguments as keywords only.
nn = NearestNeighbors(n_neighbors=5)
nn.fit(iris.data)

In [None]:
# Summary statistics for every column of the iris table
ir.describe()

In [None]:
# Build one query sample with four feature values and give it a leading
# axis so that it has shape (1, 4).
import numpy as np
test = np.asarray([5.4, 2, 2, 2.3])
test1 = test[np.newaxis, :]
test1.shape

In [None]:
# Query the fitted model for the 5 nearest neighbors of the test sample
nn.kneighbors(test1,5)

In [None]:
# Show the table rows for the listed sample indices.
# `.ix` has been removed from pandas; `.loc` performs the label-based lookup.
ir.loc[[98, 93, 57, 60, 79]]

**3. KNeighborsClassifier Algorithm**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets

n_neighbors = 15

# Only the first two features are used, so the decision regions can be drawn
# on a flat plot.
X = iris.data[:, :2]
y = iris.target

h = .02  # step size in the mesh

# Light colors fill the decision regions, bold colors mark the training points
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# The mesh [x_min, x_max]x[y_min, y_max] depends only on X, so it is built
# once and shared by both weighting schemes.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
mesh_points = np.c_[xx.ravel(), yy.ravel()]

for weights in ('uniform', 'distance'):
    # Fit one classifier per weighting scheme
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(X, y)

    # Predict a class for every mesh point and shape the result like the mesh
    Z = clf.predict(mesh_points).reshape(xx.shape)

    # Decision regions as a color plot, with the training points on top
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-Class classification (k = %i, weights = '%s')" % (n_neighbors, weights))

plt.show()

**KNN Classifiers Algorithm - How it works? - With Easy explanation**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Create a classifier that looks at the single nearest neighbor
knn = KNeighborsClassifier(n_neighbors=1)

In [None]:
# Print the classifier to inspect its configuration
print (knn)

In [None]:
import numpy as np
# Turn the featuresAll list into an array with a single column (one row per value)
X1 = np.asarray(featuresAll).reshape(-1,1)

In [None]:
# Shape of the single-column feature array
X1.shape

In [None]:
# Response vector: the integer species code of every observation
y = iris.target

y.shape

In [None]:
# Fit the classifier on the single-column features X1 and the response y
knn.fit(X1, y)

In [None]:
import numpy as np
# Predict the class of one new observation whose single feature value is 6.4
print (knn.predict([[6.4]]))

In [None]:
# Replace the classifier with one that looks at the 5 nearest neighbors
knn = KNeighborsClassifier(n_neighbors=5)

In [None]:
# Fit the 5-neighbor classifier on the same features and response
knn.fit(X1, y)

In [None]:
# Predict the class of one new observation whose single feature value is 3.4
print (knn.predict([[3.4]]))

In [None]:
# np.column_stack turns the single 1-D list into one column, so this predicts
# four separate one-feature observations — TODO confirm that is the intent
print (knn.predict(np.column_stack([[1.,6.1,3.2,4.2]])))

**Linear regression**

We will start with the most familiar linear regression, a straight-line fit to data. A straight-line fit is a model of the form
$y = ax + b$
where a is commonly known as the slope, and b is commonly known as the intercept.

We can use Scikit-Learn's LinearRegression estimator to fit this data and construct the best-fit line:

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Create the linear regression estimator with an intercept term, then display it
model = LinearRegression(fit_intercept=True)
model

In [None]:
import numpy as np
# Convert featuresAll to an array and append a second axis so the
# regression receives a column of values; then show the resulting shape.
XX = np.asarray(featuresAll)
X2 = XX[:, None]
X2.shape

In [None]:
# Response for the regression: the integer species codes
y2 = iris.target
y2.shape


In [None]:
# Fit the straight-line model to the feature column X2 and response y2
model.fit(X2, y2)

The slope and intercept of the data are contained in the model's fit parameters, which in Scikit-Learn are always marked by a trailing underscore. Here the relevant parameters are coef_ and intercept_:

In [None]:
# Slope of the fitted line
model.coef_

In [None]:
# Intercept of the fitted line
model.intercept_

In [None]:
# Inputs at which the fitted model is evaluated for plotting.
# They are evenly spaced floats in increasing order: a line plot connects the
# points in the order given, so unsorted x values would draw a zig-zag.
Xfit = np.linspace(0, 7, 150)
# The estimator expects one column per feature, hence the extra axis.
Xfit = Xfit[:, np.newaxis]
Xfit.shape

In [None]:
# Evaluate the fitted model at the plotting inputs
yfit = (model.predict(Xfit))
yfit.shape

In [None]:
# Training data as points, model predictions as a line
plt.scatter(X2, y2)
plt.plot(Xfit, yfit)

**Regression**

In statistical modeling, regression analysis is a set of statistical processes for estimating the relationships among variables. It includes many techniques for modeling and analyzing several variables, when the focus is on the relationship between a dependent variable and one or more independent variables (or 'predictors'). More specifically, regression analysis helps one understand how the typical value of the dependent variable (or 'criterion variable') changes when any one of the independent variables is varied, while the other independent variables are held fixed.

One trick you can use to adapt linear regression to nonlinear relationships between variables is to transform the data according to basis functions. We have seen one version of this before, in the PolynomialRegression pipeline used in Hyperparameters and Model Validation and Feature Engineering. The idea is to take our multidimensional linear model:
$y = a_0 + a_1 x_1 + a_2 x_2 + a_3 x_3 + \cdots$
and build the $x_1$, $x_2$, $x_3$, and so on, from our single-dimensional input $x$.

This polynomial projection is useful enough that it is built into Scikit-Learn, using the PolynomialFeatures transformer:

In [None]:
from sklearn.preprocessing import PolynomialFeatures
# Expand the single input column into its powers up to degree 3, the same
# degree the pipeline in the next cell uses. (A degree as large as the number
# of samples produces astronomically large, numerically useless columns.)
poly = PolynomialFeatures(3, include_bias=False)
poly.fit_transform(X2)

In [None]:
from sklearn.pipeline import make_pipeline
# Chain a degree-3 polynomial expansion with a linear regression, fit the
# chain on (X2, y2) and evaluate it at the plotting inputs.
poly_model = make_pipeline(PolynomialFeatures(3), LinearRegression())
yfit = poly_model.fit(X2, y2).predict(Xfit)

In [None]:
# Our linear model, through the use of 3rd-order polynomial basis functions,
# can provide a fit to this non-linear data.
# Training data as points, pipeline predictions as a line.
plt.scatter(X2, y2)
plt.plot(Xfit, yfit);

**How the length and width vary according to the species**

In [None]:
import pandas as pd
# Read the CSV copy of the dataset from the input directory and preview 5 rows
iris1 = pd.read_csv("../input/Iris.csv") #load the dataset
iris1.head(5)

**1c) Scatter Plot with Iris Dataset (Relationship between Sepal Length and Sepal Width) # Method 1**

In [None]:
# Scatter plot drawn directly from the DataFrame: sepal length against sepal width
iris1.plot(kind ='scatter', x ='SepalLengthCm', y ='SepalWidthCm')
plt.show()

**1d) Scatter Plot with Iris Dataset (Relationship between Petal Length and Petal Width) # Method 1**

In [None]:
# Scatter plot drawn directly from the DataFrame: petal length against petal width
iris1.plot(kind ='scatter', x ='PetalLengthCm', y ='PetalWidthCm')
plt.show()

**Histogram Plot of Iris Data**

In [None]:
# Histogram of every column except the row identifier.
exclude = ['Id']
# `.ix` has been removed from pandas; `.loc` does the label-based column selection.
# The size is passed to hist() itself: a plt.figure() call made after the
# histograms are drawn only opens a second, empty figure.
iris1.loc[:, iris1.columns.difference(exclude)].hist(figsize=(15,10))
plt.show()

**Violin Plot**

In [None]:
import seaborn as sns
plt.figure(figsize=(15,10))
# One violin plot per measurement, split by species, placed on a 2x2 grid
# in the order listed here.
measurements = ('PetalLengthCm', 'PetalWidthCm', 'SepalLengthCm', 'SepalWidthCm')
for position, column in enumerate(measurements, start=1):
    plt.subplot(2, 2, position)
    sns.violinplot(x='Species', y=column, data=iris1)

Now, when we train any algorithm, the number of features and their correlation play an important role. If there are many features and many of them are highly correlated, then training an algorithm with all the features can reduce the accuracy. Thus feature selection should be done carefully. This dataset has few features, but we will still look at the correlation.

**IRIS Correlation Matrix**

In [None]:
# Correlation matrix of the numeric columns.
# The species column holds strings, so it is excluded explicitly; recent
# pandas releases raise an error instead of silently skipping such columns.
corr = iris1.select_dtypes(include='number').corr()
corr

In [None]:
# Draw the correlation matrix as a heatmap to see which parameters correlate
# best with each other.
# According to the correlation matrix results, PetalLengthCm and PetalWidthCm
# have a positive correlation, which matches the scatter plot discussed above.

import seaborn as sns
import pandas as pd
# The species column holds strings, so it is excluded explicitly; recent
# pandas releases raise an error instead of silently skipping such columns.
corr = iris1.select_dtypes(include='number').corr()
plt.figure(figsize=(10,8))
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            cmap='viridis', annot=True)
plt.show()

**Supervised learning example: Iris classification**

In [None]:
# Features and labels for the supervised example; they are split into a
# training and a testing part in the next step so the model can be evaluated
# on data it has not seen.
# The four measurement columns are selected by name so that the `Id` row
# identifier is not used as a feature.
feature_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
X3 = iris1[feature_columns]
Y3 = iris1['Species']

We would like to evaluate the model on data it has not seen before, and so we will split the data into a training set and a testing set. This could be done by hand, but it is more convenient to use the **train_test_split** utility function

In [None]:
# train_test_split lives in sklearn.model_selection; the former
# sklearn.cross_validation module no longer exists in scikit-learn.
from sklearn.model_selection import train_test_split
# Hold out 40% of the rows for testing; random_state fixes the shuffle.
X3_train, X3_test, y_train, y_test = train_test_split(X3, Y3, test_size=0.4, random_state=0)
print(" X3_train",X3_train)
print("X3_test",X3_test)
print("y_train",y_train)
print("y_test",y_test)

**With the data arranged, we can follow our recipe to predict the labels:**

In [None]:
# Train a Gaussian naive Bayes classifier on the training split, then
# predict labels for the held-out test split.
from sklearn.naive_bayes import GaussianNB
model = GaussianNB().fit(X3_train, y_train)
y_model = model.predict(X3_test)
y_model

Finally, we can use the **accuracy_score** utility to see the fraction of predicted labels that match their true value:                 

In [None]:
from sklearn.metrics import accuracy_score
# Compare the predicted labels with the true labels of the test split
accuracy_score(y_test, y_model)

With an accuracy topping 96%, we see that even this very naive classification algorithm is effective for this particular dataset!

**K-Means Clustering in scikit-learn with Iris Data**

k-means clustering aims to partition n observations into k clusters in which each observation belongs to the cluster with the nearest mean, serving as a prototype of the cluster.

The k-means algorithm searches for a pre-determined number of clusters within an unlabeled multidimensional dataset. It accomplishes this using a simple conception of what the optimal clustering looks like:

The "cluster center" is the arithmetic mean of all the points belonging to the cluster.
Each point is closer to its own cluster center than to other cluster centers. Those two assumptions are the basis of the k-means model.

In [None]:
from sklearn.cluster import KMeans

In [None]:
# K-means model with 3 clusters and at most 1000 iterations
km = KMeans(n_clusters=3, max_iter =1000)

In [None]:
# Shape of X1 (note: the model in the next cell is fitted on iris.data, not on X1)
X1.shape

In [None]:
# Fit the clustering on the feature matrix only; no labels are passed
km.fit(iris.data)

In [None]:
# Cluster centers found by the fitted model
km.cluster_centers_

In [None]:
# Labels stored on the fitted model
km.labels_

In [None]:
# Store the k-means labels as a new column next to the CSV data and show the table
iris1[' K Mean predicted label'] = km.labels_
iris1

In [None]:
#First, let's generate a two-dimensional dataset containing four distinct blobs.
#To emphasize that this is an unsupervised algorithm, we will leave the labels out of the visualization.
from sklearn.datasets.samples_generator import make_blobs
X1, y_true = make_blobs(n_samples=300, centers=4,
                       cluster_std=0.60, random_state=0)
plt.scatter(X1[:, 0], X1[:, 1], s=50);

In [None]:
# The four clusters are easy to pick out by eye; k-means finds them
# automatically through the usual estimator API: fit, then predict.
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=4).fit(X1)
y_kmeans = kmeans.predict(X1)

In [None]:
# Let's visualize the results by plotting the data colored by these labels.
# We will also plot the cluster centers as determined by the k-means estimator
# (large, half-transparent black markers).
plt.scatter(X1[:, 0], X1[:, 1], c=y_kmeans, s=50, cmap='viridis')
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5);

**Unsupervised learning example: Iris dimensionality**

As an example of an unsupervised learning problem, let's take a look at reducing the dimensionality of the Iris data so as to more easily visualize it. Recall that the Iris data is four dimensional: there are four features recorded for each sample.

The task of dimensionality reduction is to ask whether there is a suitable lower-dimensional representation that retains the essential features of the data. Often dimensionality reduction is used as an aid to visualizing data: after all, it is much easier to plot data in two dimensions than in four dimensions or higher!

Principal component analysis- PCA which is a fast linear dimensionality reduction technique.

In [None]:
from sklearn.decomposition import PCA  # 1. Choose the model class
model = PCA(n_components=2)  # 2. Instantiate the model with hyperparameters (two components)

In [None]:
# Fit the PCA model to the data. Notice y is not specified!
model.fit(X3)

In [None]:
X_2D = model.transform(X3) # Transform the data to two dimensions with the fitted model
X_2D

In [None]:
X_2D.shape # Shape of the transformed data

In [None]:
# First column of the transformed data
X_2D[:, 0]

In [None]:
# Second column of the transformed data
X_2D[:, 1]

In [None]:
# Plot the two-dimensional PCA projection (X_2D), not the raw feature
# array X left over from the KNN example.
plt.scatter(X_2D[:, 0], X_2D[:, 1], alpha=0.2)

**Pivot the Data with Iris Dataset**

In [None]:
import pandas as pd
# Re-read the CSV so the pivot examples start from the unmodified table; preview 10 rows
iris1 = pd.read_csv("../input/Iris.csv") #load the dataset
iris1.head(10)

**The simplest pivot table must have a dataframe and an index. In this case, let’s use the Id as our index.**

In [None]:
# Pivot on Id with the default aggregation (mean).
# The numeric measurement columns are named explicitly: the string-valued
# Species column cannot be averaged, and recent pandas releases raise an
# error rather than silently dropping such columns.
pd.pivot_table(iris1, index=["Id"],
               values=["PetalLengthCm", "PetalWidthCm", "SepalLengthCm", "SepalWidthCm"])

**You can have multiple indexes as well. In fact, most of the pivot_table args can take multiple values via a list.**

In [None]:
# Two index levels: Id first, then Species
pd.pivot_table(iris1,index=["Id","Species"])

**This is interesting but not particularly useful. What we probably want to do is look at this by  Species and ID. It’s easy enough to do by changing the index .**

In [None]:
# Same two index levels in the opposite order: Species first, then Id
pd.pivot_table(iris1,index=["Species","Id"])

**You can see that the pivot table is smart enough to start aggregating the data and summarizing Sepal Length and Sepal Width by Species name.**

In [None]:
# Restrict the aggregated values to the two sepal columns, one row per species
pd.pivot_table(iris1,index=["Species"],values=["SepalLengthCm","SepalWidthCm"])

**The SepalLength and SepalWidth column automatically averages the data but we can do a count or a sum.**

In [None]:
# Sum instead of the default mean. The aggregation is given by name ("sum"),
# which is the form pandas recommends over passing the NumPy function.
pd.pivot_table(iris1,index=["Species"],values=["SepalLengthCm","SepalWidthCm"],aggfunc="sum")

**aggfunc can take a list of functions. Let’s try a mean using the numpy mean function and len to get a count.**

In [None]:
# Two aggregations at once: the mean (given by name, the form pandas
# recommends over passing the NumPy function) and len for a row count.
pd.pivot_table(iris1,index=["Species"],values=["SepalLengthCm","SepalWidthCm"],aggfunc=["mean",len])

In [None]:
# Spread the sums across one column per distinct petal length.
# The aggregation is given by name, the form pandas recommends over passing
# the NumPy function.
pd.pivot_table(iris1,index=["Species"],values=["SepalLengthCm","SepalWidthCm"],
               columns=["PetalLengthCm"],aggfunc=["sum"])

**The NaN’s are a bit distracting. If we want to remove them, we could use fill_value to set them to 0.**

In [None]:
# Same table, with missing combinations filled with 0 instead of NaN.
# The aggregation is given by name, the form pandas recommends over passing
# the NumPy function.
pd.pivot_table(iris1,index=["Species"],values=["SepalLengthCm","SepalWidthCm"],
               columns=["PetalLengthCm"],aggfunc=["sum"],fill_value=0)

**Add Sepal Width to the index list.**

In [None]:
# Move the measurements into the index and sum the petal length per combination.
# The aggregation is given by name, the form pandas recommends over passing
# the NumPy function.
pd.pivot_table(iris1,index=["Species","SepalLengthCm","SepalWidthCm","PetalWidthCm"],
               values=["PetalLengthCm"],aggfunc=["sum"],fill_value=0)

For this data set, this representation makes more sense. Now, what if I want to see some totals? margins=True does that for us.

In [None]:
# Sum and mean of the petal length per index combination; margins=True adds
# the totals. The aggregations are given by name, the form pandas recommends
# over passing the NumPy functions.
df = pd.pivot_table(iris1,index=["Species","SepalLengthCm","SepalWidthCm","PetalWidthCm"],
               values=["PetalLengthCm"],aggfunc=["sum","mean"],fill_value=0,margins=True)
df

Suppose, If you want to look at just one Species:

In [None]:
# Keep only the rows of the pivot table whose Species is Iris-virginica
df.query('Species == ["Iris-virginica"]')