# The Sparks Foundation - GRIP - Data Science and Business Analytics - Oct'21

## Task 2 : Prediction Using Unsupervised Learning

##  Author : Shubham Yadav

## Dataset used : Iris Dataset,  Dataset link: https://bit.ly/3kXTdox 

#### Importing required Libraries

In [1]:
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported below, so API removals will only surface as hard errors.
import warnings
warnings.filterwarnings("ignore")

# Data handling.
import pandas as pd 
import numpy as np

# Plotting.
import matplotlib.pyplot as plt
import seaborn as sns

print("All libraries imported successfully")

#### Gathering data from the CSV file

In [3]:
# Load the dataset from the CSV file into a DataFrame.
data = pd.read_csv("../input/iris-dataset/Iris.csv")
# Preview the first 10 rows.
data.head(10)

In [5]:
# (number of rows, number of columns) of the loaded frame.
data.shape

In [7]:
# Print the column names, non-null counts and dtypes.
data.info()

#### Let's drop the column named Id

In [8]:
# Remove the 'Id' column from the frame in place
# (presumably a plain row identifier -- confirm against the CSV).
data.drop(columns=['Id'], inplace=True)
print("column deleted successfully")

In [9]:
# List the column labels that remain in the frame.
data.columns

#### Check for nulls & duplicates

In [10]:
# Per-column count of missing values, followed by the number of rows that
# are exact duplicates of an earlier row.
null_counts = data.isnull().sum()
duplicate_rows = data.duplicated().sum()
print(null_counts, '\n\nNumber of duplicate rows: ', duplicate_rows)

#### Drop Duplicate Rows

In [11]:
# Drop repeated rows in place (the first occurrence of each is kept),
# then show how many rows are left.
data.drop_duplicates(inplace=True)
len(data.index)

## 147 rows remain after dropping duplicates; there were 150 rows before.

#### Check for any outliers in the numeric data

In [12]:
# Draw one horizontal box plot per float64 column to spot outliers.
# The Series is passed as `x=` explicitly: given positionally, it is bound to
# `x` by some seaborn releases and to `data` by others, which changes the
# orientation of the plot (or only emits a deprecation warning).
for column in data.select_dtypes(include='float64').columns:
    plt.figure(figsize=(6, 3))
    sns.boxplot(x=data[column])
    plt.show()

#### Treating outliers present in the SepalWidthCm column

In [13]:
# Cap SepalWidthCm at the Tukey fences: values beyond 1.5 * IQR from the
# quartiles are replaced by the nearest fence, everything else is kept.
q1, q3 = np.percentile(data['SepalWidthCm'], [25, 75])
iqr = q3 - q1
lower_fence = q1 - (1.5 * iqr)
upper_fence = q3 + (1.5 * iqr)
data['SepalWidthCm'] = data['SepalWidthCm'].clip(lower=lower_fence, upper=upper_fence)

In [14]:
# Re-draw the SepalWidthCm box plot to check the column after capping.
# `x=` is explicit because a positional Series is interpreted differently
# across seaborn releases. The trailing ';' hides the Axes repr in a notebook.
sns.boxplot(x=data['SepalWidthCm']);

#### Understanding the data

In [15]:
# Print the number of rows per species and show the same counts as bars.
print(data.Species.value_counts())
# `x=` is explicit: a positional Series is deprecated / re-bound to `data`
# in newer seaborn releases. The trailing ';' hides the Axes repr.
sns.countplot(x=data.Species);

In [16]:
# Summary statistics for the frame's columns.
data.describe()

In [17]:
# Distinct values found in the Species column.
data.Species.unique()

#### Distributions of features by Species

In [18]:
# For every column except the last, overlay one filled density curve per
# species so the class separation of that feature is visible.
# NOTE(review): `columns[:-1]` assumes Species is the last column when this
# cell runs -- confirm if columns are added before it.
for feature in data.columns[:-1]:
    for species in ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica'):
        # `fill=` replaces the deprecated `shade=` keyword (seaborn >= 0.11).
        sns.kdeplot(x=data.loc[data.Species == species, feature],
                    label=species, fill=True)

    plt.title(feature)
    # The curves carry labels, but they only show up once a legend is drawn.
    plt.legend()
    plt.show()

#### Correlation Matrix

In [19]:
# Pairwise correlation of the numeric columns.
# The frame still contains the string-valued Species column; pandas >= 2.0
# raises on it instead of silently skipping it, so restrict to numbers first.
data.select_dtypes(include='number').corr()

In [20]:
# Heatmap of the absolute correlations between the numeric columns.
# Non-numeric columns (Species) are excluded explicitly because
# DataFrame.corr() raises on them in pandas >= 2.0.
plt.figure(figsize=(10, 5))
sns.heatmap(data.select_dtypes(include='number').corr().abs(), cmap='GnBu', annot=True);

In [21]:
from sklearn.cluster import KMeans

In [22]:
# Fit KMeans for k = 1..9 on the first four columns and record each model's
# inertia (within-cluster sum of squares) for the elbow plot.
# - `n_jobs` is no longer passed: it was removed from KMeans in
#   scikit-learn 1.0 and raises TypeError there.
# - `n_init=10` pins the number of restarts (the library default has changed
#   between releases); `random_state` makes the curve reproducible.
SSE = []
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(data.iloc[:, [0, 1, 2, 3]])
    SSE.append(kmeans.inertia_)

In [23]:
# Tabulate the recorded inertia against the number of clusters and draw the
# elbow curve.
cluster_counts = range(1, 10)
df = pd.DataFrame({'Cluster': cluster_counts, 'SSE': SSE})

plt.figure(figsize=(12, 6))
plt.plot(df.Cluster, df.SSE, marker='o')
plt.title("'ELBOW METHOD TO DETERMINE OPTIMAL VALUE OF 'K'\n");
plt.xlabel('Number of clusters')
plt.ylabel('Inertia');

In [24]:
# Fit the final 3-cluster model on the first four columns and show the
# learned centroids.
# `n_jobs` was removed from KMeans in scikit-learn 1.0 (TypeError), so it is
# not passed. `n_init` and `random_state` are fixed so that the cluster
# numbering is the same on every run.
kmeans = KMeans(n_clusters=3, init='k-means++', n_init=10, random_state=42)
kmeans.fit(data.iloc[:, [0, 1, 2, 3]])
kmeans.cluster_centers_

In [25]:

# Store the fitted model's cluster label for every row in a new column.
data['cluster'] = kmeans.labels_

# Show the frame including the new 'cluster' column.
data

In [26]:
# Show cluster sizes next to the true species counts for comparison.
display(data['cluster'].value_counts(), data['Species'].value_counts())

In [27]:
# Scatter the rows on sepal length vs. sepal width, coloured by assigned
# cluster, and overlay the cluster centroids in red.
# NOTE(review): centroid columns 0 and 1 are taken to be SepalLengthCm and
# SepalWidthCm, i.e. the first two columns the model was fitted on -- confirm.
plt.figure(figsize=(10, 5))
plt.scatter(data['SepalLengthCm'], data['SepalWidthCm'], c=data.cluster)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            s=200, c='red', label='Centroids')
plt.title('Predicted Clusters\n')
plt.xlabel('SepalLengthCm')
plt.ylabel('SepalWidthCm')
# The centroid series has a label; without a legend it is never displayed.
plt.legend()
plt.show()

In [28]:
# How the Iris-setosa rows are spread over the clusters.
# Author's observation from the original run: the model identified
# Iris-setosa with 100% accuracy.
is_setosa = data['Species'] == 'Iris-setosa'
data.loc[is_setosa, 'cluster'].value_counts()

In [29]:
# How the Iris-versicolor rows are spread over the clusters.
is_versicolor = data['Species'] == 'Iris-versicolor'
data.loc[is_versicolor, 'cluster'].value_counts()

In [30]:
# How the Iris-virginica rows are spread over the clusters.
is_virginica = data['Species'] == 'Iris-virginica'
data.loc[is_virginica, 'cluster'].value_counts()

In [31]:
# Encode every species with the id of the cluster that represents it, so that
# 'Species_encoded' and 'cluster' share one label space.
# KMeans numbers its clusters arbitrarily, so a fixed species -> id table only
# lines up with the clusters by chance; the pairing is derived from the data.
from scipy.optimize import linear_sum_assignment

species_by_cluster = pd.crosstab(data['Species'], data['cluster'])
# One-to-one pairing that maximises the number of rows on matched cells
# (counts are negated because linear_sum_assignment minimises total cost).
species_pos, cluster_pos = linear_sum_assignment(-species_by_cluster.to_numpy())
species_to_cluster = dict(zip(species_by_cluster.index[species_pos],
                              species_by_cluster.columns[cluster_pos]))
data['Species_encoded'] = data['Species'].map(species_to_cluster)
data

In [32]:
from sklearn.metrics import classification_report

# Precision / recall / F1 of the cluster ids, treating the encoded species
# as the ground truth.
report = classification_report(y_true=data['Species_encoded'], y_pred=data['cluster'])
print(report)

In [33]:
from sklearn.metrics import confusion_matrix

# Rows are the encoded species, columns are the assigned clusters.
species_vs_cluster = confusion_matrix(data['Species_encoded'], data['cluster'])
sns.heatmap(species_vs_cluster, annot=True);