In [1]:
import pandas as pd
from path import Path

# Pandas Refresher

In [2]:
filePath = Path('../Resources/iris.csv')
irisDf = pd.read_csv(filePath)
irisDf.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
irisDf = irisDf.drop(['class'], axis=1)
irisDf.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [4]:
irisDf = irisDf[['sepal_length', 'petal_length', 'sepal_width', 'petal_width']]
irisDf.head()

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2


In [5]:
OutputPath = 'Resources/new_iris_data.csv'
irisDf.to_csv(OutputPath, index=False)

OSError: Cannot save file into a non-existent directory: 'Resources'

# Shopping Data Preprocessing

In [None]:
# Loading the shopping data from "shopping_data.csv"
filePath = Path('../Resources/shopping_data.csv')
shoppingDf = pd.read_csv(filePath, encoding='ISO-8859-1')
shoppingDf.head()

In [None]:
# Checking Columns
shoppingDf.columns

In [None]:
# Checking column data types
shoppingDf.dtypes

In [None]:
# Find null values
for column in shoppingDf.columns:
    print(f"Column {column} has {shoppingDf[column].isnull().sum()} null values")

In [None]:
# Dropping null rows
shoppingDf = shoppingDf.dropna()

In [None]:
# Checking for duplicates
print(f'Duplicate Entries {shoppingDf.duplicated().sum()}')

In [None]:
# Removing customer ID column
shoppingDf.drop(columns=['CustomerID'], inplace=True)
shoppingDf.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encoding Card Member column (1=Yes, 0=No)
le = LabelEncoder()

shoppingDfEncoded = shoppingDf.copy()

shoppingDfEncoded['Card Member'] = le.fit_transform(shoppingDfEncoded['Card Member'])

shoppingDfEncoded.head()

In [None]:
# Scaling down the Annual income column
shoppingDfEncoded['Annual Income'] = shoppingDfEncoded['Annual Income'] / 1000


shoppingDfEncoded.columns = shoppingDfEncoded.columns.str.replace(' ', '_')

shoppingDfEncoded.rename(columns={'Spending_Score_(1-100)': 'Spending_Score'}, inplace=True)

shoppingDfEncoded.head()

In [None]:
# Saving Cleaned Data
file_path = '../Resources/shopping_data_cleaned.csv'
shoppingDfEncoded.to_csv(file_path, index=False)

# KMeans

In [None]:
import pandas as pd
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans
# Initializing model with K = 3 (Since we already know there are three classes of iris plants)
model = KMeans(n_clusters=3,random_state=5)
model

In [None]:
# Fitting the model
model.fit(irisDf)

In [None]:
predict = model.predict(irisDf)
print(predict)

In [None]:
# Add a new class column to the df_iris
irisDf['class'] = model.labels_
irisDf.head()

In [None]:
import plotly.express as px
import hvplot.pandas

# Plotting the clusters with two features
irisDf.hvplot.scatter(x='sepal_length',y='sepal_width', by='class')

In [None]:
# plotting the clusters with three features (3D)
fig = px.scatter_3d(
    irisDf,
    x='petal_width',
    y='sepal_length',
    z='petal_length',
    color='class',
    symbol='class',
    size='sepal_width',
    width=800,
)
fig.update_layout(legend=dict(x=0,y=1))
fig.show()