<a href="https://colab.research.google.com/github/Vincenzo-Miracula/Zayed-University/blob/main/Kmeans_RandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## KMeans Clustering
Cluster analysis is a technique used in data mining and machine learning to group similar objects into clusters. K-means clustering is a widely used method for cluster analysis where the aim is to partition a set of objects into K clusters in such a way that the sum of the squared distances between the objects and their assigned cluster mean is minimized.

In [2]:
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import silhouette_score
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Load only the three columns used in the clustering demo; the file is ';'-delimited.
home_data = pd.read_csv(
    'https://raw.githubusercontent.com/Vincenzo-Miracula/Zayed-University/main/californiahousing.csv',
    sep=';',
    usecols=['longitude', 'latitude', 'median_house_value'],
)
home_data.head()

In [None]:
# (rows, columns) of the loaded frame
home_data.shape

In [None]:
# Column dtypes and non-null counts
home_data.info()

In [None]:
# Missing values per column
home_data.isnull().sum()

In [None]:
# Summary statistics, transposed so that each column becomes a row
home_data.describe().transpose()

In [None]:
# Map view of the rows: position from the coordinates, colour from the house value
sns.scatterplot(
    x='longitude',
    y='latitude',
    hue='median_house_value',
    data=home_data,
    palette='flare',
)

In [None]:
# Features: the two coordinates, in (latitude, longitude) column order.
X = home_data[['latitude', 'longitude']]
# Double brackets keep the target as a one-column DataFrame rather than a Series.
y = home_data[['median_house_value']]

In [None]:
# Hold out 33% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.33,
)

In [None]:
# Learn each feature's min/max on the training split only, then apply that
# same scaling to both splits.
minmax = MinMaxScaler().fit(X_train)
X_train_minmax = minmax.transform(X_train)
X_test_minmax = minmax.transform(X_test)

In [None]:
# Candidate cluster counts to compare
k_values = range(2, 11)

# Fit one model per candidate and record its inertia
inertia_values = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X_train_minmax)
    inertia_values.append(kmeans.inertia_)

# Elbow plot: inertia against the number of clusters
plt.plot(k_values, inertia_values, marker='o')
plt.title('Elbow Method for the Choice of k')
plt.xlabel('Number of Cluster (k)')
plt.ylabel('Inertia')
plt.xticks(k_values)
plt.show()

In [None]:
k = 5  # Optimum number of clusters (presumably read off the elbow plot)
# n_init is set explicitly so this fit uses the same number of restarts as the
# elbow search; scikit-learn's default for it changed between releases, so
# omitting it makes the result depend on the installed version.
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
kmeans.fit(X_train_minmax)

In [None]:
# Pull the fitted centroids and the training-set cluster assignments
centroids, labels = kmeans.cluster_centers_, kmeans.labels_

In [None]:
# MinMaxScaler returns a plain NumPy array, so the column names have to be
# restored before seaborn can look up 'longitude' / 'latitude' by name.
# Column order matches the feature frame the scaler was fitted on.
train_scaled = pd.DataFrame(X_train_minmax, columns=['latitude', 'longitude'])
sns.scatterplot(data=train_scaled, x='longitude', y='latitude', hue=kmeans.labels_, palette='flare')

In [None]:
# Clustering quality on the training split: mean silhouette coefficient first,
# then the within-cluster sum of squares stored on the fitted model.
silhouette_avg = silhouette_score(X_train_minmax, labels=kmeans.labels_)
print("Silhouette Score:", silhouette_avg)

inertia = kmeans.inertia_
print("Inertia:", inertia)

In [None]:
# Assign every held-out row to its nearest centroid. `assign` returns a new
# frame, which avoids writing into the slice produced by train_test_split
# (pandas' SettingWithCopyWarning) and keeps the cell safe to re-run.
y_test = y_test.assign(labels=kmeans.predict(X_test_minmax))

In [None]:
# A single unseen location, given as [latitude, longitude] to match the
# column order of the feature frame
new_house = [[38.70, -120.49]]

In [None]:
# The model was fitted on min-max scaled coordinates, so the raw point has to
# go through the same fitted scaler before it is assigned to a cluster.
# Wrapping it in a DataFrame gives the scaler the column names it was fitted with.
new_house_scaled = minmax.transform(pd.DataFrame(new_house, columns=['latitude', 'longitude']))
kmeans.predict(new_house_scaled)

## Random Forest
Imagine you have a complex problem to solve, and you gather a group of experts from different fields to provide their input. Each expert gives an opinion based on their own expertise and experience. The experts then vote to arrive at a final decision.

In random forest classification, multiple decision trees are created using different random subsets of the data and features. Each decision tree is like an expert, providing its opinion on how to classify the data. A prediction is made by collecting the prediction of every decision tree and then taking the most popular result.

In [39]:
# Load the car-evaluation data set.
# NOTE(review): read_csv treats the first line of the file as a header by
# default, and the next cell overwrites the column names. If the file has no
# header row, its first record is being consumed as column names — confirm,
# and pass header=None if so.
cars = pd.read_csv('https://raw.githubusercontent.com/Vincenzo-Miracula/Zayed-University/main/car_evaluation.csv')

In [None]:
# Give the columns readable names, then report (rows, columns)
cars.columns = [
    'Buying', 'Maint', 'Doors', 'Persons',
    'LugBoot', 'Safety', 'Evaluation',
]
cars.shape

In [None]:
# Count the missing values in each column
cars.isna().sum()

In [None]:
# The previous check reported no missing values, so move on to the
# summary statistics (transposed: one row per column)
cars.describe().transpose()

In [None]:
# The summary above shows that the data holds categorical values.
# List the column names for reference.
cars.columns

In [None]:
# Number of cars in each evaluation category, ordered by category label
cars['Evaluation'].value_counts().sort_index()

In [7]:
# Map the open-ended levels ('5more', 'more') onto '5' so that every level is
# a plain number string. The result is assigned back to the frame instead of
# calling `replace(..., inplace=True)` on the attribute-selected column: that
# in-place form is chained assignment, which pandas deprecates and which does
# not modify `cars` at all under Copy-on-Write.
cars['Doors'] = cars['Doors'].replace('5more', '5')
cars['Persons'] = cars['Persons'].replace('more', '5')

In [None]:
# Show the level frequencies of every column, one after another
for column_name in cars.columns:
    print(cars[column_name].value_counts())

In [8]:
# The label column is the target; every other column is a feature
y = cars['Evaluation']
X = cars.drop(columns=['Evaluation'])

In [None]:
# Disabled alternative: declare the feature matrix and target without 'Doors'.
# NOTE(review): this snippet references `df`, which is not defined anywhere in
# the visible notebook (the frame is called `cars`), and the encoder cell
# further down still lists 'Doors' — both would need adjusting before this is
# re-enabled.
#X = df.drop(['Evaluation', 'Doors'], axis=1)
#y = df['Evaluation']

In [9]:
# Hold out 33% of the cars for evaluation; fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.33,
)

In [None]:
# (rows, columns) of the training and test feature frames
X_train.shape, X_test.shape

In [None]:
# Data type of each feature column before encoding
X_train.dtypes

In [None]:
!pip install category_encoders

In [11]:
import category_encoders as ce

In [12]:
# Integer-encode every categorical feature. The mapping is learned from the
# training split and reused unchanged on the test split.
encoder = ce.OrdinalEncoder(
    cols=['Buying', 'Maint', 'Doors', 'Persons', 'LugBoot', 'Safety'],
)
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

In [None]:
# Build a 100-tree forest with a fixed seed ...
clf = RandomForestClassifier(random_state=0, n_estimators=100)
# ... and train it on the encoded training split
clf.fit(X_train, y_train)

In [14]:
# Predict a class for every row of the encoded test split
y_pred = clf.predict(X_test)

In [None]:
# Report test accuracy. The tree count is read from the fitted model so the
# message cannot drift from the actual n_estimators (it previously said 10
# while the forest was built with 100).
print('Model accuracy score with {0:d} decision-trees : {1:0.4f}'.format(clf.n_estimators, accuracy_score(y_test, y_pred)))

In [None]:
# Rank the features by the forest's importance scores, highest first
feature_scores = pd.Series(
    data=clf.feature_importances_,
    index=X_train.columns,
).sort_values(ascending=False)
feature_scores

In [None]:
# Horizontal bars: one per feature, bar length = importance score
sns.barplot(y=feature_scores.index, x=feature_scores)
# Title and axis labels
plt.title("Visualizing Important Features")
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
# Render the figure
plt.show()

In [17]:
# One hypothetical car with six feature values.
# NOTE(review): presumably these are the integer codes produced by the ordinal
# encoder, in the feature column order (Buying, Maint, Doors, Persons, LugBoot,
# Safety) — the code-to-category mapping is not shown here; confirm against
# the fitted encoder.
new_car = [[1, 2, 3, 3, 1, 2]]

In [None]:
# The forest was fitted on a DataFrame, so hand it the new row with the same
# column names; a bare list triggers scikit-learn's "X does not have valid
# feature names" warning.
clf.predict(pd.DataFrame(new_car, columns=X_train.columns))

In [None]:
# Confusion matrix of true vs. predicted classes on the test split
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Confusion matrix\n\n', cm)

In [None]:
# Build the plot straight from the predictions so the axes carry the class
# names instead of the bare 0..n-1 indices that a raw matrix gets by default.
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)

In [None]:
# Per-class precision, recall and F1 on the test split
cr = classification_report(y_true=y_test, y_pred=y_pred)
print('Classification report\n\n', cr)

In [None]:
from sklearn.tree import export_graphviz
import graphviz

In [None]:
# Draw the first two trees of the forest, truncated to depth 5.
for i in range(2):
  # Index with the loop variable; the original always picked estimators_[0],
  # so the same tree was drawn twice.
  tree_in_forest = clf.estimators_[i]
  dot_data = export_graphviz(tree_in_forest,
                feature_names=list(X_train.columns),
                max_depth=5,
                filled=True,
                rounded=True)
  graph = graphviz.Source(dot_data)
  # One output file per tree so the second render does not overwrite the first
  graph.render(f"decision_tree_{i}")
  display(graph)