<a href="https://colab.research.google.com/github/Utkarshmishra2k2/Documentation-on-Classification-in-Machine-Learning/blob/main/Classification%20in%20Machine%20Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Code Structure**

1. Data Exploration and Visualization
    
    1.1. Loading the Dataset

    1.2. Data Cleaning

    1.3. Exploratory Data Analysis (EDA)

    1.4. Visualizations



2. Feature Engineering and Preprocessing

    2.1. Feature and Target Extraction

    2.2. Label Encoding

    2.3. Data Splitting

    2.4. Standardization



3. Model Training

    3.1. Logistic Regression:

    3.2. K-Nearest Neighbors (KNN):

    3.3. Support Vector Machine (SVM):

    3.4. Gaussian Naive Bayes:

    3.5. Bernoulli Naive Bayes:

    3.6. Decision Tree:

    3.7. Random Forest:

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.naive_bayes import GaussianNB,MultinomialNB ,BernoulliNB,ComplementNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.tree import plot_tree


In [None]:
import statsmodels.api as sm

In [None]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pandas.plotting import andrews_curves,parallel_coordinates,radviz

In [None]:
# Load the Iris dataset bundled with scikit-learn and build a DataFrame whose
# columns are the four measurements plus a "Species" column holding the
# species name looked up from each row's integer target.
iris = load_iris()
data_01 = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    Species=iris.target_names[iris.target]
)

In [None]:
# Preview ten randomly drawn rows (no seed is passed, so the rows shown can
# differ between runs).
data_01.sample(10)

In [None]:
# Show the current column labels of the DataFrame.
data_01.columns

In [None]:
# Swap the verbose "<measurement> (cm)" headers for compact CamelCase names;
# the rename is applied to data_01 in place.
column_renames = {
    'sepal length (cm)': 'SepalLengthCm',
    'sepal width (cm)': 'SepalWidthCm',
    'petal length (cm)': 'PetalLengthCm',
    'petal width (cm)': 'PetalWidthCm',
}
data_01.rename(columns=column_renames, inplace=True)

In [None]:
# Summary statistics for every column (numeric and non-numeric alike),
# transposed so that each original column becomes one row of the summary.
data_01.describe(include='all').T

In [None]:
# Print the DataFrame's structural summary (columns, dtypes, non-null counts).
data_01.info()

In [None]:
# List the distinct class labels found in the "Species" column.
data_01["Species"].unique()

In [None]:
# Correlation heatmap of the numeric features, with each cell annotated by
# its correlation coefficient rounded to two decimals.
numeric_columns = data_01.select_dtypes(include=[np.number]).columns
correlation_matrix = data_01[numeric_columns].corr()

fig = px.imshow(correlation_matrix.values,
                x=correlation_matrix.columns,
                y=correlation_matrix.index,
                color_continuous_scale='Blackbody',
                labels=dict(color='Correlation'))

# imshow lays matrix columns along x and matrix rows along y, so the value at
# [row, column] must be annotated at x=<column label>, y=<row label>.
# (Swapping the two only goes unnoticed because this matrix is symmetric.)
annotations = [
    dict(x=column_label, y=row_label,
         text=str(round(value, 2)),
         # Text colour is switched on the magnitude of the coefficient.
         font=dict(color='white' if abs(value) > 0.5 else 'black'),
         showarrow=False)
    for row_label, row_values in zip(correlation_matrix.index, correlation_matrix.values)
    for column_label, value in zip(correlation_matrix.columns, row_values)
]
fig.update_layout(annotations=annotations)

fig.update_layout(
    title='Correlation Heatmap of Features',
    xaxis_title='Features',
    yaxis_title='Features',
    width=700,
    height=600,
    font=dict(
        family="Courier New, monospace",
        size=12,
        color="RebeccaPurple"
    ),
    plot_bgcolor='rgba(0,0,0,0)'
)

# Label the colour bar at the two extremes and the midpoint of the
# correlation range.
fig.update_coloraxes(colorbar=dict(
    title='Correlation',
    tickvals=[-1, 0, 1],
    ticktext=['Negative', 'Neutral', 'Positive'],
    ticks='outside'
))

fig.show()


In [None]:
# Sepal length against sepal width, one colour per species.
sepal_axis_labels = {
    "SepalLengthCm": 'Sepal Length (cm)',
    "SepalWidthCm": 'Sepal Width (cm)',
}
fig = px.scatter(
    data_frame=data_01,
    x="SepalLengthCm",
    y="SepalWidthCm",
    color="Species",
    title='Sepal Length vs Sepal Width by Species',
    labels=sepal_axis_labels,
    hover_data=['Species'],
)

fig.show()

In [None]:
# Petal length against petal width, one colour per species.
petal_axis_labels = {
    "PetalLengthCm": 'Petal Length (cm)',
    "PetalWidthCm": 'Petal Width (cm)',
}
fig = px.scatter(
    data_frame=data_01,
    x="PetalLengthCm",
    y="PetalWidthCm",
    color="Species",
    title='Petal Length vs Petal Width by Species',
    labels=petal_axis_labels,
    hover_data=['Species'],
)

fig.show()

In [None]:
def _species_box(column, pretty_name):
    """Build a per-species box plot (all points shown) for one measurement column."""
    return px.box(
        data_01,
        x='Species',
        y=column,
        color='Species',
        title=f'{pretty_name} Distribution by Species',
        labels={'Species': 'Species', column: f'{pretty_name} (cm)'},
        points='all',
    )


# One box plot per measurement; the figures are only built here (not shown)
# and are kept under the names fig, fig2, fig3 and fig4.
fig = _species_box('PetalWidthCm', 'Petal Width')
fig2 = _species_box('PetalLengthCm', 'Petal Length')
fig3 = _species_box('SepalWidthCm', 'Sepal Width')
fig4 = _species_box('SepalLengthCm', 'Sepal Length')

In [None]:
# Arrange the four box-plot figures in a single 2x2 grid.
fig_combined = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=[
        'Petal Width Distribution',
        'Petal Length Distribution',
        'Sepal Width Distribution',
        'Sepal Length Distribution',
    ],
)

# Fill the grid row by row: position 0 -> (1, 1), 1 -> (1, 2), 2 -> (2, 1),
# 3 -> (2, 2). Every trace of each source figure is copied into its cell.
for position, fig_obj in enumerate([fig, fig2, fig3, fig4]):
    grid_row, grid_col = divmod(position, 2)
    for trace in fig_obj.data:
        fig_combined.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

fig_combined.update_layout(
    title='Distribution of Iris Features by Species',
    height=800,
    width=1000,
)

fig_combined.show()

In [None]:
# Andrews curves of the samples, grouped by the "Species" column.
plt.figure(figsize=(12, 12))
curves_ax = andrews_curves(frame=data_01, class_column="Species")
curves_ax.set_title('Andrews Curves by Species')
plt.show()

In [None]:
# Parallel-coordinates view of the samples, grouped by the "Species" column.
plt.figure(figsize=(12, 12))
coords_ax = parallel_coordinates(frame=data_01, class_column="Species")
coords_ax.set_title('Parallel Coordinates by Species')
plt.show()

In [None]:
# RadViz projection of the samples, grouped by the "Species" column.
plt.figure(figsize=(12, 12))
radviz(data_01, "Species")
# Titled like the Andrews-curves and parallel-coordinates plots so the three
# pandas plots are labelled consistently.
plt.title('RadViz by Species')
plt.show()

In [None]:
# Per-species regression plot of sepal length against sepal width.
# sns.lmplot is a figure-level function that creates its own figure (sized by
# `height` and `aspect`), so no plt.figure() call precedes it; one would only
# leave an extra, empty figure in the output.
sns.lmplot(x="SepalWidthCm", y="SepalLengthCm", hue="Species", data=data_01, height=6, aspect=1.2)
plt.title('Scatter Plot of Sepal Width vs Sepal Length')
plt.xlabel('Sepal Width (cm)')
plt.ylabel('Sepal Length (cm)')
plt.show()

In [None]:
# Per-species regression plot of petal length against petal width.
# sns.lmplot is a figure-level function that creates its own figure (sized by
# `height` and `aspect`), so no plt.figure() call precedes it; one would only
# leave an extra, empty figure in the output.
sns.lmplot(x="PetalWidthCm", y="PetalLengthCm", hue="Species", data=data_01, height=6, aspect=1.2)
# The title names the petal measurements actually plotted on the axes.
plt.title('Scatter Plot of Petal Width vs Petal Length')
plt.xlabel('Petal Width (cm)')
plt.ylabel('Petal Length (cm)')
plt.show()

In [None]:
# Positional split of the frame: the first four columns are the features,
# the fifth column is the target.
X = data_01.iloc[:, :4]
Y = data_01.iloc[:, 4]

In [None]:
# Learn the label -> integer mapping from the target, then replace the
# target with its integer codes. The fitted encoder stays available as
# `Encoder`.
Encoder = LabelEncoder().fit(Y)
Y = Encoder.transform(Y)

In [None]:
# Hold out 32% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.32,
    random_state=101,
)

In [None]:
# Standardizer with default settings; it is fitted in the next cell.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training features only, then apply that same fitted
# transform to both splits so the test set is scaled with training statistics.
X_train = scaler.fit(X_train).transform(X_train)
X_test = scaler.transform(X_test)

**Logistic Regression**

In [None]:
# Build statsmodels design matrices from copies of the scaled splits, with an
# intercept column added by sm.add_constant.
X_train_02 = sm.add_constant(X_train.copy())
X_test_02 = sm.add_constant(X_test.copy())

In [None]:
# Multinomial logit: encoded species as the response, the design matrix with
# constant as the regressors.
Logastics_01 = sm.MNLogit(endog=Y_train, exog=X_train_02)
result_01 = Logastics_01.fit()

In [None]:
# Print the statsmodels summary of the fitted multinomial logit model.
print(result_01.summary())

In [None]:
# Model predictions for the test design matrix; the argmax over axis 1 in the
# following cell turns each row into a class index.
y_prediction_probababilty = result_01.predict(exog=X_test_02)
print(y_prediction_probababilty)

In [None]:
# For each test row, pick the index of the largest predicted value as the
# predicted class.
Y_pred_01 = np.asarray(y_prediction_probababilty).argmax(axis=1)

In [None]:
# Share of test rows whose argmax class matches the true label.
accuracy_01 = accuracy_score(Y_test, Y_pred_01)
print(f'Accuracy: {accuracy_01}')

**K – Nearest Neighbour (KNN)**

In [None]:
# k-nearest-neighbours classifier that uses 5 neighbours.
knn = KNeighborsClassifier(n_neighbors=5)

In [None]:
# Fit on the standardized training features and the integer-encoded labels.
knn.fit(X_train, Y_train)

In [None]:
# Predicted class codes for the held-out test rows.
y_pred = knn.predict(X_test)

In [None]:
# Per-class precision/recall/F1 for KNN, with rows labelled by the species
# names from the loaded dataset.
knn_report = classification_report(Y_test, y_pred, target_names=iris.target_names)
print("Classification Report:")
print(knn_report)

In [None]:
# Confusion matrix of the KNN predictions against the true test labels.
knn_confusion = confusion_matrix(Y_test, y_pred)
print("Confusion Matrix:")
print(knn_confusion)

In [None]:
# Score of the fitted KNN model on the test split, printed with two decimals.
accuracy = knn.score(X_test, Y_test)
print("Accuracy: {:.2f}".format(accuracy))

**Support Vector Machine**

In [None]:
# Support vector classifier with a linear kernel. The variable is spelled
# `SMV` (sic) and is referenced under that name by the cells that follow.
SMV = SVC(kernel='linear')

In [None]:
# Fit on the standardized training features and the integer-encoded labels.
SMV.fit(X_train, Y_train)

In [None]:
# Predicted class codes for the held-out test rows.
Y_pred_02 = SMV.predict(X_test)

In [None]:
# Evaluate the linear SVM on the test split: accuracy, confusion matrix and
# per-class report (classes appear as their integer codes).
svm_accuracy = accuracy_score(Y_test, Y_pred_02)
svm_confusion = confusion_matrix(Y_test, Y_pred_02)
svm_report = classification_report(Y_test, Y_pred_02)

print("Accuracy:", svm_accuracy)
print("\nConfusion Matrix:")
print(svm_confusion)
print("\nClassification Report:")
print(svm_report)

In [None]:
# Annotated heatmap of the SVM confusion matrix (counts shown in each cell,
# colour bar hidden).
plt.figure(figsize=(8, 6))
svm_cm_axes = sns.heatmap(
    confusion_matrix(Y_test, Y_pred_02),
    annot=True,
    cmap="YlGnBu",
    fmt='g',
    cbar=False,
)
svm_cm_axes.set(
    xlabel='Predicted labels',
    ylabel='True labels',
    title='Confusion Matrix',
)
plt.show()

In [None]:
import math
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from itertools import combinations

# Decision regions of a linear SVM fitted separately on every pair of raw
# (unscaled) iris features.
# The data is loaded under cell-private names so the notebook-level `X` and
# `iris` used by the other cells are not silently rebound to raw arrays.
iris_raw = datasets.load_iris()
X_raw = iris_raw.data
y_raw = iris_raw.target
feature_names = iris_raw.feature_names

feature_pairs = list(combinations(range(X_raw.shape[1]), 2))

# Round the row count up so an odd number of pairs would still get a panel;
# squeeze=False keeps `axs` two-dimensional even for a single row.
n_cols = 2
n_rows = math.ceil(len(feature_pairs) / n_cols)
fig, axs = plt.subplots(n_rows, n_cols, figsize=(12, 20), squeeze=False)
panels = axs.ravel()

for ax, pair in zip(panels, feature_pairs):
    X_pair = X_raw[:, list(pair)]

    clf = svm.SVC(kernel='linear')
    clf.fit(X_pair, y_raw)

    # Evaluate the classifier on a 500x500 grid that extends one unit beyond
    # the observed range of each feature.
    xx, yy = np.meshgrid(np.linspace(X_pair[:, 0].min() - 1, X_pair[:, 0].max() + 1, 500),
                         np.linspace(X_pair[:, 1].min() - 1, X_pair[:, 1].max() + 1, 500))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # Filled contours show the predicted class regions; the samples are drawn
    # on top, coloured by their true class.
    ax.contourf(xx, yy, Z, cmap=plt.cm.Spectral, alpha=0.8)
    ax.scatter(X_pair[:, 0], X_pair[:, 1], c=y_raw, cmap=plt.cm.Spectral, edgecolors='k')
    ax.set_xlabel(feature_names[pair[0]])
    ax.set_ylabel(feature_names[pair[1]])
    ax.set_title(f'{feature_names[pair[0]]} vs {feature_names[pair[1]]}')

# Hide any panel left without a feature pair.
for unused_ax in panels[len(feature_pairs):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

**Gaussian Naive Bayes**

In [None]:
# Gaussian Naive Bayes classifier with default settings.
gnb_classifier = GaussianNB()

In [None]:
# Fit on the standardized training features and the integer-encoded labels.
gnb_classifier.fit(X_train, Y_train)

In [None]:
# Predicted class codes for the held-out test rows.
Y_pred_03 = gnb_classifier.predict(X_test)

In [None]:
# Share of test rows that Gaussian NB classified correctly.
gnb_accuracy = metrics.accuracy_score(Y_test, Y_pred_03)
print("Accuracy:", gnb_accuracy)

In [None]:
# Confusion matrix of the Gaussian NB predictions against the true labels.
confusion_matrix_03 = metrics.confusion_matrix(Y_test, Y_pred_03)
print("Confusion Matrix:")
# Print the computed matrix; the bare name `confusion_matrix` is the imported
# scikit-learn function, so printing it would only show the function object.
print(confusion_matrix_03)

In [None]:
# Per-class precision/recall/F1 for Gaussian NB (classes appear as their
# integer codes).
class_report = metrics.classification_report(Y_test, Y_pred_03)
print("Classification Report:", class_report, sep="\n")

In [None]:
# Annotated heatmap of the Gaussian NB confusion matrix (counts shown in each
# cell, colour bar hidden).
plt.figure(figsize=(12, 12))
gnb_cm_axes = sns.heatmap(
    confusion_matrix_03,
    annot=True,
    cmap="YlGnBu",
    fmt='g',
    cbar=False,
)
gnb_cm_axes.set(
    xlabel='Predicted labels',
    ylabel='True labels',
    title='Confusion Matrix',
)
plt.show()

**Bernoulli Naive Bayes**

In [None]:
# Bernoulli Naive Bayes classifier with default settings.
# NOTE(review): the features fed to it below are standardized continuous
# values, while BernoulliNB models binary features (presumably thresholding
# inputs via its `binarize` parameter) - confirm this is the intended setup.
bnb_classifier = BernoulliNB()

In [None]:
# Fit on the standardized training features and the integer-encoded labels.
bnb_classifier.fit(X_train, Y_train)

In [None]:
# Predicted class codes for the held-out test rows.
Y_pred_07 = bnb_classifier.predict(X_test)

In [None]:
# Share of test rows that Bernoulli NB classified correctly.
bnb_accuracy = metrics.accuracy_score(Y_test, Y_pred_07)
print("Accuracy:", bnb_accuracy)

In [None]:
# Confusion matrix of the Bernoulli NB predictions against the true labels.
confusion_matrix_07 = metrics.confusion_matrix(Y_test, Y_pred_07)
print("Confusion Matrix:")
# Print the computed matrix; the bare name `confusion_matrix` is the imported
# scikit-learn function, so printing it would only show the function object.
print(confusion_matrix_07)

In [None]:
# Per-class precision/recall/F1 for Bernoulli NB (classes appear as their
# integer codes).
class_report = metrics.classification_report(Y_test, Y_pred_07)
print("Classification Report:", class_report, sep="\n")

In [None]:
# Annotated heatmap of the Bernoulli NB confusion matrix (counts shown in
# each cell, colour bar hidden).
plt.figure(figsize=(12, 12))
bnb_cm_axes = sns.heatmap(
    confusion_matrix_07,
    annot=True,
    cmap="YlGnBu",
    fmt='g',
    cbar=False,
)
bnb_cm_axes.set(
    xlabel='Predicted labels',
    ylabel='True labels',
    title='Confusion Matrix',
)
plt.show()

**Decision Tree**

In [None]:
# Decision tree classifier; the fixed random_state keeps the fitted tree
# reproducible across runs.
decision = DecisionTreeClassifier(random_state=42)

In [None]:
# Fit on the standardized training features and the integer-encoded labels.
decision.fit(X_train, Y_train)

In [None]:
# Predicted class codes for the held-out test rows.
Y_pred_04 = decision.predict(X_test)

In [None]:
# Share of test rows that the decision tree classified correctly.
tree_accuracy = metrics.accuracy_score(Y_test, Y_pred_04)
print("Accuracy:", tree_accuracy)

In [None]:
# Confusion matrix of the decision-tree predictions against the true labels.
confusion_matrix_04 = metrics.confusion_matrix(Y_test, Y_pred_04)
print("Confusion Matrix:", confusion_matrix_04, sep="\n")

In [None]:
# Per-class precision/recall/F1 for the decision tree (classes appear as
# their integer codes).
class_report = metrics.classification_report(Y_test, Y_pred_04)
print("Classification Report:", class_report, sep="\n")

In [None]:
# Annotated heatmap of the decision-tree confusion matrix (counts shown in
# each cell, colour bar hidden).
plt.figure(figsize=(12, 12))
tree_cm_axes = sns.heatmap(
    confusion_matrix_04,
    annot=True,
    cmap="YlGnBu",
    fmt='g',
    cbar=False,
)
tree_cm_axes.set(
    xlabel='Predicted labels',
    ylabel='True labels',
    title='Confusion Matrix',
)
plt.show()

In [None]:
# Draw the fitted decision tree with filled nodes; split features are
# labelled with the DataFrame's column names, given in feature-column order.
tree_feature_names = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]

plt.figure(figsize=(14, 10))
plot_tree(
    decision,
    filled=True,
    feature_names=tree_feature_names,
    class_names=True,
)
plt.title("Decision Tree Visualization", fontsize=18)
plt.show()

**Random Forest**

In [None]:
# Random forest of 100 trees; the fixed random_state keeps the fitted forest
# reproducible across runs.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# Fit on the standardized training features and the integer-encoded labels.
rf_classifier.fit(X_train, Y_train)

In [None]:
# Visualise one member tree of the fitted forest (the first one, index 0).
tree_index = 0
chosen_tree = rf_classifier.estimators_[tree_index]
forest_feature_names = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]

plt.figure(figsize=(16, 10))
plot_tree(
    chosen_tree,
    filled=True,
    feature_names=forest_feature_names,
    class_names=True,
    fontsize=10,
)
plt.title(f"Decision Tree {tree_index} from Random Forest", fontsize=14)
plt.show()

In [None]:
# Predicted class codes for the held-out test rows.
y_pred_05 = rf_classifier.predict(X_test)

In [None]:
# Share of test rows that the random forest classified correctly.
rf_accuracy = metrics.accuracy_score(Y_test, y_pred_05)
print("Accuracy:", rf_accuracy)

In [None]:
# Confusion matrix of the random-forest predictions against the true labels.
confusion_matrix_05 = metrics.confusion_matrix(Y_test, y_pred_05)
print("Confusion Matrix:", confusion_matrix_05, sep="\n")

In [None]:
# Per-class precision/recall/F1 for the random forest (classes appear as
# their integer codes).
class_report = metrics.classification_report(Y_test, y_pred_05)
print("Classification Report:", class_report, sep="\n")

In [None]:
# Annotated heatmap of the random-forest confusion matrix (counts shown in
# each cell, colour bar hidden).
plt.figure(figsize=(12, 12))
rf_cm_axes = sns.heatmap(
    confusion_matrix_05,
    annot=True,
    cmap="YlGnBu",
    fmt='g',
    cbar=False,
)
rf_cm_axes.set(
    xlabel='Predicted labels',
    ylabel='True labels',
    title='Confusion Matrix',
)
plt.show()

In [None]:
# Importance score of each feature from the fitted forest, the feature names
# in column order, and the positions that sort the scores from highest to
# lowest.
features = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]
feature_importances = rf_classifier.feature_importances_
ascending_order = np.argsort(feature_importances)
indices = ascending_order[::-1]

In [None]:
# Horizontal bar chart of the feature importances, most important first.
ranked_scores = feature_importances[indices]
ranked_names = np.array(features)[indices]

plt.figure(figsize=(10, 6))
sns.barplot(x=ranked_scores, y=ranked_names, palette='viridis')
plt.title('Feature Importances', fontsize=16)
plt.xlabel('Feature Importance Scores', fontsize=14)
plt.ylabel('Features', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
# Marker printed when the notebook has run through to its last cell.
print("Session End")