In [2]:
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# dataset stored under that mount point can be read later in the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


To load the data manually, use the code below.

In [14]:
# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Read the wholesale-customers CSV from the mounted Drive folder.
# NOTE: the folder name 'zafari ' ends with a space; the recorded cell output
# shows the load succeeding with this exact path, so keep it as written.
csv_path = '/content/drive/MyDrive/Colab Notebooks/zafari /Wholesale customers data.csv'
df = pd.read_csv(csv_path)

# Report-identifying metadata (author name, student number, repository link)
metadata = dict([
    ("Name", "shehryar"),
    ("Student Number", "23019900"),
    ("GitHub Repository", "https://github.com/shehryar/ClusteringAndFittingReport"),
])
print(metadata)

# Preview the first rows to confirm the file was parsed as expected
print(df.head())

# Function 1: Plot Histogram
def plot_histogram(data, column, filename):
    """Write a 20-bin histogram of a single column to an image file.

    Parameters
    ----------
    data : object indexable by column label (a DataFrame in this notebook)
        Source of the values to plot.
    column : str
        Label of the column to plot; also used in the title and x-axis label.
    filename : str
        Destination passed to ``savefig``.

    Returns
    -------
    None
        The figure is saved and then closed.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.hist(data[column], bins=20, color='skyblue', edgecolor='black')
    ax.set_title(f'Histogram of {column}', fontsize=16)
    ax.set_xlabel(column, fontsize=14)
    ax.set_ylabel('Frequency', fontsize=14)
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(filename)
    # Release the figure once it has been written to disk.
    plt.close(fig)

# Function 2: Plot Scatter Plot
def plot_scatter(data, x_col, y_col, filename):
    """Write a scatter plot of one column against another to an image file.

    Parameters
    ----------
    data : object indexable by column label (a DataFrame in this notebook)
        Source of the values to plot.
    x_col : str
        Label of the column drawn on the horizontal axis.
    y_col : str
        Label of the column drawn on the vertical axis.
    filename : str
        Destination passed to ``savefig``.

    Returns
    -------
    None
        The figure is saved and then closed.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    xs, ys = data[x_col], data[y_col]
    ax.scatter(xs, ys, c='blue', alpha=0.6)
    ax.set_title(f'Scatter Plot: {x_col} vs {y_col}', fontsize=16)
    ax.set_xlabel(x_col, fontsize=14)
    ax.set_ylabel(y_col, fontsize=14)
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(filename)
    # Release the figure once it has been written to disk.
    plt.close(fig)

# Function 3: Plot Heatmap
def plot_heatmap(data, filename):
    """Write an annotated correlation heatmap of the numeric columns to a file.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose numeric columns are correlated pairwise. Non-numeric
        columns are ignored.
    filename : str
        Destination passed to ``savefig``.

    Returns
    -------
    None
        The figure is saved and then closed.
    """
    # Select numeric columns explicitly: DataFrame.corr() raises on string
    # columns in pandas >= 2.0, while older versions dropped them silently.
    # Filtering here gives the same result on every version.
    corr_matrix = data.select_dtypes(include='number').corr()

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
    ax.set_title('Correlation Heatmap', fontsize=16)
    fig.tight_layout()
    fig.savefig(filename)
    # Release the figure once it has been written to disk.
    plt.close(fig)

# Function 4: Plot Elbow Plot for K-Means
def plot_elbow(data, max_k, filename, random_state=42, n_init=10):
    """Fit K-Means for k = 1..max_k and save the inertia-vs-k elbow plot.

    Parameters
    ----------
    data : array-like or DataFrame of numeric features
        Passed unchanged to ``KMeans.fit`` for every k.
    max_k : int
        Largest number of clusters to try (inclusive).
    filename : str
        Destination passed to ``savefig``.
    random_state : int, optional
        Seed for the K-Means initialisation (default 42, as before).
    n_init : int, optional
        Number of K-Means restarts per k. Set explicitly because
        scikit-learn's default changed from 10 to ``'auto'``; leaving it
        unset makes the curve depend on the installed version and triggers
        a FutureWarning on the versions in between.

    Returns
    -------
    None
        The figure is saved and then closed.
    """
    k_values = list(range(1, max_k + 1))
    inertia = [
        KMeans(n_clusters=k, n_init=n_init, random_state=random_state)
        .fit(data)
        .inertia_
        for k in k_values
    ]

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(k_values, inertia, marker='o', linestyle='--', color='b')
    ax.set_title('Elbow Plot for Optimal K', fontsize=16)
    ax.set_xlabel('Number of Clusters (K)', fontsize=14)
    ax.set_ylabel('Inertia', fontsize=14)
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(filename)
    # Release the figure once it has been written to disk.
    plt.close(fig)

# Function 5: Plot Violin Plot
def plot_violin(data, column, filename):
    """Write a horizontal violin plot of a single column to an image file.

    Parameters
    ----------
    data : object indexable by column label (a DataFrame in this notebook)
        Source of the values to plot.
    column : str
        Label of the column to plot; also used in the title and x-axis label.
    filename : str
        Destination passed to ``savefig``.

    Returns
    -------
    None
        The figure is saved and then closed.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    values = data[column]
    sns.violinplot(x=values, color='skyblue', ax=ax)
    ax.set_title(f'Violin Plot of {column}', fontsize=16)
    ax.set_xlabel(column, fontsize=14)
    fig.tight_layout()
    fig.savefig(filename)
    # Release the figure once it has been written to disk.
    plt.close(fig)

# Generate Plots
plot_histogram(df, 'Fresh', 'histogram_fresh.png')
plot_scatter(df, 'Fresh', 'Grocery', 'scatter_fresh_grocery.png')
plot_heatmap(df, 'heatmap_correlation.png')

# Build the clustering features from the numeric columns, excluding
# 'Channel' and 'Region' (the first two columns in the data preview;
# presumably categorical codes rather than quantities -- confirm against the
# dataset description). They are dropped by NAME rather than by position so
# the selection does not depend on column order. 'Cluster' is excluded as
# well so that re-running this code on a frame that already carries cluster
# labels never feeds those labels back into the clustering.
numerical_data = (
    df.select_dtypes(include=[np.number])
      .drop(columns=['Channel', 'Region', 'Cluster'], errors='ignore')
)
plot_elbow(numerical_data, max_k=10, filename='elbow_plot.png')

# K-Means Clustering
# n_init is pinned because scikit-learn's default changed from 10 to 'auto';
# an explicit value keeps the labels reproducible across versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(numerical_data)

# Line Fitting: ordinary least squares of 'Grocery' on 'Fresh',
# trained on 80% of the rows and held out on the remaining 20%.
X = df[['Fresh']]
y = df['Grocery']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
regressor = LinearRegression()
regressor.fit(X_train, y_train)
# Held-out predictions; not used further in this cell but kept available for
# any later evaluation or plotting.
y_pred = regressor.predict(X_test)

# Print regression results
print("Regression Coefficient:", regressor.coef_)
print("Regression Intercept:", regressor.intercept_)

# Generate Violin Plot
plot_violin(df, 'Fresh', 'violin_fresh.png')

{'Name': 'shehryar', 'Student Number': '23019900', 'GitHub Repository': 'https://github.com/shehryar/ClusteringAndFittingReport'}
   Channel  Region  Fresh  Milk  Grocery  Frozen  Detergents_Paper  Delicassen
0        2       3  12669  9656     7561     214              2674        1338
1        2       3   7057  9810     9568    1762              3293        1776
2        2       3   6353  8808     7684    2405              3516        7844
3        1       3  13265  1196     4221    6404               507        1788
4        2       3  22615  5410     7198    3915              1777        5185
Regression Coefficient: [-0.00775116]
Regression Intercept: 8255.555429812948
