In [None]:
import requests

# Location of the raw Wine data file on the UCI repository.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"

# The timeout keeps this cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of saving an HTTP error page as if it were the dataset.
response.raise_for_status()

# Write the response body unchanged (binary mode, so no newline/encoding translation).
with open("wine_dataset.csv", "wb") as f:
    f.write(response.content)

print("Dataset downloaded and saved as 'wine_dataset.csv'")



In [None]:
import pandas as pd

# Labels for the 14 columns: the class label first, then the 13 measurements.
column_names = [
    "Class", "Alcohol", "Malic Acid", "Ash", "Alcalinity of Ash",
    "Magnesium", "Total Phenols", "Flavanoids", "Nonflavanoid Phenols",
    "Proanthocyanins", "Color Intensity", "Hue",
    "OD280/OD315 of Diluted Wines", "Proline",
]

# header=None: the file is read as having no header row, so the labels above are used.
df = pd.read_csv("wine_dataset.csv", names=column_names, header=None)
print(df.head())

In [None]:
import pandas as pd

# Labels for the 14 columns: the class label first, then the 13 measurements.
column_names = [
    "Class", "Alcohol", "Malic Acid", "Ash", "Alcalinity of Ash",
    "Magnesium", "Total Phenols", "Flavanoids", "Nonflavanoid Phenols",
    "Proanthocyanins", "Color Intensity", "Hue",
    "OD280/OD315 of Diluted Wines", "Proline",
]

# Read the data directly from the UCI repository; header=None means the
# file is read as having no header row, so the labels above are used.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
df = pd.read_csv(url, names=column_names, header=None)

# Preview the top rows
print(df.head())

In [None]:
import pandas as pd

# Labels for the 14 columns: the class label first, then the 13 measurements.
column_names = [
    "Class", "Alcohol", "Malic Acid", "Ash", "Alcalinity of Ash",
    "Magnesium", "Total Phenols", "Flavanoids", "Nonflavanoid Phenols",
    "Proanthocyanins", "Color Intensity", "Hue",
    "OD280/OD315 of Diluted Wines", "Proline",
]

# Read the data directly from the UCI repository (no header row in the file).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
df = pd.read_csv(url, names=column_names, header=None)

# X: every column except "Class" (the features); y: the "Class" column (the target).
X, y = df.drop(columns="Class"), df["Class"]

# Preview both pieces
print("Features:", X.head(), sep="\n")
print("\nTarget Variable:", y.head(), sep="\n")

In [None]:

from sklearn.preprocessing import StandardScaler

# Standardize the feature matrix: learn the per-column statistics from X,
# then apply them to X. The result is a plain array, not a DataFrame.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Re-attach the column labels only for the preview below
print("Scaled Features:", pd.DataFrame(X_scaled, columns=X.columns).head(), sep="\n")

In [None]:
from sklearn.impute import SimpleImputer

# Mean-strategy imputer, fitted on X and applied to X.
# (As the original author notes, this step is not necessary for the Wine dataset.)
imputer = SimpleImputer(strategy="mean")
X_imputed = imputer.fit(X).transform(X)

# Re-attach the column labels only for the preview below
print("Imputed Features:", pd.DataFrame(X_imputed, columns=X.columns).head(), sep="\n")


In [None]:
from sklearn.decomposition import PCA
import pandas as pd

# Input to PCA: the scaled matrix. Swap in X_imputed to use the imputed one instead.
X_preprocessed = X_scaled

# How many principal components to keep
num_components = 2
pca = PCA(n_components=num_components)

# Fit on the preprocessed data and project it in one step
X_pca = pca.fit_transform(X_preprocessed)

# Wrap the projection in a DataFrame with columns PC1, PC2, ...
pca_column_names = ["PC" + str(k) for k in range(1, num_components + 1)]
X_pca_df = pd.DataFrame(X_pca, columns=pca_column_names)

# Preview the projected data
print("PCA Transformed Features:", X_pca_df.head(), sep="\n")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Input to PCA: the scaled matrix. Swap in X_imputed to use the imputed one instead.
X_preprocessed = X_scaled

# No n_components given, so PCA keeps every component; fit() returns the estimator itself.
pca = PCA().fit(X_preprocessed)

# Running total of the per-component explained variance ratios
cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)

# Cumulative ratio against component count (x axis starts at 1)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    range(1, len(cumulative_variance_ratio) + 1),
    cumulative_variance_ratio,
    marker='o',
    linestyle='--',
)
ax.set_xlabel('Number of Principal Components')
ax.set_ylabel('Cumulative Explained Variance Ratio')
ax.set_title('Cumulative Explained Variance Ratio vs. Number of Principal Components')
ax.grid(True)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

# Input to PCA: the scaled matrix. Swap in X_imputed to use the imputed one instead.
X_preprocessed = X_scaled

# Two components, so each sample becomes a point in the plane
num_components = 2
pca = PCA(n_components=num_components)
X_pca = pca.fit_transform(X_preprocessed)

# Columns PC1, PC2, ... for the projected data
pca_column_names = ["PC" + str(k) for k in range(1, num_components + 1)]
X_pca_df = pd.DataFrame(X_pca, columns=pca_column_names)

# Attach the original class label; the assignment aligns y to the frame by index.
X_pca_df["Class"] = y

# Scatter of the two components, coloured by class
plt.figure(figsize=(10, 8))
sns.scatterplot(data=X_pca_df, x="PC1", y="PC2", hue="Class", palette="viridis")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("PCA Scatter Plot")
plt.legend(title="Class")
plt.show()

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA  # imported here so the cell does not rely on an earlier cell's import
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessed dataset (scaled or imputed)
X_preprocessed = X_scaled  # Use X_scaled or X_imputed depending on your preprocessing choice

# Initialize PCA with the desired number of components
num_components = 2  # Choose the number of components you want
pca = PCA(n_components=num_components)

# Fit PCA to the preprocessed data and transform the data
X_pca = pca.fit_transform(X_preprocessed)

# Perform K-Means clustering on the PCA-transformed data.
# n_init is set explicitly because its default differs between scikit-learn
# releases (and omitting it triggers a FutureWarning on some of them); pinning
# it keeps the clustering consistent across versions.
# random_state fixes the centroid initialisation so reruns give the same labels.
num_clusters = 3  # Choose the number of clusters
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(X_pca)

# Build a DataFrame of the PCA components and attach the cluster label of each row
X_pca_df = pd.DataFrame(X_pca, columns=[f"PC{i+1}" for i in range(num_components)])
X_pca_df["Cluster"] = cluster_labels

# Scatter of the two components, coloured by assigned cluster
plt.figure(figsize=(10, 8))
sns.scatterplot(x="PC1", y="PC2", hue="Cluster", data=X_pca_df, palette="Set1", s=100)
plt.title("K-Means Clustering on PCA-Transformed Data")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend(title="Cluster")
plt.show()

In [None]:
Interpretation of the PCA and clustering results:

PCA Results:
Principal Component Analysis (PCA) is a dimensionality reduction technique that aims
to capture the most important information in the original data by projecting it onto
a lower-dimensional space. In the context of the Wine dataset, you applied PCA to reduce 
the original features into a two-dimensional space. The two principal components represent
linear combinations of the original features that maximize the explained variance.

PCA Components: The first principal component (PC1) captures the direction of maximum variance
in the data, while the second principal component (PC2) captures the second highest variance 
orthogonal to the first component. Each principal component is a linear combination of the original features.

Explained Variance Ratio: You may observe that the cumulative explained variance ratio plot
shows a curve that flattens out after the first few components. The "elbow point" on the plot
marks the point of diminishing returns, where adding more components no longer contributes significantly
to the explained variance. This can guide you in choosing an appropriate number of components for dimensionality reduction.

Clustering Results (K-Means):
Clustering is an unsupervised learning technique that groups similar data points together based
on a certain similarity measure. In this case, you applied K-Means clustering to the PCA-transformed
data to group similar wine samples together.

Number of Clusters: You chose a specific number of clusters (e.g., 3 clusters) for the K-Means algorithm. 
Each cluster represents a group of data points that are similar to each other based on their PCA-transformed features.

Cluster Interpretation: By examining the scatter plot of the clustered data points, you can observe how the
K-Means algorithm has partitioned the data into groups in the reduced-dimensional space.
Each cluster is represented by a different color on the plot. Well-separated clusters suggest
that the data contains distinct groups in the PCA-transformed features. Keep in mind that K-Means always
returns exactly the number of clusters you request, so compare the clusters with the known wine classes
before reading meaning into them.

Remember that interpretation might depend on domain knowledge. In a real-world scenario, you would 
often try to assign meaning to the clusters based on your understanding of the data and the problem 
you're trying to solve. Visualizations and statistical measures can help provide insights, but ultimately,
the interpretation should be informed by the context of the data and the specific goals of your analysis.

In [None]:
..........................The End..........................