<a href="https://colab.research.google.com/github/spoorthi0802/Machinelearning/blob/main/ML_LAB09_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Load data
data = pd.read_excel('Parkisons Dataset 1 Final (1).xlsx')

# Identify non-numeric columns (K-means needs purely numeric input)
non_numeric_columns = data.select_dtypes(exclude=['number']).columns.tolist()

# Remove non-numeric columns
# NOTE(review): every numeric column is kept, which would include the `status`
# label used as the target later in this notebook if it is stored as a number.
# Confirm whether the label is meant to take part in the clustering.
numeric_data = data.drop(non_numeric_columns, axis=1)

# Standardize each feature to zero mean / unit variance so that no single
# feature dominates the Euclidean distances K-means relies on.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(numeric_data)

# K-means starts from random centroids. Pinning random_state makes the printed
# inertias reproducible under Restart & Run All, and an explicit n_init avoids
# depending on a default that differs between scikit-learn versions.
# K-means clustering with k = 3
kmeans3 = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans3.fit(data_scaled)

# K-means clustering with k = 5
kmeans5 = KMeans(n_clusters=5, n_init=10, random_state=42)
kmeans5.fit(data_scaled)

# Compare the two fits by inertia (within-cluster sum of squared distances).
# Inertia always shrinks as k grows, so treat it as a relative measure only.
print("K=3 Inertia:", kmeans3.inertia_)
print("K=5 Inertia:", kmeans5.inertia_)


In [None]:
# Determine the ideal k value using the Elbow Method
distortions = []
# K-means cannot form more clusters than there are samples, so cap the sweep
# at the dataset size (it stays 1..30 whenever at least 30 rows are present).
max_k = min(30, len(data_scaled))
K = range(1, max_k + 1)
for k in K:
    # Fixed seed and explicit n_init keep the curve reproducible; otherwise
    # random initialization noise can hide or fake an elbow.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(data_scaled)
    distortions.append(kmeans.inertia_)

# Plotting the Elbow graph
plt.figure(figsize=(8, 6))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
# inertia_ is a sum of squared distances, not an average, so label it as such.
plt.ylabel('Inertia (within-cluster sum of squares)')
plt.title('Elbow Method For Optimal k')
plt.show()

In [None]:
import scipy.cluster.hierarchy as sch

# Agglomerative (hierarchical) clustering: build the Ward-linkage merge tree
# from the scaled samples first, then render that tree as a dendrogram.
ward_linkage = sch.linkage(data_scaled, method='ward')

plt.figure(figsize=(10, 7))
dendrogram = sch.dendrogram(ward_linkage)
plt.xlabel('Samples')
plt.ylabel('Distance')
plt.title('Dendrogram')
plt.show()

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
# Separate features and target
X = data.drop(columns=['status'])  # Features
y = data['status']  # Target

# Select only numeric columns for feature selection
numeric_columns = X.select_dtypes(include=['number']).columns.tolist()
X_numeric = X[numeric_columns]

# Scale numeric data for feature selection
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_numeric)

# Sequential Feature Selector: pick the single feature that scores best
# with the given estimator.
# NOTE(review): `status` is treated as a class label later in this notebook
# (LogisticRegression / accuracy_score), while selection here is driven by a
# regressor. Confirm whether a classifier would be the better-matched estimator.
selector = SequentialFeatureSelector(LinearRegression(), n_features_to_select=1)
selector.fit(X_scaled, y)

# Map the boolean support mask back to column names. X_scaled is a plain
# array, so the names have to come from X_numeric.
selected_features = X_numeric.columns[selector.support_]

# Show the outcome; without this the cell computes a result nobody sees.
print("Selected features:", selected_features.tolist())


In [None]:
from sklearn.decomposition import PCA

# Share of the total variance that the retained principal components must explain.
VARIANCE_TO_KEEP = 0.95

# A fractional n_components asks PCA to keep however many components are
# needed to reach that share of explained variance.
pca = PCA(n_components=VARIANCE_TO_KEEP)
X_pca = pca.fit_transform(X_scaled)

# After fitting, n_components_ holds the number of components actually kept.
num_features_for_95_variance = pca.n_components_

# Report how many components were required.
print("Number of features needed for 95% variance in PCA:", num_features_for_95_variance)


In [None]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# PCA with K components
K = 10  # Update with the desired number of components

# Split BEFORE fitting any transformer. Fitting the scaler and PCA on the full
# dataset lets the held-out rows shape the scaling statistics and principal
# axes, which leaks test information and inflates the reported accuracy.
# Same test_size / random_state as before, so the row assignment is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_numeric, y, test_size=0.2, random_state=42
)

# Learn the scaling parameters from the training rows only, then apply them
# unchanged to the test rows.
pca_scaler = StandardScaler()
X_train_scaled = pca_scaler.fit_transform(X_train_raw)
X_test_scaled = pca_scaler.transform(X_test_raw)

# Likewise, learn the principal axes from the training rows only.
pca = PCA(n_components=K)
X_train = pca.fit_transform(X_train_scaled)
X_test = pca.transform(X_test_scaled)

# Projection of the whole dataset through the train-fitted transforms, kept so
# that any later cell reading `X_pca` still finds it defined.
X_pca = pca.transform(pca_scaler.transform(X_numeric))

# Model (Logistic Regression as an example)
model = LogisticRegression()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Calculate accuracy on the held-out split using the PCA-transformed features
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy using transformed dataset:", accuracy)