In [None]:
#import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score

In [None]:
# Read the dataset CSV into a DataFrame and preview its first rows
DATA_PATH = "/content/data.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [None]:
# Print a summary of the DataFrame: index range, column names,
# non-null counts per column and column dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [None]:
# Drop columns that are not used as features: the 'id' identifier column and
# 'Unnamed: 32', which appears empty in the df.head() preview.
# The result is reassigned (rather than using inplace=True) and errors='ignore'
# is passed so that re-running this cell is a no-op instead of a KeyError on
# the already-removed columns.
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

In [None]:
# Replace the categorical 'diagnosis' labels with integer class codes.
# The fitted encoder stays available as `label_encoder`, so the original
# labels can be looked up through `label_encoder.classes_`.
label_encoder = LabelEncoder()
diagnosis_codes = label_encoder.fit_transform(df['diagnosis'])
df['diagnosis'] = diagnosis_codes

In [None]:
# Split the frame into the target column (y) and every remaining column (X)
target_column = 'diagnosis'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardize the features.
# The scaler is fitted on the training split only and the same fitted
# scaler is then applied to both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Task 1: Without applying LDA
# Fit logistic regression directly on the standardized features and
# score it on the held-out test split.
clf_without_lda = LogisticRegression().fit(X_train_scaled, y_train)
y_pred_without_lda = clf_without_lda.predict(X_test_scaled)
accuracy_without_lda = accuracy_score(y_true=y_test, y_pred=y_pred_without_lda)
print("Accuracy without LDA:", accuracy_without_lda)

Accuracy without LDA: 0.9736842105263158


In [None]:
# Task 2: Apply LDA
# The number of components is capped by both the number of features and
# (number of classes - 1); every allowed value from 1 up to that cap is tried.
n_feature_limit = X.shape[1]
n_class_limit = y.nunique() - 1
max_components = min(n_feature_limit, n_class_limit)

for n_components in range(1, max_components + 1):
    # Fit the projection on the training split, then project both splits with it
    lda = LinearDiscriminantAnalysis(n_components=n_components).fit(X_train_scaled, y_train)
    X_train_lda = lda.transform(X_train_scaled)
    X_test_lda = lda.transform(X_test_scaled)

    # Train logistic regression on the projected data and score it on the test split
    clf_with_lda = LogisticRegression().fit(X_train_lda, y_train)
    y_pred_with_lda = clf_with_lda.predict(X_test_lda)
    accuracy_with_lda = accuracy_score(y_test, y_pred_with_lda)
    print(f"Accuracy with LDA (n_components={n_components}):", accuracy_with_lda)

Accuracy with LDA (n_components=1): 0.9473684210526315


In [None]:
# Inspect the LDA model that is still bound to `lda` after the loop in the
# previous cell (i.e. the model from its last iteration). The component count
# in the labels is read from the model itself so it always matches what is shown.

# coef_ holds the weight vector(s) of the fitted linear decision function
print(f"Coefficients of linear discriminants (n_components={lda.n_components}):")
print(lda.coef_)

# scalings_ holds the scaling (projection) vectors applied to the features;
# these are not eigenvalues of the within-class scatter matrix, so the label
# names what is actually printed.
print(f"Scalings (projection vectors) of the linear discriminants (n_components={lda.n_components}):")
print(lda.scalings_)

Coefficients of linear discriminants (n_components=1):
[[-13.43340326   0.22977529  10.67623077   2.24801544  -0.11041696
   -5.05798035   1.54055398   3.60578753  -0.17926593   0.80789588
    2.3589761   -0.1318957   -0.32921703  -1.13348282   1.06506924
   -0.7917907   -2.64773626   2.20276539  -0.18812788  -0.16179357
   19.88412614   1.12416477  -5.60358921 -10.51669522   0.05889316
    2.27851427   2.48493683  -1.66969972   1.24281704   0.4381033 ]]
Eigenvalues of within-class scatter matrix (n_components=1):
[[-3.46507764]
 [ 0.05926936]
 [ 2.7538791 ]
 [ 0.57986408]
 [-0.02848149]
 [-1.30468015]
 [ 0.39737802]
 [ 0.93009444]
 [-0.04624073]
 [ 0.20839261]
 [ 0.60848582]
 [-0.03402182]
 [-0.08491985]
 [-0.2923761 ]
 [ 0.27472916]
 [-0.20423836]
 [-0.68297002]
 [ 0.56819206]
 [-0.04852662]
 [-0.04173382]
 [ 5.1290086 ]
 [ 0.28997255]
 [-1.44541717]
 [-2.71272772]
 [ 0.01519119]
 [ 0.5877311 ]
 [ 0.64097674]
 [-0.4306905 ]
 [ 0.3205783 ]
 [ 0.1130065 ]]


In [None]:
# Summarize the results measured in the cells above.
# `accuracy_with_lda` and `n_components` hold the values from the last
# iteration of the LDA loop. PCA is not fitted anywhere in this notebook,
# so the text does not claim any PCA observations.
print("\nAnalysis:")
print("The accuracy score tells us how well our model is performing. A higher accuracy score means our model is making more correct predictions.")
print(f"Accuracy without LDA: {accuracy_without_lda:.4f}")
print(f"Accuracy with LDA (n_components={n_components}): {accuracy_with_lda:.4f}")

print("\nComparison with PCA:")
print("PCA is not applied in this notebook, so no PCA accuracy is available here. To compare, repeat the LDA steps with sklearn.decomposition.PCA in place of LinearDiscriminantAnalysis and compare the resulting accuracies.")

print("\nComparison with LDA:")
print("With LDA, the standardized features are projected onto the discriminant components before the classifier is fitted. Comparing the two accuracies above shows how the model's performance changes after this dimensionality reduction.")



Analysis:
The accuracy score tells us how well our model is performing. A higher accuracy score means our model is making more correct predictions. By changing the number of components in LDA or PCA, we can see how the model's performance changes as we reduce the dimensionality of the data.

Comparison with PCA:
When using PCA, we observed how the model's performance changed with different levels of dimensionality reduction. This helps us understand the trade-offs between dimensionality reduction and model accuracy.

Comparison with LDA:
Similarly, when using LDA, we observed how the model's performance changed with different numbers of components. This allows us to compare the effectiveness of LDA and PCA in improving the model's accuracy.
