<a href="https://colab.research.google.com/github/Radhakrishna2055/SMLB-39/blob/main/Assignment11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

# Load the dataset
data = pd.read_csv('breast_cancer_survival.csv')

# Work on a copy so the raw frame stays available for inspection.
# Rows with no recorded 'Patient_Status' cannot be used for supervised
# learning; if they are left in, LabelEncoder either fails or encodes the
# missing value as its own class (depending on the scikit-learn version),
# so they are removed up front. This is a no-op when nothing is missing.
data_encoded = data.dropna(subset=['Patient_Status']).copy()
label_encoders = {}

# Encode target variable 'Patient_Status'
label_encoders['Patient_Status'] = LabelEncoder()
data_encoded['Patient_Status'] = label_encoders['Patient_Status'].fit_transform(
    data_encoded['Patient_Status']
)

# Encode other categorical columns as integer codes. Each fitted encoder is
# kept in label_encoders so the codes can be mapped back to the original
# labels with inverse_transform later.
categorical_cols = ['Gender', 'Tumour_Stage', 'Histology', 'ER status', 'PR status', 'HER2 status', 'Surgery_type']
for col in categorical_cols:
    le = LabelEncoder()
    # Read from the filtered frame (not the raw one) so the row sets match.
    data_encoded[col] = le.fit_transform(data_encoded[col])
    label_encoders[col] = le

# Separate features and target; the two date columns are excluded from the
# feature matrix.
X = data_encoded.drop(columns=['Patient_Status', 'Date_of_Surgery', 'Date_of_Last_Visit'])
y = data_encoded['Patient_Status']

# Split the data into train and test sets BEFORE scaling. Fitting the scaler
# on the full matrix would let the test rows influence the mean/variance used
# to transform the training rows (data leakage). The row assignment depends
# only on the number of samples and random_state, so it is the same split as
# when the already-scaled matrix was split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features: learn the statistics from the training rows only,
# then apply that same transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix transformed with the train-fitted scaler. Kept so that
# downstream code referring to X_scaled keeps working.
X_scaled = scaler.transform(X)

# Initialize models
models = {
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression(max_iter=1000)
}

# Train models and calculate accuracy on original data
accuracies = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracies[model_name] = accuracy_score(y_test, y_pred)

print("Accuracies on original data:\n", accuracies)

# Apply PCA to reduce dimensions (keeping components that explain 95% of variance).
# The projection is learned from the training rows only and then applied to
# the test rows; fitting PCA on train and test together would leak test-set
# structure into the features the models are trained on.
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Projection of the full scaled matrix, kept so that downstream code
# referring to X_pca keeps working.
X_pca = pca.transform(X_scaled)

# The PCA features cover exactly the rows of the existing train/test split,
# so the labels are reused rather than re-splitting with the same seed.
y_train_pca, y_test_pca = y_train, y_test

# Train models and calculate accuracy on PCA-reduced data.
# NOTE: fit() is called again on the same estimator objects, so after this
# loop the entries in `models` are the PCA-trained versions.
pca_accuracies = {}
for model_name, model in models.items():
    model.fit(X_train_pca, y_train_pca)
    y_pred_pca = model.predict(X_test_pca)
    pca_accuracies[model_name] = accuracy_score(y_test_pca, y_pred_pca)

print("Accuracies on PCA-reduced data:\n", pca_accuracies)


Accuracies on original data:
 {'SVM': 0.7761194029850746, 'KNN': 0.7313432835820896, 'Logistic Regression': 0.7761194029850746}
Accuracies on PCA-reduced data:
 {'SVM': 0.7761194029850746, 'KNN': 0.7313432835820896, 'Logistic Regression': 0.7761194029850746}
