<a href="https://colab.research.google.com/github/Shivamani162/2303A52344-STML/blob/main/Assignment_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder  # Import LabelEncoder

# Load the data
data = pd.read_csv('/content/drive/MyDrive/breast_cancer_survival.csv')

# Display the first few rows of the dataset and data types to investigate potential issues
print("First few rows of the dataset:")
print(data.head())

print("\nColumn data types:")
print(data.dtypes)

# Display column names to identify features and target variable
print("\nColumn names in the dataset:")
print(data.columns)

# Preprocess the data by handling non-numeric values and missing values
# Assuming 'Patient_Status' is the target variable and needs to be encoded
# Separate features (X) and target (y) first
X = data.drop('Patient_Status', axis=1)  # Features (all columns except 'Patient_Status')
y = data['Patient_Status']  # Target variable ('Patient_Status')
# Convert non-numeric columns in X to numeric, if needed
for column in X.select_dtypes(include=['object']).columns:
    try:
        X[column] = pd.to_numeric(X[column], errors='raise')  # Try to convert to numeric
    except ValueError:
        print(f"Column '{column}' could not be converted to numeric and will be dropped.")
        X = X.drop(column, axis=1)  # Drop the column if conversion fails


# Impute missing values using SimpleImputer for features (X)
imputer = SimpleImputer(strategy='mean')  # Create an imputer instance
X = imputer.fit_transform(X)  # Fit and transform the imputer on X

# Encode the target variable ('Patient_Status') if it's categorical
if y.dtype == 'object':  # Check if the target variable is of object type (categorical)
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)

# Train and evaluate KNN model with different test sizes
test_sizes = [0.1, 0.2, 0.3, 0.4, 0.5]
accuracy_results = {}

for size in test_sizes:
    # Split data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=size, random_state=42)

    # Initialize and train KNN model
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train, y_train)

    # Make predictions
    y_pred = knn.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_results[size] = accuracy
    print(f"Accuracy with test size {size}: {accuracy:.2f}")

# Display the accuracy results for different test sizes
print("\nAccuracy results for different test sizes:")
for size, accuracy in accuracy_results.items():
    print(f"Test size {size}: {accuracy:.2f}")

First few rows of the dataset:
   Age  Gender  Protein1  Protein2  Protein3  Protein4 Tumour_Stage  \
0   42  FEMALE   0.95256   2.15000  0.007972 -0.048340           II   
1   54  FEMALE   0.00000   1.38020 -0.498030 -0.507320           II   
2   63  FEMALE  -0.52303   1.76400 -0.370190  0.010815           II   
3   78  FEMALE  -0.87618   0.12943 -0.370380  0.132190            I   
4   42  FEMALE   0.22611   1.74910 -0.543970 -0.390210           II   

                       Histology ER status PR status HER2 status Surgery_type  \
0  Infiltrating Ductal Carcinoma  Positive  Positive    Negative        Other   
1  Infiltrating Ductal Carcinoma  Positive  Positive    Negative        Other   
2  Infiltrating Ductal Carcinoma  Positive  Positive    Negative   Lumpectomy   
3  Infiltrating Ductal Carcinoma  Positive  Positive    Negative        Other   
4  Infiltrating Ductal Carcinoma  Positive  Positive    Positive   Lumpectomy   

  Date_of_Surgery Date_of_Last_Visit Patient_Status  
0

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer  # Import SimpleImputer

# Load the data
data = pd.read_csv('/content/drive/MyDrive/breast_cancer_survival.csv')

# Display the first few rows of the dataset and data types to investigate potential issues
print("First few rows of the dataset:")
print(data.head())

print("\nColumn data types:")
print(data.dtypes)

# Preprocess the data by converting non-numeric columns to numeric and handling missing values
# --- Changed to only apply to_numeric to columns that are not the target variable ---
for column in data.columns:
    if column != 'Patient_Status':  # Exclude the target variable
        data[column] = pd.to_numeric(data[column], errors='coerce')

# Separate features (X) and target (y) before imputation
X = data.iloc[:, :-1]  # Features (all columns except the last)
y = data.iloc[:, -1]   # Target variable (last column)

# --- Drop rows with NaN in the target variable ---
data = data.dropna(subset=['Patient_Status'])
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Impute missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean')  # Create an imputer instance
X = imputer.fit_transform(X)  # Fit and transform the imputer on X

# Define test sizes as percentages (0.20, 0.25, etc.)
test_sizes = [0.20, 0.25, 0.30, 0.35]
accuracy_results = {}

for size in test_sizes:
    # Split data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=size, random_state=42)

    # Initialize and train the SVM model
    svm = SVC(kernel='linear')  # You can also try 'rbf' or other kernels
    svm.fit(X_train, y_train)

    # Make predictions
    y_pred = svm.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_results[size] = accuracy
    print(f"Accuracy with test size {int(size*100)}%: {accuracy:.2f}")

# Display the accuracy results for different test sizes
print("\nAccuracy results for different test sizes:")
for size, accuracy in accuracy_results.items():
    print(f"Test size {int(size*100)}%: {accuracy:.2f}")

First few rows of the dataset:
   Age  Gender  Protein1  Protein2  Protein3  Protein4 Tumour_Stage  \
0   42  FEMALE   0.95256   2.15000  0.007972 -0.048340           II   
1   54  FEMALE   0.00000   1.38020 -0.498030 -0.507320           II   
2   63  FEMALE  -0.52303   1.76400 -0.370190  0.010815           II   
3   78  FEMALE  -0.87618   0.12943 -0.370380  0.132190            I   
4   42  FEMALE   0.22611   1.74910 -0.543970 -0.390210           II   

                       Histology ER status PR status HER2 status Surgery_type  \
0  Infiltrating Ductal Carcinoma  Positive  Positive    Negative        Other   
1  Infiltrating Ductal Carcinoma  Positive  Positive    Negative        Other   
2  Infiltrating Ductal Carcinoma  Positive  Positive    Negative   Lumpectomy   
3  Infiltrating Ductal Carcinoma  Positive  Positive    Negative        Other   
4  Infiltrating Ductal Carcinoma  Positive  Positive    Positive   Lumpectomy   

  Date_of_Surgery Date_of_Last_Visit Patient_Status  
0

 'Surgery_type' 'Date_of_Surgery' 'Date_of_Last_Visit']. At least one non-missing value is needed for imputation with strategy='mean'.


Accuracy with test size 20%: 0.77
Accuracy with test size 25%: 0.74
Accuracy with test size 30%: 0.75
Accuracy with test size 35%: 0.75

Accuracy results for different test sizes:
Test size 20%: 0.77
Test size 25%: 0.74
Test size 30%: 0.75
Test size 35%: 0.75
