In [None]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer
import numpy as np


def preprocess_data(data, target_column, k_best=5):
  """Prepare a labelled table for modelling.

  Pipeline, in order: mean-impute missing feature values, keep the
  ``k_best`` features with the highest ANOVA F-score against the target,
  bin every kept feature into 5 equal-width ordinal bins, then drop rows
  lying outside 1.5 * IQR on any kept feature.

  Args:
    data: DataFrame holding the feature columns and the target column.
      Feature columns must be numeric (mean imputation is applied).
    target_column: Name of the target column in ``data``.
    k_best: Number of features to keep (passed to ``SelectKBest``).

  Returns:
    A DataFrame with the selected, discretised features followed by the
    target column, re-indexed from 0, or ``None`` when discretisation
    raises ``ValueError``.
  """
  # The sklearn steps below hand back plain arrays, so X is rebuilt with a
  # fresh 0..n-1 index. Resetting both pieces up front keeps X and y aligned
  # by position even when ``data`` carries a non-default index.
  X = data.drop(target_column, axis=1).reset_index(drop=True)
  y = data[target_column].reset_index(drop=True)

  # Ensure the target is numeric. It stays a named Series (not a bare code
  # array) so that it can still be filtered and concatenated further down.
  if pd.api.types.is_bool_dtype(y) or not pd.api.types.is_numeric_dtype(y):
    y = pd.Series(pd.Categorical(y).codes, index=y.index, name=y.name)

  # Handling missing values (SimpleImputer with mean strategy). This has to
  # run before feature selection because f_classif rejects NaN input.
  # Replace with median or most_frequent if needed.
  imputer = SimpleImputer(strategy='mean')
  imputed = imputer.fit_transform(X)
  # Columns without any observed value have a NaN statistic and are dropped
  # by the imputer; keep the labels in step with the returned array.
  kept_columns = X.columns[~np.isnan(imputer.statistics_)]
  X = pd.DataFrame(imputed, columns=kept_columns)

  # Select top k features
  selector = SelectKBest(f_classif, k=k_best)
  X_new = selector.fit_transform(X, y)
  selected_features = X.columns[selector.get_support()]
  X = pd.DataFrame(X_new, columns=selected_features)

  # Discretise into ordinal bins (adjust n_bins as needed)
  discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
  try:
    X = pd.DataFrame(discretizer.fit_transform(X), columns=X.columns)
  except ValueError as e:
    print(f"Discretization Error: {e}")
    return None

  # Elimination of outliers (IQR method). Filtering is sequential: each
  # column's quartiles are computed on the rows that survived earlier columns.
  for col in X.columns:
    q1 = X[col].quantile(0.25)
    q3 = X[col].quantile(0.75)
    iqr = q3 - q1
    X = X[X[col].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)]

  # Align y with the rows that survived the filtering
  y = y.loc[X.index]

  # Combine features and target into a new dataframe
  processed_data = pd.concat(
    [X.reset_index(drop=True), y.reset_index(drop=True)], axis=1
  )
  return processed_data


# Driver: runs when this cell is executed in the notebook (where ``__name__``
# is "__main__") but not when the code is imported as a module.
if __name__ == "__main__":
  # Load the dataset
  data = pd.read_csv("/content/sample_data/breast_cancer.csv")
  # Preprocess the data ("Class" is assumed to be the target column)
  preprocessed_data = preprocess_data(data, "Class")
  # Print some info
  if preprocessed_data is not None:
    print(preprocessed_data.info())
    print(preprocessed_data.head())
  else:
    print("Preprocessing failed due to errors.")


In [None]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer
import numpy as np

def preprocess_data(data, target_column, k_best=5):
  """Prepare a labelled table for modelling.

  Pipeline, in order: mean-impute missing feature values, keep the
  ``k_best`` features with the highest ANOVA F-score against the target,
  bin every kept feature into 5 equal-width ordinal bins, then drop rows
  lying outside 1.5 * IQR on any kept feature.

  Args:
    data: DataFrame holding the feature columns and the target column.
      Feature columns must be numeric (mean imputation is applied).
    target_column: Name of the target column in ``data``.
    k_best: Number of features to keep (passed to ``SelectKBest``).

  Returns:
    A DataFrame with the selected, discretised features followed by the
    target column, re-indexed from 0, or ``None`` when discretisation
    raises ``ValueError``.
  """
  # The sklearn steps below hand back plain arrays, so X is rebuilt with a
  # fresh 0..n-1 index. Resetting both pieces up front keeps X and y aligned
  # by position even when ``data`` carries a non-default index.
  X = data.drop(target_column, axis=1).reset_index(drop=True)
  y = data[target_column].reset_index(drop=True)

  # Ensure the target is numeric. It stays a named Series (not a bare code
  # array) so that it can still be filtered and concatenated further down.
  if pd.api.types.is_bool_dtype(y) or not pd.api.types.is_numeric_dtype(y):
    y = pd.Series(pd.Categorical(y).codes, index=y.index, name=y.name)

  # Handling missing values (SimpleImputer with mean strategy). This has to
  # run before feature selection because f_classif rejects NaN input.
  # Replace with median or most_frequent if needed.
  imputer = SimpleImputer(strategy='mean')
  imputed = imputer.fit_transform(X)
  # Columns without any observed value have a NaN statistic and are dropped
  # by the imputer; keep the labels in step with the returned array.
  kept_columns = X.columns[~np.isnan(imputer.statistics_)]
  X = pd.DataFrame(imputed, columns=kept_columns)

  # Select top k features
  selector = SelectKBest(f_classif, k=k_best)
  X_new = selector.fit_transform(X, y)
  selected_features = X.columns[selector.get_support()]
  X = pd.DataFrame(X_new, columns=selected_features)

  # Discretise into ordinal bins (adjust n_bins as needed)
  discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
  try:
    X = pd.DataFrame(discretizer.fit_transform(X), columns=X.columns)
  except ValueError as e:
    print(f"Discretization Error: {e}")
    return None

  # Elimination of outliers (IQR method). Filtering is sequential: each
  # column's quartiles are computed on the rows that survived earlier columns.
  for col in X.columns:
    q1 = X[col].quantile(0.25)
    q3 = X[col].quantile(0.75)
    iqr = q3 - q1
    X = X[X[col].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)]

  # Align y with the rows that survived the filtering
  y = y.loc[X.index]

  # Combine features and target into a new dataframe
  processed_data = pd.concat(
    [X.reset_index(drop=True), y.reset_index(drop=True)], axis=1
  )
  return processed_data

# Read the raw table from disk.
data = pd.read_csv("/content/sample_data/breast_cancer.csv")
# Run the preprocessing pipeline; "Class" is assumed to be the target column.
preprocessed_data = preprocess_data(data, "Class")

# Report a failed run first, otherwise show the schema and the leading rows.
if preprocessed_data is None:
  print("Preprocessing failed due to errors.")
else:
  print(preprocessed_data.info())
  print(preprocessed_data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Uniformity of Cell Size   569 non-null    float64
 1   Uniformity of Cell Shape  569 non-null    float64
 2   Bare Nuclei               569 non-null    float64
 3   Bland Chromatin           569 non-null    float64
 4   Normal Nucleoli           569 non-null    float64
 5   Class                     569 non-null    int64  
dtypes: float64(5), int64(1)
memory usage: 26.8 KB
None
   Uniformity of Cell Size  Uniformity of Cell Shape  Bare Nuclei  \
0                      0.0                       0.0          0.0   
1                      1.0                       1.0          4.0   
2                      0.0                       0.0          0.0   
3                      0.0                       0.0          0.0   
4                      0.0                       0.0   

In [None]:
 from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score, mean_squared_error
 import pandas as pd # Importing pandas
 # Read the dataset from disk.
 data = pd.read_csv("/content/sample_data/breast_cancer.csv")

 # Separate the label from the predictors; "Class" is assumed to be the
 # name of the target column.
 y = data["Class"]
 X = data.drop(columns="Class")
 # Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

 # Classification: 3-nearest-neighbours (tune n_neighbors as needed),
 # scored by accuracy on the held-out rows.
 knn_classifier = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
 y_pred_class = knn_classifier.predict(X_test)
 accuracy = accuracy_score(y_test, y_pred_class)
 print("Accuracy:", accuracy)

 # Regression: the same neighbourhood size fitted to the same target,
 # scored by mean squared error on the held-out rows.
 knn_regressor = KNeighborsRegressor(n_neighbors=3).fit(X_train, y_train)
 y_pred_reg = knn_regressor.predict(X_test)
 mse = mean_squared_error(y_test, y_pred_reg)
 print("Mean Squared Error:", mse)

Accuracy: 0.948905109489051
Mean Squared Error: 0.11354420113544202
