In [8]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

# Define the CSV file name
csv_name = "csv3.csv"

# Load the dataset
dataframe = pd.read_csv(csv_name)

# Step 1: Drop duplicates
dataframe = dataframe.drop_duplicates()

# Step 2: Replace missing values with the mode for each column
for column in dataframe.columns:
    mode_value = dataframe[column].mode()[0]  # Get the mode of the column
    dataframe[column].fillna(mode_value, inplace=True)  # Replace NaNs with mode

# Step 3: Identify high cardinality columns (numeric columns with many unique values)
# Set a threshold for high cardinality (e.g., more than 50 unique values)
high_cardinality_cols = [col for col in dataframe.columns if dataframe[col].nunique() > 50]

# Drop high cardinality columns before scaling
# dataframe = dataframe.drop(columns=high_cardinality_cols)

# Step 4: Define features and target
target_column = "Class"  # Target column is 'Class'
features = dataframe.drop(columns=[target_column])
target = dataframe[target_column]

# Ensure features is not empty and contains only numeric columns
if features.empty:
    raise ValueError("Features dataframe is empty after dropping high cardinality columns.")

# Check for any non-numeric columns
non_numeric_cols = features.select_dtypes(exclude=[np.number]).columns
if len(non_numeric_cols) > 0:
    raise ValueError(f"The following columns are non-numeric: {non_numeric_cols}")

# Step 5: Scaling function
standardize = True

def scale(dataframe, standardize):
    if standardize:
        scaler = StandardScaler()
    else:
        scaler = MinMaxScaler()
    # Ensure the dataframe is not empty before scaling
    if dataframe.empty:
        raise ValueError("DataFrame is empty, cannot apply scaler.")
    return pd.DataFrame(scaler.fit_transform(dataframe), columns=dataframe.columns)

# Scale the features
features = scale(features, standardize)

# Step 6: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

print("Training set shape:", X_train.shape, y_train.shape)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataframe[column].fillna(mode_value, inplace=True)  # Replace NaNs with mode
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataframe[column].fillna(mode_value, inplace=True)  # Replace NaNs with mode


Training set shape: (226980, 30) (226980,)
