In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the sonar dataset.
# A missing file is reported by re-raising FileNotFoundError with a hint about
# where the CSV is expected. The previous exit() call is an interactive-shell
# helper; inside a notebook it asks the kernel to shut down, discarding all
# state, instead of simply stopping the cell.
try:
    data = pd.read_csv("sonar.all-data-uci.csv")
except FileNotFoundError as err:
    raise FileNotFoundError(
        "The file 'sonar.all-data-uci.csv' could not be found. "
        "Please ensure it's in the same directory or provide the correct path."
    ) from err

# Report the number of missing entries found in each column.
missing_per_column = data.isna().sum()
print("Missing values per column:", missing_per_column, sep="\n")

# No missing-value handling is applied in this cell. If the counts above are
# non-zero, choose a strategy that suits the data and the model, for example
# mean/median imputation, or dropping the incomplete rows with `dropna()`.

# Optional exploration: per-column descriptive statistics.
summary_stats = data.describe()
print("\nData summary:", summary_stats, sep="\n")

# Separate the feature columns (X) from the target column (y).
# `columns=` already identifies what to drop, so the redundant `axis=1`
# (ignored by pandas when `columns` is given) is omitted.
X = data.drop(columns='Label')
y = data['Label']

# Hold out 10% of the rows as a test set. The split is stratified on the
# labels and uses a fixed random_state so repeated runs give the same split.
split_parts = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=2,
)
X_train, X_test, y_train, y_test = split_parts

# Fit a logistic regression classifier, with default settings, on the
# training split. `fit` returns the estimator itself, so construction and
# training are chained.
model = LogisticRegression().fit(X_train, y_train)

# Measure accuracy on both splits: predict first, then compare against the
# true labels.
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)
train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)
print(
    f"\nTraining accuracy: {train_accuracy:.2f}",
    f"Test accuracy: {test_accuracy:.2f}",
    sep="\n",
)

# Make a prediction for one new sample (60 feature values, in column order).
input_data = (
    0.0587, 0.121, 0.1268, 0.1498, 0.1436, 0.0561, 0.0832, 0.0672, 0.1372, 0.2352,
    0.3208, 0.4257, 0.5201, 0.4914, 0.595, 0.7221, 0.9039, 0.9111, 0.8723, 0.7686,
    0.7326, 0.5222, 0.3097, 0.3172, 0.227, 0.164, 0.1746, 0.1835, 0.2048, 0.1674,
    0.2767, 0.3104, 0.3399, 0.4441, 0.5046, 0.2814, 0.1681, 0.2633, 0.3198, 0.1933,
    0.0934, 0.0443, 0.078, 0.0722, 0.0405, 0.0553, 0.1081, 0.1139, 0.0767, 0.0265,
    0.0215, 0.0331, 0.0111, 0.0088, 0.0158, 0.0122, 0.0038, 0.0101, 0.0228, 0.0124,
)
input_data_as_numpy_array = np.asarray(input_data)
# predict() expects a 2-D input: one row per sample.
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)
# The model was fitted on a DataFrame, so pass the sample as a DataFrame with
# the same column names. Predicting on a bare ndarray makes scikit-learn warn
# that X has no valid feature names and skips its column-name check.
input_frame = pd.DataFrame(input_data_reshaped, columns=X_train.columns)
prediction = model.predict(input_frame)
print(f"\nPrediction: {prediction}")

Missing values per column:
Freq_1     0
Freq_2     0
Freq_3     0
Freq_4     0
Freq_5     0
          ..
Freq_57    0
Freq_58    0
Freq_59    0
Freq_60    0
Label      0
Length: 61, dtype: int64

Data summary:
           Freq_1      Freq_2      Freq_3      Freq_4      Freq_5      Freq_6  \
count  208.000000  208.000000  208.000000  208.000000  208.000000  208.000000   
mean     0.029164    0.038437    0.043832    0.053892    0.075202    0.104570   
std      0.022991    0.032960    0.038428    0.046528    0.055552    0.059105   
min      0.001500    0.000600    0.001500    0.005800    0.006700    0.010200   
25%      0.013350    0.016450    0.018950    0.024375    0.038050    0.067025   
50%      0.022800    0.030800    0.034300    0.044050    0.062500    0.092150   
75%      0.035550    0.047950    0.057950    0.064500    0.100275    0.134125   
max      0.137100    0.233900    0.305900    0.426400    0.401000    0.382300   

           Freq_7      Freq_8      Freq_9     Freq_10  ...  

