In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

In [4]:
df=pd.read_excel("mines.xlsx")

In [6]:
df.head()

Unnamed: 0,V,H,S,M
0,0.338157,0.0,0.0,1
1,0.320241,0.181818,0.0,1
2,0.287009,0.272727,0.0,1
3,0.256284,0.454545,0.0,1
4,0.26284,0.545455,0.0,1


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338 entries, 0 to 337
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V       338 non-null    float64
 1   H       338 non-null    float64
 2   S       338 non-null    float64
 3   M       338 non-null    int64  
dtypes: float64(3), int64(1)
memory usage: 10.7 KB


In [9]:
df.isnull().count

<bound method DataFrame.count of          V      H      S      M
0    False  False  False  False
1    False  False  False  False
2    False  False  False  False
3    False  False  False  False
4    False  False  False  False
..     ...    ...    ...    ...
333  False  False  False  False
334  False  False  False  False
335  False  False  False  False
336  False  False  False  False
337  False  False  False  False

[338 rows x 4 columns]>

In [10]:
df.shape

(338, 4)

# dataset was having no missing  values and was already normalized

In [11]:
X=df.drop(columns='M')
y=df['M']

In [12]:
X

Unnamed: 0,V,H,S
0,0.338157,0.000000,0.0
1,0.320241,0.181818,0.0
2,0.287009,0.272727,0.0
3,0.256284,0.454545,0.0
4,0.262840,0.545455,0.0
...,...,...,...
333,0.323262,0.909091,0.4
334,0.444108,0.181818,1.0
335,0.353474,0.454545,1.0
336,0.362537,0.727273,1.0


In [13]:
y

0      1
1      1
2      1
3      1
4      1
      ..
333    5
334    5
335    5
336    5
337    5
Name: M, Length: 338, dtype: int64

# Training on different Classifiers

In [22]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
# Initialize classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'SVM': SVC(),
    'k-NN': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB()
}

In [31]:
# Train and evaluate each classifier
for name, clf in classifiers.items():
    # Train the model
    clf.fit(X_train, y_train)
    
    # Make predictions
    y_pred = clf.predict(X_test)
    
    # Calculate accuracy
    accuracy = metrics.accuracy_score(y_test, y_pred)
    
    # Print the results
    print(f'{name} Accuracy: {accuracy:.4f}')

Logistic Regression Accuracy: 0.3676
Decision Tree Accuracy: 0.4706
SVM Accuracy: 0.4265
k-NN Accuracy: 0.3529
Naive Bayes Accuracy: 0.4412


# Hyper Parameter Tuning

In [45]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameter grids for Logistic Regression, SVM, and Decision Tree
logreg_param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}
svm_param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'kernel': ['linear', 'rbf']}
dt_param_grid = {'max_depth': [None, 5, 10, 15, 20]}
knn_param_grid = {'n_neighbors': [3, 5, 7, 9], 'weights': ['uniform', 'distance']}

# Initialize classifiers with default parameters
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'SVM': SVC(),
    'k-NN': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB()
}

# Apply hyperparameter tuning for Logistic Regression, SVM, and Decision Tree, and k-NN
tuned_classifiers = {}
for name, clf in classifiers.items():
    if name == 'Logistic Regression':
        grid_search = GridSearchCV(clf, logreg_param_grid, cv=5)
    elif name == 'SVM':
        grid_search = GridSearchCV(clf, svm_param_grid, cv=5)
    elif name == 'Decision Tree':
        grid_search = GridSearchCV(clf, dt_param_grid, cv=5)
    elif name == 'k-NN':
        grid_search = GridSearchCV(clf, knn_param_grid, cv=5)
    else:
        # For models without hyperparameter tuning, use the default model
        grid_search = clf
    
    # Fit the model (with or without tuning)
    grid_search.fit(X_train, y_train)
    
    # Save the best model in the tuned_classifiers dictionary if available
    if hasattr(grid_search, 'best_estimator_'):
        tuned_classifiers[name] = grid_search.best_estimator_
    else:
        tuned_classifiers[name] = clf

# Evaluate each tuned classifier
for name, clf in tuned_classifiers.items():
    # Make predictions
    y_pred = clf.predict(X_test)
    
    # Calculate accuracy
    accuracy = metrics.accuracy_score(y_test, y_pred)
    
    # Print the results
    print(f'{name} Accuracy: {accuracy:.4f}')
    print(f'Best Hyperparameters: {clf.get_params()}')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\asus\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\asus\anaconda3\lib\site-p

Logistic Regression Accuracy: 0.4853
Best Hyperparameters: {'C': 10, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Decision Tree Accuracy: 0.4265
Best Hyperparameters: {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}
SVM Accuracy: 0.6029
Best Hyperparameters: {'C': 100, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
k-NN Accurac

# Trying to use cross-validation

In [57]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Assuming X and y are your feature and target arrays
# Replace these with your actual dataset
# X, y = ...

# Create a pipeline with feature scaling and SVM
svm_model = make_pipeline(StandardScaler(), SVC(C=100, kernel='rbf', gamma='scale'))

# Perform cross-validation
cv_scores = cross_val_score(svm_model, X, y, cv=5, scoring='accuracy')

# Print the cross-validation scores
print("Cross-Validation Scores:", cv_scores)
print("Mean Accuracy:", cv_scores.mean())


Cross-Validation Scores: [0.61764706 0.58823529 0.5        0.67164179 0.76119403]
Mean Accuracy: 0.6277436347673397


In [46]:
best_svm_params = {'C': 100, 'break_ties': False, 'cache_size': 200, 'class_weight': None,
                   'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3,
                   'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False,
                   'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}

In [47]:
# Initialize SVM classifier with the best hyperparameters
svm_classifier = SVC(**best_svm_params)
# Train the model
svm_classifier.fit(X_train, y_train)

In [48]:
# Make predictions on the test set
y_pred = svm_classifier.predict(X_test)
# Evaluate the model
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f'SVM Accuracy with Best Hyperparameters: {accuracy:.4f}')

SVM Accuracy with Best Hyperparameters: 0.6029


In [51]:
import pickle

In [52]:
pickle.dump(svm_classifier,open('mines_classification.pkl','wb'))

In [54]:
df.to_csv('mines_classification.csv')