In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# The file has no header row; header=None stops pandas from consuming the
# first record as column names, so the columns get integer labels instead.
df = pd.read_csv(
    'sonar.csv',
    header=None,
)
df.head()

In [None]:
# Information about the data
# A notebook cell only echoes its LAST expression, so the shape is printed
# explicitly; otherwise it is computed and silently thrown away.
print(df.shape)   # (number of rows, number of columns)
df.info()         # writes its own report to stdout
df.describe()     # last expression -> rendered as the cell output

In [None]:
# Data preprocessing
# 1. Dealing with missing values
# These summaries are printed explicitly: as bare expressions in the middle
# of a cell they would be evaluated and discarded without being shown.
print(df.isnull().sum())   # count of missing values per column
print(df.columns)

# 2. Visualising the data to find information about the target variable
# Column 60 holds the class label that the later cells encode and predict.
print(df[60].value_counts())
sns.histplot(df[60])
plt.xlabel("Rock v/s Mine")
plt.ylabel("Count")
plt.show()

In [None]:
# 3. One hot encoding for categorical values
# drop_first=True keeps a single indicator column for the label; the cells
# below refer to it as '60_R'.
# The guard makes the cell safe to re-run: after the first run column 60 no
# longer exists, and calling get_dummies(columns=[60]) again raises KeyError.
if 60 in df.columns:
    df = pd.get_dummies(df, columns=[60], drop_first=True)
df.head()

In [None]:
# Correlation of every feature column with the encoded target '60_R'.
# The former bare `df_2.head()` was removed: in the middle of a cell its
# result was never displayed.
df_2 = df.drop(columns=['60_R'])
ax = df_2.corrwith(df['60_R']).plot.bar(
    figsize=(20, 10), title="Correlation", rot=45, grid=True
)
# Label the axes so the figure can be read without the surrounding code.
ax.set_xlabel("Feature column")
ax.set_ylabel("Correlation with 60_R")
plt.show()

In [None]:
# 4. Correlation matrix
# Pairwise correlation between all columns of df (the features together
# with the encoded target column); shown as the cell output.
corr = pd.DataFrame.corr(df)
corr

In [None]:
# 5. Plotting a heatmap of the correlation matrix
plt.figure(figsize=(20, 10))
# annot=False: no per-cell numbers are drawn on the heatmap.
sns.heatmap(corr, annot=False)
plt.title("Correlation matrix")
# Finish with plt.show(), like the other plotting cells, so the Axes repr is
# not echoed as the cell output.
plt.show()

In [None]:
# Separate the feature matrix from the target column.
x = df.drop(columns='60_R')
y = df['60_R']
# Show both shapes together; as two separate bare expressions only the last
# one (x.shape) was displayed and y.shape was silently dropped.
x.shape, y.shape

In [None]:
# Splitting into training and testing data
from sklearn.model_selection import train_test_split

# test_size=0.2 holds out 20% of the rows for testing; random_state=0 makes
# the shuffle reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

# Print every shape: as bare expressions only the last one was displayed.
print("x_train:", x_train.shape)
print("x_test: ", x_test.shape)
print("y_train:", y_train.shape)
print("y_test: ", y_test.shape)

In [59]:
# Building the model
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, fitted on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
clf_1 = LogisticRegression().fit(x_train, y_train)

# Predicted labels for the held-out test rows.
y_pred = clf_1.predict(x_test)

In [63]:
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score

# Score the logistic-regression predictions against the test labels.
# The metrics are evaluated in the same order as the names they are bound to.
acc, f1, precision, recall = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

In [64]:
# One-row summary table for the logistic-regression scores.
results = pd.DataFrame({
    'Model': ['Logistic Regression'],
    'Accuracy Score': [acc],
    'F1 Score': [f1],
    'Precision': [precision],
    'Recall': [recall],
})
results

Unnamed: 0,Model,Accuracy Score,F1 Score,Precision,Recall
0,Logistic Regression,0.833333,0.810811,0.9375,0.714286


In [66]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the logistic-regression model on the training
# split; the fold scores are converted to percentages before summarising.
cv_score = cross_val_score(clf_1, x_train, y_train, cv=10)
cv_percent = cv_score * 100
accuracy = cv_percent.mean()
deviation = cv_percent.std()
for value in (accuracy, deviation):
    print(value, "%")

76.54411764705883 %
7.317561943896896 %


In [68]:
from sklearn.ensemble import RandomForestClassifier

# A random forest is a randomised learner, so without a seed every run gives
# different predictions and scores. random_state=0 matches the seed already
# used for the train/test split and makes the results reproducible.
clf_2 = RandomForestClassifier(random_state=0)
clf_2.fit(x_train, y_train)

# Predicted labels for the held-out test rows.
y_pred2 = clf_2.predict(x_test)

In [69]:
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score

# Score the random-forest predictions against the test labels.
# Note: this rebinds acc / f1 / precision / recall, replacing the values
# computed for the logistic-regression model.
rf_scorers = (accuracy_score, f1_score, precision_score, recall_score)
acc, f1, precision, recall = (scorer(y_test, y_pred2) for scorer in rf_scorers)

In [None]:
# Add the random-forest scores to the comparison table instead of replacing
# it: rebinding `results` to a fresh one-row frame discarded the
# Logistic Regression row, so the two models could no longer be compared.
rf_results = pd.DataFrame([['Random Forest', acc, f1, precision, recall]],
                          columns=['Model', 'Accuracy Score', 'F1 Score', 'Precision', 'Recall'])
# Drop any earlier 'Random Forest' row first so re-running this cell does not
# pile up duplicate rows.
results = pd.concat(
    [results[results['Model'] != 'Random Forest'], rf_results],
    ignore_index=True,
)
results

In [71]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the random-forest model on the training split;
# the fold scores are converted to percentages before summarising.
cv_score = cross_val_score(clf_2, x_train, y_train, cv=10)
fold_percentages = cv_score * 100
accuracy = fold_percentages.mean()
deviation = fold_percentages.std()
print(accuracy, "%")
print(deviation, "%")

81.80147058823529 %
9.698150329299322 %


In [72]:
# Building the final model
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the final model, and every prediction made with it, is
# the same on each run; 0 matches the seed used for the train/test split.
clf_final = RandomForestClassifier(random_state=0)
clf_final.fit(x_train, y_train)

# Predicted labels for the held-out test rows.
y_pred_final = clf_final.predict(x_test)

In [73]:
# Predicting a single value
# One observation with 60 feature values, wrapped in an outer list because
# predict() expects a 2-D input (one inner list per row).
single_obs=[[0.0114,0.0222,0.0269,0.0384,0.1217,0.2062,0.1489,0.0929,0.1350,0.1799,0.2486,0.2973,0.3672,0.4394,0.5258,0.6755,0.7402,0.8284,0.9033,0.9584,1.0000,0.9982,0.8899,0.7493,0.6367,0.6744,0.7207,0.6821,0.5512,0.4789,0.3924,0.2533,0.1089,0.1390,0.2551,0.3301,0.2818,0.2142,0.2266,0.2142,0.2354,0.2871,0.2596,0.1925,0.1256,0.1003,0.0951,0.1210,0.0728,0.0174,0.0213,0.0269,0.0152,0.0257,0.0097,0.0041,0.0050,0.0145,0.0103,0.0025]]
prediction = clf_final.predict(single_obs)

# predict() returns an array with one label per input row. Index the single
# label explicitly: testing the whole array in `if` relies on it holding
# exactly one element and raises ValueError for any other number of rows.
if prediction[0] == 1:
    print("It is a rock")
else:
    print("It is a mine")

It is a mine
