In [3]:
import pandas as pd

# Read the Excel file into a DataFrame
soil_data = pd.read_excel('Soil_Data.xlsx') #typo Contamination

# Display the first few rows to verify the import
print(soil_data.head())

   soilMoisture  soil_pHSensor  soilTemperature  soil_GasValue  status
0            36           4.76            28.44              5       0
1            36           4.13            28.44              5       0
2            37           3.75            28.44              5       0
3            38           3.37            28.44              5       0
4            38           3.23            28.44              5       0


In [4]:
print(soil_data.columns)

Index(['soilMoisture', 'soil_pHSensor', 'soilTemperature', 'soil_GasValue',
       'status'],
      dtype='object')


In [5]:
soil_data = soil_data[soil_data['soil_pHSensor'] != 7.16]

In [6]:
print(soil_data['soil_pHSensor'].dtype)

float64


In [7]:
rows_with_716 = soil_data[soil_data['soil_pHSensor'] == 7.16]
print(rows_with_716)

Empty DataFrame
Columns: [soilMoisture, soil_pHSensor, soilTemperature, soil_GasValue, status]
Index: []


In [8]:
soil_data = soil_data.loc[:, ~soil_data.columns.str.contains('Unnamed')]

In [9]:
print(soil_data.head())

   soilMoisture  soil_pHSensor  soilTemperature  soil_GasValue  status
0            36           4.76            28.44              5       0
1            36           4.13            28.44              5       0
2            37           3.75            28.44              5       0
3            38           3.37            28.44              5       0
4            38           3.23            28.44              5       0


In [10]:
print(soil_data.shape)

(252, 5)


In [11]:
# Convert 'soil_pHSensor' column to numeric (assuming it contains numeric values)
soil_data['soil_pHSensor'] = pd.to_numeric(soil_data['soil_pHSensor'], errors='coerce')

# Convert 'soilTemperature' column to numeric (assuming it contains numeric values)
soil_data['soilTemperature'] = pd.to_numeric(soil_data['soilTemperature'], errors='coerce')

# Check data types after conversion
print(soil_data.dtypes)

soilMoisture         int64
soil_pHSensor      float64
soilTemperature    float64
soil_GasValue        int64
status               int64
dtype: object


In [12]:
#Decision Tree Classifier Building in Scikit-learn

In [13]:
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

In [14]:
# Display first few rows of the DataFrame
print(soil_data.head())

# Get summary information about the DataFrame
print(soil_data.info())

# Get summary statistics of numerical columns
print(soil_data.describe())

   soilMoisture  soil_pHSensor  soilTemperature  soil_GasValue  status
0            36           4.76            28.44              5       0
1            36           4.13            28.44              5       0
2            37           3.75            28.44              5       0
3            38           3.37            28.44              5       0
4            38           3.23            28.44              5       0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 252 entries, 0 to 251
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   soilMoisture     252 non-null    int64  
 1   soil_pHSensor    252 non-null    float64
 2   soilTemperature  252 non-null    float64
 3   soil_GasValue    252 non-null    int64  
 4   status           252 non-null    int64  
dtypes: float64(2), int64(3)
memory usage: 11.8 KB
None
       soilMoisture  soil_pHSensor  soilTemperature  soil_GasValue      status
count    252.

In [15]:
# Check for missing values
print(soil_data.isnull().sum())

# Drop rows with missing values
soil_data.dropna(inplace=True)

soilMoisture       0
soil_pHSensor      0
soilTemperature    0
soil_GasValue      0
status             0
dtype: int64


In [16]:
print(soil_data.describe())

       soilMoisture  soil_pHSensor  soilTemperature  soil_GasValue      status
count    252.000000     252.000000       252.000000     252.000000  252.000000
mean      40.523810       3.632579        29.051667       8.396825    0.365079
std       11.975549       2.286744         0.380834       7.372912    0.482411
min        3.000000       0.010000        28.380000       0.000000    0.000000
25%       29.750000       1.737500        28.500000       5.750000    0.000000
50%       45.000000       3.530000        29.250000       6.000000    0.000000
75%       48.000000       5.627500        29.310000       7.000000    1.000000
max       89.000000       7.150000        29.500000      37.000000    1.000000


In [17]:
#split dataset in features and target variable
feature_cols = ['soilMoisture', 'soil_pHSensor', 'soilTemperature', 'soil_GasValue']
X = soil_data[feature_cols] # Features
y = soil_data.status # Target variable

In [18]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test

In [19]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [20]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9342105263157895


In [21]:
pip install pydotplus

Note: you may need to restart the kernel to use updated packages.


In [22]:
!pip install graphviz



In [23]:
import os

# Replace 'path_to_graphviz' with the actual Graphviz executable path
os.environ["PATH"] += os.pathsep + 'C:\Program Files\Graphviz\bin'

In [35]:
from sklearn.tree import export_graphviz
from io import StringIO
from IPython.display import Image
import pydotplus

# Your code to create dot_data and define feature_cols goes here...

# Create the graph
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

# Set the path to Graphviz executable (replace 'path_to_graphviz' with the correct path)
# For example, on Windows:
graph.set_graphviz_executables({'dot': r'C:\Program Files\Graphviz\bin\dot.exe'})

# Save the graph as an image
graph.write_png('Soil_Contamination_Data.png')
Image(graph.create_png())

NameError: name 'dot_data' is not defined

In [25]:
from IPython.display import FileLink

# Assuming your DataFrame is named 'cleaned_data' and you want to save it as 'cleaned_data.xlsx'
soil_data.to_excel('cleaned_data.xlsx', index=False)  # Save DataFrame to Excel

# Create a download link for the Excel file
FileLink('cleaned_data.xlsx')

In [26]:
# Example new data (replace this with your actual new data)
X_new = [
    [27,10,30,0]
] #Example of Bausite Value

# Predict on the new data
predictions = clf.predict(X_new)
print(predictions)  # Display the predicted labels for the new data


[1]




In [27]:
# Example new data (replace this with your actual new data)
X_new = [
    [50,4,30,30]
] #Example of Suspected Gas Contamination Data

# Predict on the new data
predictions = clf.predict(X_new)
print(predictions)  # Display the predicted labels for the new data

[1]




In [28]:
X_new = [
    [60,5,33,6]
] #Example of Standard Palm Oil Plantation

# Predict on the new data
predictions = clf.predict(X_new)
print(predictions)  # Display the predicted labels for the new data

[0]




In [29]:
# Example new data (replace this with your actual new data)
X_new = [
    [26,5,30,6]
] #Example of suspected contamination

# Predict on the new data
predictions = clf.predict(X_new)
print(predictions)  # Display the predicted labels for the new data

[1]




In [31]:
X_new = [
    [60,5,33,6]
] #Example of Standard Palm Oil Plantation

# Predict on the new data
predictions = clf.predict(X_new)
print(predictions)  # Display the predicted labels for the new data

[0]


