In [14]:
import pandas as pd

# Read the merged soil dataset into a DataFrame.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, presumably a row
# index written out by an earlier to_csv call -- confirm, and consider dropping it.
df = pd.read_csv(filepath_or_buffer='/content/Merged_Soil_Data.csv')

# Preview the first five rows
df.head(n=5)


Unnamed: 0.1,Unnamed: 0,DISTRICT,BLOCK,PANCHAYAT,VILLAGE,LATITUDE,LONGITUDE,PH,P,K,CA,MG
0,0,Kozhikode,Chelannur,Chelannur,Chelannur,11.354,75.779,6.3,3.36,262.86,442.65,57.45
1,1,Kozhikode,Chelannur,Chelannur,Chelannur,11.352,75.78,6.3,2.24,264.43,373.75,28.4
2,2,Kozhikode,Chelannur,Chelannur,Chelannur,11.358,75.78,5.9,4.48,171.36,182.95,21.81
3,3,Kozhikode,Chelannur,Chelannur,Chelannur,11.362,75.779,6.3,4.48,337.12,274.25,31.9
4,4,Kozhikode,Chelannur,Chelannur,Chelannur,11.364,75.781,7.2,12.32,972.94,213.25,41.5


In [15]:
# Baseline check BEFORE any cleaning: report the raw shape and nutrient statistics
# so they can be compared with the post-cleaning summary later in the notebook.
# describe() summarises only numeric columns by default, so a nutrient column that
# was read in as text is absent from this table until it is converted to numeric.
print("Raw data shape:", df.shape)
print("\nSummary statistics:\n", df[['P', 'K', 'CA', 'MG', 'PH']].describe())

Cleaned data shape: (6830, 12)

Summary statistics:
                  P            K           MG           PH
count  6830.000000  6830.000000  6830.000000  6830.000000
mean     48.968876   317.412140   168.437465     5.978571
std      45.127264   195.896644   118.971234     0.801064
min       1.120000    10.860000     0.300000     0.200000
25%      18.020000   185.597500    82.787500     5.400000
50%      32.480000   269.975000   138.520000     6.000000
75%      62.540000   398.215000   239.580000     6.500000
max     313.600000  1200.000000   500.000000     8.000000


In [16]:
# Count the missing (NaN/None) entries in every column
df.isna().sum()


Unnamed: 0,0
Unnamed: 0,0
DISTRICT,0
BLOCK,0
PANCHAYAT,0
VILLAGE,0
LATITUDE,0
LONGITUDE,0
PH,0
P,0
K,0


# Remove unrealistic nutrient readings that fall too far outside the typical range (1.5 × IQR rule)

In [17]:
import numpy as np

def remove_outliers_iqr(data: pd.DataFrame, column: str, factor: float = 1.5) -> pd.DataFrame:
    """Return the rows of ``data`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and Q3
    are the 25th and 75th percentiles of ``data[column]`` and ``IQR = Q3 - Q1``.
    Values exactly on a fence are kept. Rows whose value is NaN are dropped,
    because NaN never compares as being inside the range.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is not modified.
    column : str
        Name of the numeric column used to compute the quartiles and to filter on.
    factor : float, default 1.5
        Multiplier applied to the IQR to position the fences. The default of 1.5
        reproduces the conventional box-plot rule; larger values remove fewer rows.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` inside the fences, with their original index labels.

    Raises
    ------
    ValueError
        If ``factor`` is negative.
    KeyError
        If ``column`` is not a column of ``data``.
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    values = data[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = q3 - q1
    # between() is inclusive on both ends by default, matching >= lower and <= upper.
    inside = values.between(q1 - factor * spread, q3 + factor * spread)
    return data[inside]

# Single source of truth for the columns that are cleaned below.
NUTRIENT_COLUMNS = ['P', 'K', 'CA', 'MG', 'PH']

# Coerce every nutrient column to numeric (unparseable entries become NaN), then
# drop the rows where any nutrient value is missing. Both steps return new frames
# rather than mutating the existing one in place.
df = (
    df
    .assign(**{col: pd.to_numeric(df[col], errors='coerce') for col in NUTRIENT_COLUMNS})
    .dropna(subset=NUTRIENT_COLUMNS)
)

# Apply the IQR filter one column at a time. The filters are sequential: each
# column's quartiles are computed on the rows that survived the previous columns,
# so the result depends on the order of NUTRIENT_COLUMNS.
for col in NUTRIENT_COLUMNS:
    df = remove_outliers_iqr(df, col)

# Check Cleaned Data

In [18]:
# Post-cleaning check: row/column count and nutrient statistics after outlier removal
nutrient_summary = df[['P', 'K', 'CA', 'MG', 'PH']].describe()
print("Cleaned data shape:", df.shape)
print("\nSummary statistics:\n", nutrient_summary)


Cleaned data shape: (5783, 12)

Summary statistics:
                  P            K           CA           MG           PH
count  5783.000000  5783.000000  5783.000000  5783.000000  5783.000000
mean     37.579664   280.746873   470.878020   151.845830     5.967771
std      28.502079   138.039556   289.049984   104.531649     0.777158
min       1.120000    10.860000     1.520000     0.300000     3.800000
25%      16.880000   180.825000   243.950000    80.075000     5.400000
50%      28.000000   254.580000   416.350000   127.150000     6.000000
75%      50.400000   364.220000   668.375000   218.300000     6.500000
max     129.120000   702.350000  1312.400000   447.000000     8.000000


In [19]:
# Write the cleaned frame to CSV; index=False keeps the row index out of the file.
output_path = '/content/Cleaned_Soil_Data.csv'
df.to_csv(path_or_buf=output_path, index=False)
print("✅ Cleaned dataset saved at:", output_path)

✅ Cleaned dataset saved at: /content/Cleaned_Soil_Data.csv
