In [47]:
import numpy as np
import pandas as pd

In [48]:
# load the raw crop-recommendation dataset
df = pd.read_csv("data/raw-data-crop-recommendation.csv")

# list of crops to be retained
selected_crops = [
    "rice", "maize", "kidneybeans", "lentil", "grapes",
    "watermelon", "muskmelon", "cotton", "jute", "coffee"
]

# filter the dataframe down to the retained crops.
# .copy() makes the result an independent frame: later cells assign whole
# columns back into df, and doing that on a boolean-mask slice of the raw
# frame raises SettingWithCopyWarning.
df = df[df['label'].isin(selected_crops)].copy()

In [49]:
# define the bin edges and the label for each interval between two
# consecutive edges (so there is always one label fewer than edges)
ph_bins = [0.0, 3.5, 4.5, 5.1, 5.6, 6.1, 6.6, 7.4, 7.9, 8.5, 9.0, 14.0]
ph_labels = [
    'Ultra Acid', 'Extremely Acid', 'Very Strongly Acid', 'Strongly Acid',
    'Moderately Acid', 'Slightly Acid', 'Neutral', 'Slightly Alkaline',
    'Moderately Alkaline', 'Strongly Alkaline', 'Very Strongly Alkaline'
]

rainfall_bins = [0.0, 0.1, 1.0, 11.0, 31.0, 71.0, 151.0, float('inf')]
rainfall_labels = [
    'No Rain', 'Very Light Rain', 'Light Rain', 'Moderate Rain',
    'Heavy Rain', 'Very Heavy Rain', 'Extremely Heavy Rain'
]

# pd.cut builds right-closed intervals by default, which leaves the lowest
# edge open: a value of exactly 0.0 would fall outside every bin and become
# NaN (so a rainfall of 0.0 could never be labelled 'No Rain').
# include_lowest=True closes the first interval on the left to cover it.
# Values outside the overall bin range still become NaN.

# convert the numerical ph into its categorical value
df['ph'] = pd.cut(df['ph'], bins=ph_bins, labels=ph_labels, include_lowest=True)

# convert the numerical rainfall into its categorical value
df['rainfall'] = pd.cut(df['rainfall'], bins=rainfall_bins, labels=rainfall_labels, include_lowest=True)

In [50]:
# rename vague columns
df = df.rename(columns={
    "N": "Nitrogen Content",
    "P": "Phosphorous Content",
    "K": "Potassium Content"
})

# obtain numerical data columns
numerical_columns = df.select_dtypes(include=[np.number]).columns

# min-max normalize every numerical column: (x - min) / (max - min).
# The per-column minimum and range are computed once and reused.
numerical_min = df[numerical_columns].min()
numerical_range = df[numerical_columns].max() - numerical_min

# a constant column has a range of 0, and 0 / 0 would turn the whole column
# into NaN; dividing by 1 instead maps such a column to 0.0
numerical_range = numerical_range.where(numerical_range != 0, 1)

df[numerical_columns] = (df[numerical_columns] - numerical_min) / numerical_range

In [51]:
# save final dataframe to an external csv file.
# The destination is held in one variable so the file that is written and
# the path reported in the message cannot drift apart.
output_path = 'data/processed-data.csv'
df.to_csv(output_path, index=False)  # index=False: do not write the row index as a column
print(f'processed data saved to {output_path}')

processed data saved to data/processed-data.csv
