In [27]:
import numpy as np
import pandas as pd

In [28]:
# load the raw crop-recommendation dataset
df = pd.read_csv("data/raw-data-crop-recommendation.csv")

# list of crops to be retained
selected_crops = [
    "rice", "maize", "kidneybeans", "lentil", "grapes",
    "watermelon", "muskmelon", "cotton", "jute", "coffee"
]

# filter the dataframe, keeping only rows whose 'label' is a selected crop.
# .copy() makes the result an independent frame: later cells assign columns
# on df, and doing that on a filtered selection of the raw frame triggers
# pandas' SettingWithCopyWarning.
df = df[df['label'].isin(selected_crops)].copy()

In [29]:
# define the bin edges and the category label for each interval between them
# (edges are in the unit of the 'rainfall' column — presumably mm, TODO confirm)
bins = [0.0, 0.1, 1.0, 11.0, 31.0, 71.0, 151.0, float('inf')]
labels = ['No Rain', 'Very Light Rain', 'Light Rain', 'Moderate Rain', 'Heavy Rain', 'Very Heavy Rain', 'Extremely Heavy Rain']

# convert the numerical rainfall into its categorical value.
# pd.cut builds right-closed intervals (a, b] and by default leaves the very
# first left edge open, so a rainfall of exactly 0.0 would fall outside every
# bin and become NaN. include_lowest=True closes the first interval to
# [0.0, 0.1] so that 0.0 is labelled 'No Rain'.
df['rainfall'] = pd.cut(df['rainfall'], bins=bins, labels=labels, include_lowest=True)

In [30]:
# rename vague columns
df = df.rename(columns={
    "N": "Nitrogen Content",
    "P": "Phosphorous Content",
    "K": "Potassium Content"
})

# obtain numerical data columns (non-numeric columns are left untouched)
numerical_columns = df.select_dtypes(include=[np.number]).columns

# min-max normalize each numerical column: (x - min) / (max - min)
column_min = df[numerical_columns].min()
column_range = df[numerical_columns].max() - column_min

# a constant column has a zero range, and 0 / 0 would turn the whole column
# into NaN; dividing by 1 instead maps such a column to 0
column_range = column_range.replace(0, 1)

df[numerical_columns] = (df[numerical_columns] - column_min) / column_range

In [26]:
# persist the processed dataframe as CSV, omitting the row index column
df.to_csv(path_or_buf='data/processed-data.csv', index=False)

# confirm where the output was written
print('processed data saved to data/processed-data.csv')