In [2]:
import numpy as np
import pandas as pd

In [3]:
# load data

data = pd.read_csv('data/Crop_recommendation.csv', sep=',')
print(data)

        N   P   K  temperature   humidity        ph    rainfall   label
0      90  42  43    20.879744  82.002744  6.502985  202.935536    rice
1      85  58  41    21.770462  80.319644  7.038096  226.655537    rice
2      60  55  44    23.004459  82.320763  7.840207  263.964248    rice
3      74  35  40    26.491096  80.158363  6.980401  242.864034    rice
4      78  42  42    20.130175  81.604873  7.628473  262.717340    rice
...   ...  ..  ..          ...        ...       ...         ...     ...
2195  107  34  32    26.774637  66.413269  6.780064  177.774507  coffee
2196   99  15  27    27.417112  56.636362  6.086922  127.924610  coffee
2197  118  33  30    24.131797  67.225123  6.362608  173.322839  coffee
2198  117  32  34    26.272418  52.127394  6.758793  127.175293  coffee
2199  104  18  30    23.603016  60.396475  6.779833  140.937041  coffee

[2200 rows x 8 columns]


In [4]:
# replace the value of ? to NaN
data.replace("?", np.NaN, inplace = True)
print(data.isna().sum())

# drop the row which has NaN value
data.dropna(inplace = True)

N              0
P              0
K              0
temperature    0
humidity       0
ph             0
rainfall       0
label          0
dtype: int64


In [5]:
# encoding labels

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(data['label'])
data['label'] = encoder.transform(data['label'])


def decode_data(encoded_list):

    dd = encoder.inverse_transform(encoded_list)
    
    return dd

In [6]:
### test
data['label'].value_counts()

21    100
19    100
2     100
4     100
6     100
8     100
10    100
12    100
14    100
16    100
18    100
20    100
1     100
3     100
5     100
7     100
9     100
11    100
13    100
15    100
17    100
0     100
Name: label, dtype: int64

In [7]:
# scaling data

from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
scaler.fit(data[['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']])

data[['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']] = scaler.transform(data[['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']])

data

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,0.837945,-0.225,0.379310,-0.814696,0.051525,0.081874,1.809703,20
1,0.758893,0.175,0.310345,-0.660920,-0.005171,0.643995,2.206918,20
2,0.363636,0.100,0.413793,-0.447878,0.062237,1.486593,2.831689,20
3,0.584980,-0.400,0.275862,0.154068,-0.010603,0.583388,2.478345,20
4,0.648221,-0.225,0.344828,-0.944105,0.038122,1.264171,2.810808,20
...,...,...,...,...,...,...,...,...
2195,1.106719,-0.425,0.000000,0.203019,-0.473607,0.372938,1.388357,5
2196,0.980237,-0.900,-0.172414,0.313938,-0.802942,-0.355190,0.553572,5
2197,1.280632,-0.450,-0.068966,-0.253250,-0.446259,-0.065589,1.313810,5
2198,1.264822,-0.475,0.068966,0.116314,-0.954826,0.350593,0.541024,5


In [8]:
# make feature table

feature_table = pd.DataFrame()

for i in range(22):
    represent_feature = data[data['label']==i].mean().transpose()
    feature_table = feature_table.append(represent_feature, ignore_index = True)

feature_table

Unnamed: 0,K,N,P,humidity,label,ph,rainfall,temperature
0,5.78931,-0.256126,2.0805,0.399512,0.0,-0.520387,0.297863,-0.512363
1,0.622414,0.999684,0.77525,-0.003875,1.0,-0.463419,0.16343,0.306978
2,-0.44,0.047747,0.41175,-0.517224,2.0,0.744689,-0.451865,0.755255
3,1.652414,0.048854,0.41975,-2.142793,3.0,0.957941,-0.247985,-1.161174
4,-0.048621,-0.23747,-0.85175,0.484091,4.0,-0.471121,1.353394,0.312692
5,-0.071034,1.01502,-0.5565,-0.727707,5.0,0.3837,1.058324,-0.010051
6,-0.428966,1.276996,-0.119,-0.02121,6.0,0.512244,-0.242307,-0.277911
7,5.796897,-0.218498,2.03825,0.047229,7.0,-0.419254,-0.422933,-0.301974
8,0.275517,0.654545,-0.1035,-0.028069,8.0,0.323265,1.338425,-0.110547
9,-0.412069,-0.256917,0.4135,-1.982961,9.0,-0.709738,0.185079,-0.94671


In [9]:
# get cos similarity table

from sklearn.metrics.pairwise import cosine_similarity

feature_table_without_label = feature_table[['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']]
cos = cosine_similarity(feature_table_without_label, feature_table_without_label)

In [10]:
# select top 5 in similarity

def similarity_top_5(similarity_table, crop):
    sort_indice = np.argsort(cos[crop])
    top_5_index = sort_indice[[-2, -3, -4, -5, -6]]
    
    return top_5_index

similarity_top_5(cos, 20)

array([ 8,  5,  4, 18, 17], dtype=int64)

In [11]:
# our recommendation function
# input : your location number

def recommend_content_based_filtering(where):
    check_preference = data.iloc[where]['label']
    check_preference = int(check_preference)
    
    recommend_top_5 = similarity_top_5(cos, check_preference)
    
    recommend_crops = decode_data(recommend_top_5)
    
    print(f"For location {where} : ")
    print(f"the best crop in this location : {decode_data([check_preference])}")
    print(f"our recommendation crops : {recommend_crops}")
    print("\n")

In [12]:
recommend_content_based_filtering(0)

recommend_content_based_filtering(500)

recommend_content_based_filtering(2176)

For location 0 : 
the best crop in this location : ['rice']
our recommendation crops : ['jute' 'coffee' 'coconut' 'pigeonpeas' 'papaya']


For location 500 : 
the best crop in this location : ['mothbeans']
our recommendation crops : ['blackgram' 'lentil' 'mungbean' 'mango' 'kidneybeans']


For location 2176 : 
the best crop in this location : ['coffee']
our recommendation crops : ['jute' 'rice' 'cotton' 'maize' 'coconut']


