# **Multiclass Prediction Model**

## **Loading Data**

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the dataset from the working directory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='dataset.csv')
df.head(n=5)

Unnamed: 0,N,P,K,temperature,humidity,ph,label,type
0,90,42.0,43,20.879744,82.002744,6.502985,rice,loamy
1,85,58.0,41,21.770462,80.319644,7.038096,rice,loamy
2,60,55.0,44,23.004459,82.320763,7.840207,rice,loamy
3,74,35.0,40,26.491096,80.158363,6.980401,rice,loamy
4,78,42.0,42,20.130175,81.604873,7.628473,rice,loamy


In [4]:
# Summarise column names, non-null counts and dtypes to sanity-check the load.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3080 entries, 0 to 3079
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N            3080 non-null   int64  
 1   P            3080 non-null   float64
 2   K            3080 non-null   int64  
 3   temperature  3080 non-null   float64
 4   humidity     3080 non-null   float64
 5   ph           3080 non-null   float64
 6   label        3080 non-null   object 
 7   type         3080 non-null   object 
dtypes: float64(4), int64(2), object(2)
memory usage: 192.6+ KB


### **Checking if the dataset has null values**

In [5]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

N              0
P              0
K              0
temperature    0
humidity       0
ph             0
label          0
type           0
dtype: int64

## **Data Splitting**

In [6]:
# Two target columns are predicted jointly; every remaining column is a feature.
# NOTE(review): some predicted labels are printed with a trailing space
# (e.g. 'well drained loamy ', 'Laterite ') - confirm whether the raw CSV holds
# whitespace variants of the same class before relying on per-class results.
y = df.loc[:, ['label', 'type']]
x = df.drop(columns=['label', 'type'])



In [7]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=100,
)

# Preview the training targets.
y_train

Unnamed: 0,label,type
2318,jute,Alluvial
2183,coffee,Laterite
2689,jute,Alluvial
1205,grapes,well drained loamy
1527,apple,loamy
...,...,...
1930,cotton,sandy loam
79,rice,loamy
1859,coconut,sandy loam
2840,jute,Alluvial


In [8]:
# Display the training feature matrix produced by the split above.
x_train

Unnamed: 0,N,P,K,temperature,humidity,ph
2318,119,8.3,444,18.488000,69.650000,7.600000
2183,93,26.0,27,24.592457,56.468296,7.288212
2689,333,6.8,422,20.757000,69.742000,7.720000
1205,2,123.0,198,39.648519,82.210799,6.253035
1527,25,143.0,198,22.812125,91.518617,6.027314
...,...,...,...,...,...,...
1930,118,45.0,23,23.370444,77.431989,7.977651
79,81,41.0,38,22.678461,83.728744,7.524080
1859,37,10.0,32,28.963183,95.163337,6.165085
2840,182,9.2,465,18.555000,69.524000,7.400000


## **K-Nearest Neighbors**

### **Model Building**

In [8]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier

# Base learner: 4-nearest-neighbours. MultiOutputClassifier fits one copy of it
# per target column ('label' and 'type'); n_jobs=-1 is passed to the wrapper.
knn = KNeighborsClassifier(n_neighbors=4)
knn_multi_classifier = MultiOutputClassifier(estimator=knn, n_jobs=-1).fit(x_train, y_train)

# Predictions come back as an array with one column per target.
y_knn_multi_pred = knn_multi_classifier.predict(x_test)
y_knn_multi_pred

array([['cotton', 'sandy loam'],
       ['orange', 'loamy'],
       ['chickpea', 'sandy loam'],
       ...,
       ['kidneybeans', 'sandy loam'],
       ['watermelon', 'well drained loamy '],
       ['mungbean', 'loamy']], dtype=object)

### **Model Evaluation**

In [20]:
from sklearn.metrics import accuracy_score

# Score each target column separately: column k of y_test is compared with
# column k of the prediction array.
accuracy_scores = [
    accuracy_score(y_test.iloc[:, target_idx], y_knn_multi_pred[:, target_idx])
    for target_idx in range(y_test.shape[1])
]

# Unweighted mean of the per-target accuracies.
average_accuracy_knn = sum(accuracy_scores) / len(accuracy_scores)

print("Average Accuracy(KNN):", average_accuracy_knn)


Average Accuracy(KNN): 0.9464285714285714


## **Random Forest**

### **Model Building**

In [10]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

# Base learner: a 100-tree random forest with a fixed seed. MultiOutputClassifier
# fits one copy of it per target column; n_jobs=-1 is passed to the wrapper.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_multi_classifier = MultiOutputClassifier(estimator=rf, n_jobs=-1).fit(x_train, y_train)

# Predictions come back as an array with one column per target.
y_rf_multi_pred = rf_multi_classifier.predict(x_test)

print("Predictions (Random Forest):", y_rf_multi_pred)

Predictions (Random Forest): [['cotton' 'sandy loam']
 ['orange' 'loamy']
 ['chickpea' 'sandy loam']
 ...
 ['kidneybeans' 'sandy loam']
 ['watermelon' 'well drained loamy ']
 ['mungbean' 'loamy']]


### **Model Evaluation**

In [21]:
from sklearn.metrics import accuracy_score

# Score each target column separately: column k of y_test is compared with
# column k of the prediction array.
accuracy_scores = [
    accuracy_score(y_test.iloc[:, target_idx], y_rf_multi_pred[:, target_idx])
    for target_idx in range(y_test.shape[1])
]

# Unweighted mean of the per-target accuracies.
average_accuracy_rf = sum(accuracy_scores) / len(accuracy_scores)

print("Average Accuracy(Random Forest):", average_accuracy_rf)


Average Accuracy(Random Forest): 0.9821428571428572


## **Support Vector Machines**

### **Model Building**

In [13]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC

# Base learner: a linear-kernel SVC with C=1.0 and a fixed seed.
# MultiOutputClassifier fits one copy of it per target column.
svm = SVC(kernel='linear', C=1.0, random_state=42)
svm_multi_classifier = MultiOutputClassifier(estimator=svm).fit(x_train, y_train)

# Predictions come back as an array with one column per target.
y_svm_multi_pred = svm_multi_classifier.predict(x_test)

print("Predictions (SVM):", y_svm_multi_pred)

Predictions (SVM): [['cotton' 'Laterite ']
 ['orange' 'loamy']
 ['chickpea' 'sandy loam']
 ...
 ['kidneybeans' 'sandy loam']
 ['watermelon' 'loamy']
 ['mungbean' 'loamy']]


### **Model Evaluation**

In [22]:
from sklearn.metrics import accuracy_score

# Score each target column separately: column k of y_test is compared with
# column k of the prediction array.
accuracy_scores = [
    accuracy_score(y_test.iloc[:, target_idx], y_svm_multi_pred[:, target_idx])
    for target_idx in range(y_test.shape[1])
]

# Unweighted mean of the per-target accuracies.
average_accuracy_svm = sum(accuracy_scores) / len(accuracy_scores)

print("Average Accuracy(SVM):", average_accuracy_svm)


Average Accuracy(SVM): 0.8628246753246753


## **Decision Tree**

### **Model Building**

In [23]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import DecisionTreeClassifier
# Base learner: a decision tree capped at depth 5 with a fixed seed.
# MultiOutputClassifier fits one copy of it per target column.
dt = DecisionTreeClassifier(max_depth=5, random_state=42)
dt_multi_classifier = MultiOutputClassifier(estimator=dt).fit(x_train, y_train)

# Predictions come back as an array with one column per target.
y_dt_multi_pred = dt_multi_classifier.predict(x_test)

print("Predictions (Decision Tree):", y_dt_multi_pred)

Predictions (Decision Tree): [['cotton' 'sandy loam']
 ['orange' 'loamy']
 ['chickpea' 'sandy loam']
 ...
 ['chickpea' 'sandy loam']
 ['watermelon' 'well drained loamy ']
 ['orange' 'loamy']]


### **Model Evaluation**

In [31]:
from sklearn.metrics import accuracy_score

# Score each target column separately: column k of y_test is compared with
# column k of the prediction array.
accuracy_scores = [
    accuracy_score(y_test.iloc[:, target_idx], y_dt_multi_pred[:, target_idx])
    for target_idx in range(y_test.shape[1])
]

# Unweighted mean of the per-target accuracies.
average_accuracy_dt = sum(accuracy_scores) / len(accuracy_scores)

print("Average Accuracy(Decision Tree):", average_accuracy_dt)


Average Accuracy(Decision Tree): 0.7378246753246753


## **Gradient Boosting**

### **Model Building**

In [9]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Base learner: gradient boosting with 100 stages of depth-3 trees and a fixed seed.
# MultiOutputClassifier fits one copy of it per target column.
gb = GradientBoostingClassifier(n_estimators=100, max_depth=3, random_state=42)
gb_multi_classifier = MultiOutputClassifier(estimator=gb).fit(x_train, y_train)

# Predictions come back as an array with one column per target.
y_gb_multi_pred = gb_multi_classifier.predict(x_test)

print("Predictions (Gradient Boosting):", y_gb_multi_pred)

Predictions (Gradient Boosting): [['cotton' 'sandy loam']
 ['orange' 'loamy']
 ['chickpea' 'sandy loam']
 ...
 ['kidneybeans' 'sandy loam']
 ['watermelon' 'well drained loamy ']
 ['mungbean' 'loamy']]


### **Model Evaluation**

In [10]:
from sklearn.metrics import accuracy_score

# Score each target column separately: column k of y_test is compared with
# column k of the Gradient Boosting prediction array.
accuracy_scores = [
    accuracy_score(y_test.iloc[:, target_idx], y_gb_multi_pred[:, target_idx])
    for target_idx in range(y_test.shape[1])
]

# Unweighted mean of the per-target accuracies.
average_accuracy_gb = sum(accuracy_scores) / len(accuracy_scores)

# The label names the model actually being scored here; it previously read
# "Gaussian NB", a copy-paste slip that mislabelled the Gradient Boosting result.
print("Average Accuracy(Gradient Boosting):", average_accuracy_gb)


Average Accuracy(Gaussian NB): 0.9715909090909091


## **Gaussian Naive Bayes**

### **Model Building**

In [32]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import GaussianNB
# Base learner: Gaussian Naive Bayes with default settings.
# MultiOutputClassifier fits one copy of it per target column.
gnb = GaussianNB()
gnb_multi_classifier = MultiOutputClassifier(estimator=gnb).fit(x_train, y_train)

# Predictions come back as an array with one column per target.
y_gnb_multi_pred = gnb_multi_classifier.predict(x_test)

print("Predictions (Gaussian Naive Bayes):", y_gnb_multi_pred)

Predictions (Gaussian Naive Bayes): [['cotton' 'loamy']
 ['orange' 'loamy']
 ['chickpea' 'sandy loam']
 ...
 ['kidneybeans' 'sandy loam']
 ['rice' 'loamy']
 ['mungbean' 'sandy loam']]


### **Model Evaluation**

In [34]:
from sklearn.metrics import accuracy_score

# Score each target column separately: column k of y_test is compared with
# column k of the prediction array.
accuracy_scores = [
    accuracy_score(y_test.iloc[:, target_idx], y_gnb_multi_pred[:, target_idx])
    for target_idx in range(y_test.shape[1])
]

# Unweighted mean of the per-target accuracies.
average_accuracy_gnb = sum(accuracy_scores) / len(accuracy_scores)

print("Average Accuracy(Gaussian Nb):", average_accuracy_gnb)


Average Accuracy(Gaussian Nb): 0.7646103896103896


## **Model Comparison**

In [35]:
# Keep each classifier name next to its score so the two columns cannot drift
# out of alignment when a model is added or removed.
model_scores = [
    ('KNN', average_accuracy_knn),
    ('Random Forest', average_accuracy_rf),
    ('SVM', average_accuracy_svm),
    ('Decision Tree', average_accuracy_dt),
    ('Gradient Boosting', average_accuracy_gb),
    ('Gaussian Naive Bayes', average_accuracy_gnb),
]

accuracies = {
    'Classifier': [name for name, _ in model_scores],
    'Average Accuracy': [score for _, score in model_scores],
}
accuracy_table = pd.DataFrame(accuracies)

# Side-by-side view of the average accuracy of every model.
accuracy_table

Unnamed: 0,Classifier,Average Accuracy
0,KNN,0.946429
1,Random Forest,0.982143
2,SVM,0.862825
3,Decision Tree,0.737825
4,Gradient Boosting,0.971591
5,Gaussian Naive Bayes,0.76461


## **Top 3 MultiClassification Prediction**

In [11]:
# One hand-written sample, given in the training feature order
# (N, P, K, temperature, humidity, ph).
sample_array = np.array([[10, 40, 30, 25, 65, 6.8]])

# The model was fitted on a DataFrame, so hand it a DataFrame carrying the same
# column names. This keeps the input consistent with training and avoids
# scikit-learn's "X does not have valid feature names" warning for bare arrays.
sample_features = pd.DataFrame(sample_array, columns=x_train.columns)

# predict_proba returns one array per target variable, each with one row per
# sample and one column per class of that target.
probabilities = gb_multi_classifier.predict_proba(sample_features)
prediction = gb_multi_classifier.predict(sample_features)

# Probabilities of the single sample, converted to percentages, per target.
probabilities_scaled = [(target_probs[0] * 100).tolist() for target_probs in probabilities]

# Indices of the three most probable classes per target, most probable first.
top3_indices_per_target_variable = [np.argsort(target_probs[0])[-3:][::-1] for target_probs in probabilities]

for i, (top3_indices, probs) in enumerate(zip(top3_indices_per_target_variable, probabilities_scaled)):
    # classes_[i] holds the labels of target i in the same order as the
    # probability columns. The indices come from sorting that same row, so they
    # are always in range and no bounds check is needed.
    class_names = gb_multi_classifier.classes_[i]
    print("Top 3 Predictions for Target variable {}:".format(i+1))
    for j, index in enumerate(top3_indices):
        print("{}. Class: {}, Probability: {:.4f}".format(j+1, class_names[index], probs[index]))
    print("------------------------------------------")

# Overall prediction
print("Overall Prediction:", prediction[0])

Top 3 Predictions for Target variable 1:
1. Class: lentil, Probability: 67.4102
2. Class: jute, Probability: 25.8292
3. Class: pigeonpeas, Probability: 2.3512
------------------------------------------
Top 3 Predictions for Target variable 2:
1. Class: loamy, Probability: 52.2137
2. Class: sandy loam, Probability: 47.5685
3. Class: Alluvial, Probability: 0.1609
------------------------------------------
Overall Prediction: ['lentil' 'loamy']




## **Export the model into a separate file**

In [17]:
import joblib
import pickle

# Persist the fitted Gradient Boosting multi-output model to disk.
# NOTE(review): Random Forest scored higher in the comparison table - confirm
# that Gradient Boosting is the intended model to ship.
# Pickle-based files execute code when loaded, so only load this file from a
# trusted location.
file_name = 'model.pkl'
joblib.dump(value=gb_multi_classifier, filename=file_name)

['model.pkl']