In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("train.csv",index_col=0)

In [None]:
df.head()

In [3]:
from sklearn.preprocessing import LabelEncoder
fert_encoder = LabelEncoder()
soil_encoder = LabelEncoder()
crop_encoder = LabelEncoder()
encoded_df = df.copy()
encoded_df['Fertilizer Name'] = fert_encoder.fit_transform(encoded_df['Fertilizer Name'])
encoded_df['Soil Type'] = soil_encoder.fit_transform(encoded_df['Soil Type'])
encoded_df['Crop Type'] = crop_encoder.fit_transform(encoded_df['Crop Type'])
X = encoded_df.drop(['Fertilizer Name'], axis=1)
y = encoded_df['Fertilizer Name']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
df.head()

Unnamed: 0_level_0,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,37,70,36,Clayey,Sugarcane,36,4,5,28-28
1,27,69,65,Sandy,Millets,30,6,18,28-28
2,29,63,32,Sandy,Millets,24,12,16,17-17-17
3,35,62,54,Sandy,Barley,39,12,4,10-26-26
4,35,58,43,Red,Paddy,37,2,16,DAP


In [12]:
encoded_df.head()

Unnamed: 0_level_0,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,37,70,36,1,8,36,4,5,4
1,27,69,65,4,4,30,6,18,4
2,29,63,32,4,4,24,12,16,2
3,35,62,54,4,0,39,12,4,0
4,35,58,43,3,6,37,2,16,5


In [None]:
from sklearn.tree import DecisionTreeClassifier
clf = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_leaf_nodes=14),
    n_estimators=300,
    random_state=42
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy of AdaBoost Classifier:", accuracy_score(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=fert_encoder.classes_, yticklabels=fert_encoder.classes_)

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(
    random_state=42
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy of Random Forest Classifier:", accuracy_score(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=fert_encoder.classes_, yticklabels=fert_encoder.classes_)

In [None]:
from sklearn.dummy import DummyClassifier
dummy_clf = DummyClassifier(strategy='most_frequent')
dummy_clf.fit(X_train, y_train)
dummy_y_pred = dummy_clf.predict(X)
print("Accuracy of Dummy Classifier:", accuracy_score(y, dummy_y_pred))
# Plotting the confusion matrix for the dummy classifier
dummy_cm = confusion_matrix(y, dummy_y_pred)
sns.heatmap(dummy_cm, annot=True, fmt='d', cmap='Blues', xticklabels=fert_encoder.classes_, yticklabels=fert_encoder.classes_)
plt.figure(figsize=(8, 6))

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
SGD_clf = make_pipeline(StandardScaler(),SGDClassifier(loss='log_loss'))
SGD_clf.fit(X_train, y_train)
y_pred = SGD_clf.predict(X_test)
print("Accuracy of SGD Classifier:", accuracy_score(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=fert_encoder.classes_, yticklabels=fert_encoder.classes_)

##USE GridSearchCV to find alpha


In [None]:
prob_predictions = SGD_clf.predict_proba(X)
top_3_indices = np.flip(np.argsort(prob_predictions, axis=1)[:, -3:],axis=1)
top_3_indices
accurate_counts=0
for i, val in enumerate(y):
    if val in top_3_indices[i]:
        accurate_counts += 1
print(accurate_counts/len(y))

In [None]:
print(y_pred)
print(top_3_indices)

In [4]:
from sklearn.metrics import make_scorer
def in_top_three_scoring(y_true, y_proba):
    top_3_indices = np.flip(np.argsort(y_proba, axis=1)[:, -3:], axis=1)
    accurate_counts = 0
    for i, val in enumerate(y_true):
        if val in top_3_indices[i]:
            accurate_counts += 1
    return accurate_counts / len(y_true)
top_three_scorer = lambda estimator, X, y : in_top_three_scoring(y, estimator.predict_proba(X))


In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
param_grid = {
    'sgdclassifier__alpha': 10.0**-np.arange(1,7),
    'sgdclassifier__max_iter': [1000, 2000, 3000]
}
SGD_clf = make_pipeline(StandardScaler(),SGDClassifier(loss='log_loss', alpha=.01,max_iter=2000))
grid_search = GridSearchCV(SGD_clf, param_grid, cv=5, scoring=top_three_scorer, n_jobs=-1)
grid_search.fit(X, y)
print("Best parameters found: ", grid_search.best_params_)


KeyboardInterrupt: 

In [75]:
SGD_clf_ada = make_pipeline(StandardScaler(),AdaBoostClassifier(estimator=SGDClassifier(loss='log_loss', alpha=.01,max_iter=2000,verbose=True),n_estimators=500))
SGD_clf_ada.fit(X,y)




-- Epoch 1
Norm: 0.00, NNZs: 8, Bias: -0.000463, T: 750000, Avg. loss: 0.693002
Total training time: 0.31 seconds.
-- Epoch 2
Norm: 0.00, NNZs: 8, Bias: -0.000495, T: 1500000, Avg. loss: 0.692980
Total training time: 0.54 seconds.
-- Epoch 3
Norm: 0.00, NNZs: 8, Bias: -0.000514, T: 2250000, Avg. loss: 0.692971
Total training time: 0.83 seconds.
-- Epoch 4
Norm: 0.00, NNZs: 8, Bias: -0.000527, T: 3000000, Avg. loss: 0.692966
Total training time: 1.05 seconds.
-- Epoch 5
Norm: 0.00, NNZs: 8, Bias: -0.000538, T: 3750000, Avg. loss: 0.692962
Total training time: 1.32 seconds.
-- Epoch 6
Norm: 0.00, NNZs: 8, Bias: -0.000546, T: 4500000, Avg. loss: 0.692958
Total training time: 1.57 seconds.
Convergence after 6 epochs took 1.57 seconds
-- Epoch 1
Norm: 0.00, NNZs: 8, Bias: -0.000475, T: 750000, Avg. loss: 0.692998
Total training time: 0.26 seconds.
-- Epoch 2
Norm: 0.00, NNZs: 8, Bias: -0.000507, T: 1500000, Avg. loss: 0.692976
Total training time: 0.50 seconds.
-- Epoch 3
Norm: 0.00, NNZs: 

KeyboardInterrupt: 

In [None]:
def get_predict_score(estimator,X,y,

In [76]:
SGD_clf = make_pipeline(StandardScaler(),SGDClassifier(loss='log_loss', alpha=.01,max_iter=2000))
SGD_clf.fit(X_train,y_train)

In [77]:
y_pred = SGD_clf.predict_proba(X_test)
in_top_three_scoring(y_test,y_pred)

0.46046

In [53]:
sample = pd.read_csv("sample_submission.csv",index_col=0)
test = pd.read_csv("test.csv",index_col=0)
sample

Unnamed: 0_level_0,Fertilizer Name
id,Unnamed: 1_level_1
750000,14-35-14 10-26-26 Urea
750001,14-35-14 10-26-26 Urea
750002,14-35-14 10-26-26 Urea
750003,14-35-14 10-26-26 Urea
750004,14-35-14 10-26-26 Urea
...,...
999995,14-35-14 10-26-26 Urea
999996,14-35-14 10-26-26 Urea
999997,14-35-14 10-26-26 Urea
999998,14-35-14 10-26-26 Urea


In [54]:
test['Soil Type'] = soil_encoder.transform(test['Soil Type'])
test['Crop Type'] = crop_encoder.transform(test['Crop Type'])
test

Unnamed: 0_level_0,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
750000,31,70,52,4,10,34,11,24
750001,27,62,45,3,8,30,14,15
750002,28,72,28,1,2,14,15,4
750003,37,53,57,0,2,18,17,36
750004,31,55,32,3,7,13,19,14
...,...,...,...,...,...,...,...,...
999995,26,66,30,3,8,14,7,18
999996,33,62,55,3,7,28,14,7
999997,36,53,64,0,6,28,11,27
999998,36,67,26,1,6,33,0,10


In [11]:
y_test_pred = SGD_clf.predict_proba(test)

In [13]:
def mapper(x):
    return fert_encoder.classes_[x]
vectorized_mapper = np.vectorize(mapper)

In [40]:
y_test_pred
y_test_pred_top3 = np.flip(np.argsort(y_test_pred, axis=1)[:, -3:], axis=1)
y_test_pred_top3
y_test_pred_strings = vectorized_mapper(y_test_pred_top3)



In [46]:
aaa = np.apply_along_axis(" ".join, arr=y_test_pred_strings, axis=1)
aaa

array(['20-20 10-26-26 28-28', '10-26-26 17-17-17 20',
       '17-17-17 10-26-26 20', ..., '14-35-14 10-26-26 20',
       '17-17-17 10-26-26 14', '14-35-14 17-17-17 10'], dtype='<U20')

In [67]:
sample['Fertilizer Name']= aaa

In [64]:

vectorized_mapper(np.flip(np.argsort(SGD_clf.predict_proba(test.iloc[:2]), axis=1)[:, -3:], axis=1))

array([['20-20', '10-26-26', '28-28'],
       ['10-26-26', '17-17-17', '20-20']], dtype='<U8')

In [69]:
sample.to_csv("attempt1.csv")