In [None]:
# Set up: import the core libraries and list the available input data files

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the rice dataset and discard the 'id' column
# (presumably just a row identifier, not a feature -- confirm against the data).
rice = (
    pd.read_csv('/kaggle/input/rice-type-classification/riceClassification.csv')
    .drop(columns='id')
)
rice.head()

In [None]:
# Number of rows per value of the target column 'Class'
# (the original author notes the target is binary).
rice['Class'].value_counts() # Binary

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 80/20 shuffle split on 'Class', seeded for reproducibility.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair of
# positional index arrays.
train_index, test_index = next(split.split(rice, rice['Class']))
print(train_index, test_index)

strat_train_set = rice.iloc[train_index]
strat_test_set = rice.iloc[test_index]

In [None]:
# Scaling data
from sklearn.preprocessing import StandardScaler

# Separate the target from the feature columns of the training set.
strat_train_set_labels = strat_train_set['Class'].copy()
strat_train_set_predictors = strat_train_set.drop(columns='Class')

# The scaler is fitted on the training features only; the fitted `sc` is
# reused later to transform the test set.
sc = StandardScaler()
train_prepared = sc.fit_transform(strat_train_set_predictors)
train_prepared

In [None]:
# Evaluating different models: instantiate one candidate classifier of each
# family with default hyper-parameters.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Seed for the estimators whose training is stochastic, so that the
# cross-validation scores and the grid-search winner are reproducible
# across kernel restarts. Same value as the train/test split seed.
RANDOM_STATE = 42

forest_clf = RandomForestClassifier(random_state=RANDOM_STATE)
sgd_clf = SGDClassifier(random_state=RANDOM_STATE)
svc_clf = SVC()
kn_clf = KNeighborsClassifier()
nb_clf = GaussianNB()
lg_clf = LogisticRegression()

In [None]:
from sklearn.model_selection import cross_val_score

# Explicit name -> estimator mapping. This replaces looking the estimators up
# with eval() on their variable names, which is fragile (breaks silently on a
# rename) and executes arbitrary strings.
estimators = {
    'lg_clf': lg_clf,
    'nb_clf': nb_clf,
    'forest_clf': forest_clf,
    'sgd_clf': sgd_clf,
    'svc_clf': svc_clf,
    'kn_clf': kn_clf,
}

# `models` (the ordered list of names) is kept because the plotting cells
# iterate over it.
models = list(estimators)
dic_models_scores = {}

# 5-fold cross-validated accuracy for every candidate on the scaled training
# set. For each model three entries are stored:
#   '<name>'        -> list of the per-fold scores
#   '<name>_means'  -> mean of the fold scores
#   '<name>_std'    -> standard deviation of the fold scores
for model, estimator in estimators.items():
    scores = cross_val_score(estimator, train_prepared, strat_train_set_labels,
                             cv=5, scoring="accuracy")

    dic_models_scores[model] = list(scores)
    dic_models_scores[f'{model}_means'] = scores.mean()
    dic_models_scores[f'{model}_std'] = scores.std()

In [None]:
# Display the raw cross-validation results: per-fold scores plus the
# '<name>_means' and '<name>_std' summary entries for each model.
dic_models_scores

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Mean cross-validated accuracy of each model, in the same order as `models`
# (derived from the list instead of being spelled out by hand).
mean_scores = [dic_models_scores[f'{name}_means'] for name in models]

fig, ax = plt.subplots(figsize=(8, 5))
# x and y must be passed by keyword: seaborn >= 0.12 made them keyword-only,
# so the positional form raises a TypeError there.
sns.barplot(x=models, y=mean_scores, ax=ax)
ax.set_title('Precisión de los modelos')  # Spanish: "Accuracy of the models"
ax.set_ylabel('Mean CV accuracy')
# Denser ticks near the top, where the model scores are expected to cluster.
ax.set_yticks([0.2, 0.4, 0.6, 0.8, 0.9, 0.95, 1.0])
ax.grid(axis='y')
plt.show()

In [None]:
# One line per model showing its accuracy on each of the five CV folds.
plt.figure(figsize=(10, 5))

fold_numbers = range(1, 6)  # folds 1..5, matching cv = 5 in the scoring loop
for model_name in models:
    fold_scores = dic_models_scores[model_name]
    plt.plot(fold_numbers, fold_scores, label=model_name)

plt.title('5 cross-validation with 6 models')
plt.legend()
plt.show()

In [None]:
# Take a closer look at the random forest:
# fine-tune its hyper-parameters with an exhaustive grid search.
from sklearn.model_selection import GridSearchCV

# 3 x 2 x 2 = 12 candidate combinations, each scored by 5-fold CV accuracy.
# 'auto' is intentionally not listed for max_features: for classifiers it was
# just an alias of 'sqrt' (so it only duplicated fits), it was deprecated in
# scikit-learn 1.1 and removed in 1.3, where it makes those fits fail.
param_grid = [{'n_estimators': [10, 100, 200],
               'criterion': ['gini', 'entropy'],
               'max_features': ['sqrt', 'log2']}]

grid_search = GridSearchCV(forest_clf, param_grid, cv=5, scoring="accuracy")
grid_search.fit(train_prepared, strat_train_set_labels)

In [None]:
# Final model: the estimator selected by the grid search
# (the best-scoring hyper-parameter combination).
forest_final = grid_search.best_estimator_
forest_final

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions for every training row (3-fold CV), so the
# confusion matrix is not computed on data the model was fitted on.
y_train_pred = cross_val_predict(
    forest_final,
    train_prepared,
    strat_train_set_labels,
    cv=3,
)

# Rows are the true classes, columns the predicted classes.
cf = confusion_matrix(strat_train_set_labels, y_train_pred)
sns.heatmap(cf, annot=True, fmt='.4g')

In [None]:
# Precision, recall and F1 score of the out-of-fold training predictions
from sklearn.metrics import precision_score, recall_score, f1_score

p, r, f = (
    metric(strat_train_set_labels, y_train_pred)
    for metric in (precision_score, recall_score, f1_score)
)
p, r, f

In [None]:
# Test set: evaluate the tuned forest on the held-out split.
test_labels = strat_test_set['Class'].copy()

# Reuse the scaler fitted on the training set (transform only, no re-fit).
test_predictors = sc.transform(strat_test_set.drop(columns='Class'))

final_predictions = forest_final.predict(test_predictors)

# Same three metrics as for the training predictions.
p, r, f = (
    metric(test_labels, final_predictions)
    for metric in (precision_score, recall_score, f1_score)
)
p, r, f

In [None]:
# Spot-check: hard class predictions for ten test rows (positions 10-19 of the
# scaled test matrix). Note this is a fixed slice, not a random sample.
some_predictors = test_predictors[10:20]
some_predictions = forest_final.predict(some_predictors)
some_predictions

In [None]:
# True labels at the same positions (10-19) as the spot-checked predictions;
# .iloc makes the positional intent explicit.
test_labels.iloc[10:20]

In [None]:
# Predicted class probabilities for the rows held in `some_predictors`.
forest_final.predict_proba(some_predictors)