In [1]:
# libraries for numerical
import pandas as pd  
import numpy as np

# libraries for visualization
import matplotlib.pyplot as plt  
import seaborn as sns

# libraries for machine learning
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import metrics

# to plot the diagrams within the cells
%matplotlib inline

In [2]:
data = pd.read_csv("https://raw.githubusercontent.com/dphi-official/Datasets/master/sukhna_dhanas/train_set_label.csv" )
data.head()

Unnamed: 0,Solidity,Eccentricity,EquivDiameter,Extrema,FilledArea,Extent,Orientation,EulerNumber,BoundingBox1,BoundingBox2,...,ConvexHull3,ConvexHull4,MajorAxisLength,MinorAxisLength,Perimeter,ConvexArea,Centroid1,Centroid2,Area,microorganism
0,0.711636,0.673498,0.109069,0.870544,0.010808,0.660599,0.094353,0.987915,0.870476,0.447276,...,0.87746,0.871746,0.025669,0.028256,0.010776,0.001776,0.878873,0.453973,0.020195,1
1,0.184271,0.865533,0.181675,0.306442,0.021235,0.183804,0.484926,0.974488,0.284444,0.345343,...,0.295238,0.290794,0.128062,0.077815,0.044747,0.016309,0.325508,0.360384,0.045702,2
2,0.431175,0.679469,0.172644,0.750469,0.020929,0.38094,0.891717,0.946626,0.707302,0.227592,...,0.729524,0.72381,0.065495,0.062696,0.049242,0.0064,0.739531,0.25586,0.042004,2
3,0.712849,0.991839,0.240241,0.27142,0.036976,0.700643,0.016835,0.975159,0.268571,0.468366,...,0.269841,0.268571,0.146561,0.020286,0.035455,0.006678,0.267614,0.568813,0.073303,4
4,0.338077,0.996782,0.123578,0.045654,0.011389,0.088682,0.21915,0.982544,0.041905,0.630931,...,0.041905,0.041905,0.134704,0.011038,0.034491,0.004833,0.071762,0.699979,0.024521,2


In [3]:
data.isnull().sum()

Solidity           0
Eccentricity       0
EquivDiameter      0
Extrema            0
FilledArea         0
Extent             0
Orientation        0
EulerNumber        0
BoundingBox1       0
BoundingBox2       0
BoundingBox3       0
BoundingBox4       0
ConvexHull1        0
ConvexHull2        0
ConvexHull3        0
ConvexHull4        0
MajorAxisLength    0
MinorAxisLength    0
Perimeter          0
ConvexArea         0
Centroid1          0
Centroid2          0
Area               0
microorganism      0
dtype: int64

In [4]:
X = data.drop('microorganism', axis=1)
y = data['microorganism']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(criterion='gini', n_estimators=27)
model.fit(X_train, y_train)
prediction = model.predict(X_test)

In [10]:
model.score(X_test, y_test)

0.9916817359855334

In [11]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
print(confusion_matrix(y_test,prediction))
print(accuracy_score(y_test,prediction))
print(classification_report(y_test,prediction))

[[661   4   7   1]
 [  0 719   1   0]
 [  7   3 673   0]
 [  0   0   0 689]]
0.9916817359855334
              precision    recall  f1-score   support

           1       0.99      0.98      0.99       673
           2       0.99      1.00      0.99       720
           3       0.99      0.99      0.99       683
           4       1.00      1.00      1.00       689

    accuracy                           0.99      2765
   macro avg       0.99      0.99      0.99      2765
weighted avg       0.99      0.99      0.99      2765



In [12]:
### Manual Hyperparameter Tuning
model=RandomForestClassifier(n_estimators=300,criterion='entropy',
                             max_features='sqrt',min_samples_leaf=10,random_state=100).fit(X_train,y_train)
predictions=model.predict(X_test)
print(confusion_matrix(y_test,predictions))
print(accuracy_score(y_test,predictions))
print(classification_report(y_test,predictions))

[[629  20  23   1]
 [  0 719   1   0]
 [  9  15 658   1]
 [  0   1   0 688]]
0.9743218806509946
              precision    recall  f1-score   support

           1       0.99      0.93      0.96       673
           2       0.95      1.00      0.97       720
           3       0.96      0.96      0.96       683
           4       1.00      1.00      1.00       689

    accuracy                           0.97      2765
   macro avg       0.98      0.97      0.97      2765
weighted avg       0.97      0.97      0.97      2765



### Randomized Search CV

In [13]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [14]:
rf=RandomForestClassifier()
rf_randomcv=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,
                               random_state=100,n_jobs=-1)
### fit the randomized model
rf_randomcv.fit(X_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 36.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 67.7min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4, 6, 8],
                                        'min_samples_split': [2, 5, 10, 14],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=100, verbose=2)

In [15]:
 rf_randomcv.best_params_

{'n_estimators': 600,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 120,
 'criterion': 'entropy'}

In [16]:
rf_randomcv

RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4, 6, 8],
                                        'min_samples_split': [2, 5, 10, 14],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=100, verbose=2)

In [17]:
best_random_grid=rf_randomcv.best_estimator_

In [26]:
best_random_grid

RandomForestClassifier(criterion='entropy', max_depth=120, max_features='sqrt',
                       n_estimators=600)

In [18]:
from sklearn.metrics import accuracy_score
y_pred=best_random_grid.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print("Accuracy Score {}".format(accuracy_score(y_test,y_pred)))
print("Classification report: {}".format(classification_report(y_test,y_pred)))

[[662   3   7   1]
 [  0 719   1   0]
 [  5   3 675   0]
 [  0   0   0 689]]
Accuracy Score 0.9927667269439421
Classification report:               precision    recall  f1-score   support

           1       0.99      0.98      0.99       673
           2       0.99      1.00      1.00       720
           3       0.99      0.99      0.99       683
           4       1.00      1.00      1.00       689

    accuracy                           0.99      2765
   macro avg       0.99      0.99      0.99      2765
weighted avg       0.99      0.99      0.99      2765



In [21]:
best_random_grid.score(X_test, y_test)

0.9927667269439421

In [22]:
test_data = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/sukhna_dhanas/test_set_label.csv')
test_data

Unnamed: 0,Solidity,Eccentricity,EquivDiameter,Extrema,FilledArea,Extent,Orientation,EulerNumber,BoundingBox1,BoundingBox2,...,ConvexHull2,ConvexHull3,ConvexHull4,MajorAxisLength,MinorAxisLength,Perimeter,ConvexArea,Centroid1,Centroid2,Area
0,0.226508,0.919736,0.396523,0.821764,0.124955,0.182526,0.625360,0.880161,0.593651,0.130053,...,0.592030,0.593651,0.594286,0.246947,0.114904,0.258217,0.050894,0.735949,0.224618,0.177666
1,0.131249,0.839202,0.189765,0.779862,0.024801,0.102111,0.903063,0.930178,0.697143,0.148506,...,0.755218,0.698413,0.697143,0.168388,0.110807,0.086619,0.024327,0.782927,0.277804,0.049141
2,0.356525,0.950434,0.115916,0.711069,0.010689,0.218164,0.860724,0.969789,0.674286,0.324253,...,0.719165,0.679365,0.676190,0.077268,0.027995,0.023458,0.004147,0.683484,0.365322,0.022188
3,0.338935,0.828545,0.029171,0.450281,0.001548,0.256960,0.188719,0.995300,0.440635,0.533392,...,0.452245,0.451429,0.448889,0.020454,0.016625,0.010399,0.000735,0.439275,0.530411,0.003269
4,0.532809,0.747284,0.139494,0.813557,0.019737,0.485427,0.205856,0.973289,0.805078,0.815014,...,0.806157,0.806001,0.805078,0.042423,0.038444,0.019337,0.003800,0.825515,0.826488,0.031852
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3451,0.577456,0.449211,0.085694,0.211382,0.008245,0.492650,0.093201,0.988251,0.198095,0.588752,...,0.208096,0.200635,0.200000,0.017503,0.026957,0.015475,0.001547,0.196821,0.589611,0.014035
3452,0.205350,0.918622,0.215054,0.367730,0.034283,0.161805,0.868846,0.966096,0.298413,0.378735,...,0.332068,0.328889,0.303492,0.141463,0.066615,0.107457,0.019348,0.336157,0.471557,0.060664
3453,0.607913,0.972458,0.213403,0.766104,0.030132,0.297875,0.184385,0.962068,0.762540,0.567663,...,0.760911,0.762540,0.762540,0.099135,0.026033,0.046843,0.006414,0.794550,0.612553,0.059877
3454,0.376952,0.978037,0.079065,0.685428,0.005677,0.166046,0.187657,0.990265,0.693333,0.492091,...,0.693232,0.695238,0.694603,0.071858,0.016113,0.018262,0.002228,0.705078,0.528020,0.012470


In [23]:
test_data.isnull().sum()

Solidity           0
Eccentricity       0
EquivDiameter      0
Extrema            0
FilledArea         0
Extent             0
Orientation        0
EulerNumber        0
BoundingBox1       0
BoundingBox2       0
BoundingBox3       0
BoundingBox4       0
ConvexHull1        0
ConvexHull2        0
ConvexHull3        0
ConvexHull4        0
MajorAxisLength    0
MinorAxisLength    0
Perimeter          0
ConvexArea         0
Centroid1          0
Centroid2          0
Area               0
dtype: int64

In [24]:
y_pred = best_random_grid.predict(test_data)
y_pred

array([3, 2, 2, ..., 4, 2, 1], dtype=int64)

In [25]:
res = pd.DataFrame(y_pred) #target is nothing but the final predictions of your model on input features of your new unseen test data
#res.index = test_data.index # its important for comparison. Here "test_new" is your new test dataset
res.columns = ["prediction"]
res.to_csv("submission_rfc.csv")      # the csv file will be saved locally on the same location where this notebook is located.

## GridSearch CV

In [27]:
rf_randomcv.best_params_

{'n_estimators': 600,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 120,
 'criterion': 'entropy'}

In [28]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'criterion': [rf_randomcv.best_params_['criterion']],
    'max_depth': [rf_randomcv.best_params_['max_depth']],
    'max_features': [rf_randomcv.best_params_['max_features']],
    'min_samples_leaf': [rf_randomcv.best_params_['min_samples_leaf'], 
                         rf_randomcv.best_params_['min_samples_leaf']+2, 
                         rf_randomcv.best_params_['min_samples_leaf'] + 4],
    'min_samples_split': [rf_randomcv.best_params_['min_samples_split'] - 2,
                          rf_randomcv.best_params_['min_samples_split'] - 1,
                          rf_randomcv.best_params_['min_samples_split'], 
                          rf_randomcv.best_params_['min_samples_split'] +1,
                          rf_randomcv.best_params_['min_samples_split'] + 2],
    'n_estimators': [rf_randomcv.best_params_['n_estimators'] - 200, rf_randomcv.best_params_['n_estimators'] - 100, 
                     rf_randomcv.best_params_['n_estimators'], 
                     rf_randomcv.best_params_['n_estimators'] + 100, rf_randomcv.best_params_['n_estimators'] + 200]
}

print(param_grid)

{'criterion': ['entropy'], 'max_depth': [120], 'max_features': ['sqrt'], 'min_samples_leaf': [1, 3, 5], 'min_samples_split': [0, 1, 2, 3, 4], 'n_estimators': [400, 500, 600, 700, 800]}


In [29]:
#### Fit the grid_search to the data
rf=RandomForestClassifier()
grid_search=GridSearchCV(estimator=rf,param_grid=param_grid,cv=10,n_jobs=-1,verbose=2)
grid_search.fit(X_train,y_train)

Fitting 10 folds for each of 75 candidates, totalling 750 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 26.5min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 57.1min
[Parallel(n_jobs=-1)]: Done 750 out of 750 | elapsed: 74.9min finished


GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['entropy'], 'max_depth': [120],
                         'max_features': ['sqrt'],
                         'min_samples_leaf': [1, 3, 5],
                         'min_samples_split': [0, 1, 2, 3, 4],
                         'n_estimators': [400, 500, 600, 700, 800]},
             verbose=2)

In [30]:
grid_search.best_estimator_

RandomForestClassifier(criterion='entropy', max_depth=120, max_features='sqrt',
                       n_estimators=600)

In [31]:
best_grid=grid_search.best_estimator_

In [32]:
best_grid

RandomForestClassifier(criterion='entropy', max_depth=120, max_features='sqrt',
                       n_estimators=600)

In [33]:
y_pred=best_grid.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print("Accuracy Score {}".format(accuracy_score(y_test,y_pred)))
print("Classification report: {}".format(classification_report(y_test,y_pred)))

[[662   4   6   1]
 [  0 719   1   0]
 [  5   3 675   0]
 [  0   0   0 689]]
Accuracy Score 0.9927667269439421
Classification report:               precision    recall  f1-score   support

           1       0.99      0.98      0.99       673
           2       0.99      1.00      0.99       720
           3       0.99      0.99      0.99       683
           4       1.00      1.00      1.00       689

    accuracy                           0.99      2765
   macro avg       0.99      0.99      0.99      2765
weighted avg       0.99      0.99      0.99      2765



In [35]:
best_grid.score(X_test, y_test)

0.9927667269439421

In [36]:
y_pred = best_grid.predict(test_data)
y_pred

array([3, 2, 2, ..., 4, 2, 1], dtype=int64)

In [37]:
res_gscv = pd.DataFrame(y_pred) #target is nothing but the final predictions of your model on input features of your new unseen test data
#res.index = test_data.index # its important for comparison. Here "test_new" is your new test dataset
res_gscv.columns = ["prediction"]
res_gscv.to_csv("submission_rfc_gscv.csv")      # the csv file will be saved locally on the same location where this notebook is located.

### Using Logistic Regression

In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier

In [39]:
### Logistic Regression

clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred_log_reg = clf.predict(X_test)
acc_log_reg = round( clf.score(X_train, y_train) * 100, 2)
print ("Train Accuracy: " + str(acc_log_reg) + '%')

Train Accuracy: 77.39%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [40]:
### Support Vector Machine

clf = SVC()
clf.fit(X_train, y_train)
y_pred_svc = clf.predict(X_test)
acc_svc = round(clf.score(X_train, y_train) * 100, 2)
print ("Train Accuracy: " + str(acc_svc) + '%')

Train Accuracy: 84.02%


In [49]:
### k -Nearest Neighbors
clf_kn = KNeighborsClassifier(n_neighbors = 3)
clf_kn.fit(X_train, y_train)
y_pred_knn = clf_kn.predict(X_test)
acc_knn = round(clf_kn.score(X_train, y_train) * 100, 2)
print ("Train Accuracy: " + str(acc_knn) + '%')

Train Accuracy: 99.78%


In [46]:
y_pred = clf_kn.predict(test_data)
y_pred

array([3, 2, 2, ..., 4, 2, 1], dtype=int64)

In [47]:
res_knn = pd.DataFrame(y_pred) #target is nothing but the final predictions of your model on input features of your new unseen test data
#res.index = test_data.index # its important for comparison. Here "test_new" is your new test dataset
res_knn.columns = ["prediction"]
res_knn.to_csv("submission_rfc_res_knn.csv")      # the csv file will be saved locally on the same location where this notebook is located.