In [1]:
from SCreator import FileReader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier as xgb



In [2]:
def make_graph(data):
    fig = plt.gcf()
    fig.set_size_inches(18.5, 10.5)
    plt.xlabel('Wavelength')
    plt.ylabel('Reflectance')
    for i in range(5):
        plt.scatter(data["wavelength"],data["reflectance"], s=1)
    plt.show()

In [3]:
def static_vars(**kwargs):
    def decorate(func):
        for k in kwargs:
            setattr(func, k, kwargs[k])
        return func
    return decorate


In [4]:
def create_base_data(data, dest_data, x_range, s_range):
    for i in range(len(data)):
        mini_frame = []
        mini_frame.append(data[i].average_for_samples(x_range[0],x_range[1]))
        part_of_egg = ''.join(filter(lambda x: not x.isdigit(), data[i].samples[0].part_class))
        mini_frame.append(''.join(filter(lambda x: not x.isdigit(), part_of_egg)))
        mini_frame.append(not data[i].samples[0].ill)
        mini_frame.append(data[i].samples[0].calculate_slope(s_range[0], s_range[1]))
        dest_data.append(mini_frame)

@static_vars(area_counter=1)      
def add_another_area(data_healthy, data_ill, dest_df, x_range):
    area_to_append = []
    for i in range(len(data_healthy)):
        area_to_append.append(data_healthy[i].average_for_samples(x_range[0],x_range[1]))
    for i in range(len(data_ill)):
        area_to_append.append(data_ill[i].average_for_samples(x_range[0],x_range[1])) 
    dest_df["Area_" + str(add_another_area.area_counter)] = pd.DataFrame(area_to_append)
    add_another_area.area_counter +=1

In [5]:
def run_cv(model, x_train, y_train, x_test, y_test):
    model.fit(x_train, y_train)
    model_pred = model.predict(x_test)
    
    print("Accuracy: ", accuracy_score(y_test, model_pred))
    print(classification_report(y_test, model_pred))
    return model

def run_gs_fine(c_pow, gamma_pow,x_train_scaled,y_train ):
    c_powers = [pow(2, i) for i in np.arange(c_pow-2, c_pow+2, 0.25)]
    gamma_powers = [pow(2, i) for i in np.arange(gamma_pow-2, gamma_pow+2, 0.25)]
    parameters = {'C': c_powers, 
              'gamma': gamma_powers,
              'kernel': ['rbf']}
    
    grid_svc = GridSearchCV(SVC(), param_grid=parameters, scoring='accuracy')
    grid_svc.fit(x_train_scaled, y_train)
    return grid_svc.best_params_
    
def run_gs(x_train_scaled,y_train):
    c_powers = [pow(2, i) for i in range(-5, 16,2)]
    gamma_powers = [pow(2, i) for i in range(-15, 4,2)]
    parameters = {'C': c_powers, 
              'gamma': gamma_powers,
              'kernel': ['rbf']}
    grid_svc = GridSearchCV(SVC(), param_grid=parameters, scoring='accuracy')
    grid_svc.fit(x_train_scaled, y_train)
    return run_gs_fine(np.log2(grid_svc.best_params_ ['C']), np.log2(grid_svc.best_params_ ['gamma']),x_train_scaled,y_train)

In [6]:
eggs_healthy = []
eggs_ill = []
dir_path_healthy = 'data2k17/Healthy_white/'
dir_path_ill = 'data2k17/ill_white/'
dir_path_healthy_dark = 'data2k17/Healthy_dark/'
dir_path_ill_dark = 'data2k17/ill_dark/'

In [7]:
eggs_healthy = FileReader.FileReader(dir_path_healthy).read_files()
eggs_healthy_dark = FileReader.FileReader(dir_path_healthy_dark).read_files()
eggs_ill = FileReader.FileReader(dir_path_ill).read_files()
eggs_ill_dark = FileReader.FileReader(dir_path_ill_dark).read_files()

In [8]:
print(len(eggs_healthy), len(eggs_healthy_dark), len(eggs_ill), len(eggs_ill_dark))

113 98 184 115


In [9]:
data = []
x_range = [630, 650]
slope_range = [622,645]

In [10]:
create_base_data(eggs_healthy_dark, data, x_range, slope_range)
create_base_data(eggs_ill_dark, data, x_range, slope_range)

In [11]:
df = pd.DataFrame(data, columns = ["Area", "EggPart", "Healthy","Slope"])

In [12]:
df.groupby("Healthy").describe()

Unnamed: 0_level_0,Area,Area,Area,Area,Area,Area,Area,Area,Slope,Slope,Slope,Slope,Slope,Slope,Slope,Slope
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Healthy,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
False,115.0,74.460977,20.377634,24.228405,57.413979,73.534847,92.015621,130.476797,115.0,-0.007312,0.001833,-0.011611,-0.008542,-0.007322,-0.005946,-0.003641
True,98.0,50.265053,13.066979,25.532637,39.359734,52.153555,59.443164,82.785315,98.0,-0.008737,0.001496,-0.01183,-0.009801,-0.008743,-0.007627,-0.005825


In [13]:
df

Unnamed: 0,Area,EggPart,Healthy,Slope
0,62.781153,b,True,-0.006387
1,76.007158,c,True,-0.008719
2,56.728077,a,True,-0.006987
3,57.179068,b,True,-0.006792
4,56.707525,b,True,-0.006863
...,...,...,...,...
208,46.182127,a,False,-0.006212
209,48.974754,c,False,-0.008912
210,57.505396,h,False,-0.009215
211,48.150500,zz,False,-0.007194


## Step 1 with the cat division and integral under one area

In [14]:
part_mapping = {'a' : 1, 'h' : 2, 'z' : 3, 'c' : 4, 'hh' : 5, 'zz' : 6, 'b' : 7}
df["EggPart"] = df["EggPart"].map(part_mapping)

In [15]:
x = df.drop('Healthy', axis=1)
y = df['Healthy']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
sc = StandardScaler()
x_train_scaled, x_test_scaled = sc.fit_transform(x_train), sc.fit_transform(x_test)

In [16]:
gs_params = run_gs(x_train_scaled, y_train)
svc = SVC(C=gs_params["C"], gamma= gs_params["gamma"])

In [17]:
svc = run_cv(svc, x_train_scaled, y_train, x_test_scaled, y_test)

Accuracy:  0.9069767441860465
              precision    recall  f1-score   support

       False       0.96      0.88      0.92        26
        True       0.84      0.94      0.89        17

    accuracy                           0.91        43
   macro avg       0.90      0.91      0.90        43
weighted avg       0.91      0.91      0.91        43



In [18]:
mxgb = xgb(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.01, max_depth=4, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.3, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =2, nthread = -1)
mxg = run_cv(mxgb, x_train, y_train, x_test, y_test)

Accuracy:  0.8837209302325582
              precision    recall  f1-score   support

       False       0.89      0.92      0.91        26
        True       0.88      0.82      0.85        17

    accuracy                           0.88        43
   macro avg       0.88      0.87      0.88        43
weighted avg       0.88      0.88      0.88        43



### w/o outliers


In [19]:
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
df_out = df[~(((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR)))).any(axis=1)]
df_out.shape

(211, 4)

In [20]:
x = df_out.drop('Healthy', axis=1)
y = df_out['Healthy']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
sc = StandardScaler()
x_train_scaled, x_test_scaled = sc.fit_transform(x_train), sc.fit_transform(x_test)

In [21]:
gs_params = run_gs(x_train_scaled, y_train)
svc = SVC(C=gs_params["C"], gamma= gs_params["gamma"])

In [22]:
svc = run_cv(svc, x_train_scaled, y_train, x_test_scaled, y_test)

Accuracy:  0.813953488372093
              precision    recall  f1-score   support

       False       0.91      0.77      0.83        26
        True       0.71      0.88      0.79        17

    accuracy                           0.81        43
   macro avg       0.81      0.83      0.81        43
weighted avg       0.83      0.81      0.82        43



In [23]:
mxgb = xgb(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.01, max_depth=4, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.3, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =2, nthread = -1)
mxg = run_cv(mxgb, x_train, y_train, x_test, y_test)

Accuracy:  0.7674418604651163
              precision    recall  f1-score   support

       False       0.86      0.73      0.79        26
        True       0.67      0.82      0.74        17

    accuracy                           0.77        43
   macro avg       0.77      0.78      0.76        43
weighted avg       0.79      0.77      0.77        43



# Step 2 adding more areas

In [24]:
add_another_area(eggs_ill_dark, eggs_healthy_dark, df, [570, 590])

In [25]:
x = df.drop('Healthy', axis=1)
y = df['Healthy']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
sc = StandardScaler()
x_train_scaled, x_test_scaled = sc.fit_transform(x_train), sc.fit_transform(x_test)
gs_params = run_gs(x_train_scaled, y_train)
svc = SVC(C=gs_params["C"], gamma= gs_params["gamma"])

In [26]:
svc = run_cv(svc, x_train_scaled, y_train, x_test_scaled, y_test)

Accuracy:  0.8837209302325582
              precision    recall  f1-score   support

       False       1.00      0.81      0.89        26
        True       0.77      1.00      0.87        17

    accuracy                           0.88        43
   macro avg       0.89      0.90      0.88        43
weighted avg       0.91      0.88      0.88        43



In [27]:
mxgb = xgb(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.01, max_depth=4, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.3, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =2, nthread = -1)
mxg = run_cv(mxgb, x_train, y_train, x_test, y_test)

Accuracy:  0.9069767441860465
              precision    recall  f1-score   support

       False       0.92      0.92      0.92        26
        True       0.88      0.88      0.88        17

    accuracy                           0.91        43
   macro avg       0.90      0.90      0.90        43
weighted avg       0.91      0.91      0.91        43



## Another area

In [28]:
add_another_area(eggs_ill, eggs_healthy, df, [480, 500])

In [29]:
x = df.drop('Healthy', axis=1)
y = df['Healthy']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
sc = StandardScaler()
x_train_scaled, x_test_scaled = sc.fit_transform(x_train), sc.fit_transform(x_test)
gs_params = run_gs(x_train_scaled, y_train)
svc = SVC(C=gs_params["C"], gamma= gs_params["gamma"])

In [30]:
svc = run_cv(svc, x_train_scaled, y_train, x_test_scaled, y_test)

Accuracy:  0.8837209302325582
              precision    recall  f1-score   support

       False       1.00      0.81      0.89        26
        True       0.77      1.00      0.87        17

    accuracy                           0.88        43
   macro avg       0.89      0.90      0.88        43
weighted avg       0.91      0.88      0.88        43



In [31]:
df.groupby("Healthy").describe()

Unnamed: 0_level_0,Area,Area,Area,Area,Area,Area,Area,Area,EggPart,EggPart,...,Area_1,Area_1,Area_2,Area_2,Area_2,Area_2,Area_2,Area_2,Area_2,Area_2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Healthy,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
False,115.0,74.460977,20.377634,24.228405,57.413979,73.534847,92.015621,130.476797,115.0,2.869565,...,29.446231,61.605607,115.0,15.593323,5.249928,4.415203,11.819941,15.507462,19.327402,27.174533
True,98.0,50.265053,13.066979,25.532637,39.359734,52.153555,59.443164,82.785315,98.0,4.244898,...,49.837029,86.6971,98.0,12.782423,4.366475,5.260209,9.928306,12.148043,15.456085,23.855281


## Next approach with adding the distance to the mean

In [449]:
df = pd.DataFrame(data, columns = ["Area", "EggPart", "Healthy",])

In [452]:
add_another_area(eggs_healthy, eggs_ill, df, [550, 580])
add_another_area(eggs_healthy, eggs_ill, df, [640, 660])

In [33]:
df.head()

Unnamed: 0,Area,EggPart,Healthy,Slope,Area_1,Area_2
0,62.781153,7,True,-0.006387,23.825913,8.126561
1,76.007158,4,True,-0.008719,17.73796,10.794771
2,56.728077,1,True,-0.006987,27.515401,8.406852
3,57.179068,7,True,-0.006792,31.887476,5.996202
4,56.707525,7,True,-0.006863,26.11571,7.214106


In [34]:
df = df.join(df.groupby("EggPart").agg(np.mean), on='EggPart', rsuffix='_mean')
df["Area_mean_distance"] = abs(df["Area"] - df["Area_mean"])
df["Area_mean_distance_1"] = abs(df["Area_1"] - df["Area_1_mean"])
df["Area_mean_distance_2"] = abs(df["Area_2"] - df["Area_2_mean"])
df["Slope_mean_distance"] = abs(df["Slope"] - df["Slope_mean"])


col_to_drop = ["Area_mean", "Healthy_mean", "Area_1_mean", "Area_2_mean", "Slope_mean"]
df.drop(col_to_drop, axis = 1, inplace = True)

In [472]:
part_mapping = {'a' : 1, 'h' : 2, 'z' : 3, 'c' : 4, 'hh' : 5, 'zz' : 6, 'b' : 7}
#part_mapping = {'a' : 1, 'h' : 2, 'z' : 3, 'c' : 4, 'hh' : 2, 'zz' : 3, 'b' : 7 , 'bb' : 7}
df["EggPart"] = df["EggPart"].map(part_mapping)

In [35]:
x = df.drop('Healthy', axis=1)
y = df['Healthy']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
sc = StandardScaler()
x_train_scaled, x_test_scaled = sc.fit_transform(x_train), sc.fit_transform(x_test)

In [36]:
gs_params = run_gs(x_train_scaled, y_train)
svc = SVC(C=gs_params["C"], gamma= gs_params["gamma"])

In [37]:
svc = run_cv(svc, x_train_scaled, y_train, x_test_scaled, y_test)

Accuracy:  0.9069767441860465
              precision    recall  f1-score   support

       False       0.96      0.88      0.92        26
        True       0.84      0.94      0.89        17

    accuracy                           0.91        43
   macro avg       0.90      0.91      0.90        43
weighted avg       0.91      0.91      0.91        43



In [39]:
mxgb = xgb(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.01, max_depth=4, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.3, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =2, nthread = -1)
mxg = run_cv(mxgb, x_train, y_train, x_test, y_test)

Accuracy:  0.9302325581395349
              precision    recall  f1-score   support

       False       0.96      0.92      0.94        26
        True       0.89      0.94      0.91        17

    accuracy                           0.93        43
   macro avg       0.92      0.93      0.93        43
weighted avg       0.93      0.93      0.93        43



## Treating double letters as one

In [272]:
df = pd.DataFrame(data, columns = ["Area", "EggPart", "Healthy"])
part_mapping = {'a' : 1, 'h' : 2, 'z' : 3, 'c' : 4, 'hh' : 2, 'zz' : 3, 'b' : 7 , 'bb' : 7}
df["EggPart"] = df["EggPart"].map(part_mapping)

In [273]:
df = df.join(df.groupby("EggPart").agg(np.mean), on='EggPart', rsuffix='_mean')
df["Area_mean_distance"] = abs(df["Area"] - df["Area_mean"])
col_to_drop = ["Area_mean", "Healthy_mean"]
df.drop(col_to_drop, axis = 1, inplace = True)
df

Unnamed: 0,Area,EggPart,Healthy,Area_mean_distance
0,90.364106,1,False,23.564389
1,112.639061,2,False,0.773452
2,94.844660,3,False,13.592073
3,80.115887,1,False,33.812609
4,94.525620,4,False,27.200040
...,...,...,...,...
292,117.739953,7,True,3.532563
293,115.915473,7,True,5.357044
294,134.999146,4,True,13.273486
295,114.142792,1,True,0.214297


In [277]:
x = df.drop('Healthy', axis=1)
y = df['Healthy']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
sc = StandardScaler()
x_train_scaled, x_test_scaled = sc.fit_transform(x_train), sc.fit_transform(x_test)

In [278]:
gs_params = run_gs(x_train_scaled, y_train)
svc = SVC(C=gs_params["C"], gamma= gs_params["gamma"])

In [279]:
svc = run_cv(svc, x_train_scaled, y_train, x_test_scaled, y_test)

Accuracy:  0.8333333333333334
Log loss:  5.756489385732787
              precision    recall  f1-score   support

       False       0.83      0.95      0.88        40
        True       0.86      0.60      0.71        20

    accuracy                           0.83        60
   macro avg       0.84      0.77      0.79        60
weighted avg       0.84      0.83      0.82        60



In [232]:
rf = RandomForestClassifier(n_estimators=100)
rf =  run_cv(rf, X_train, Y_train, X_test, Y_test)

Accuracy:  0.8
Log loss:  6.907808585477483
              precision    recall  f1-score   support

       False       0.82      0.90      0.86        20
        True       0.75      0.60      0.67        10

    accuracy                           0.80        30
   macro avg       0.78      0.75      0.76        30
weighted avg       0.80      0.80      0.79        30



In [243]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 'Perceptron', 
              'Stochastic Gradient Decent', 'Linear SVC', 
              'Decision Tree'],
    'Score': [0.9, acc_knn, acc_log, 
              acc_random_forest, acc_gaussian, acc_perceptron, 
              acc_sgd, acc_linear_svc, acc_decision_tree]})
models.sort_values(by='Score', ascending=False)


Unnamed: 0,Model,Score
3,Random Forest,100.0
8,Decision Tree,100.0
1,KNN,87.27
4,Naive Bayes,83.52
2,Logistic Regression,81.65
5,Perceptron,61.42
6,Stochastic Gradient Decent,61.42
7,Linear SVC,41.95
0,Support Vector Machines,0.9
