# Code background

In [416]:
# Maths
import pandas as pd
import numpy as np
from scipy.stats import zscore
import random

# Plots
import plotly.graph_objects as go

# Dim Reduction
from sklearn.manifold import TSNE
from sklearn.manifold import MDS
from sklearn.decomposition import PCA

# ML
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Load Data

In [417]:
db_location = "glass_fragments_procssed.xlsx"
db = pd.read_excel(db_location,sheet_name="all")

BINA

In [418]:
BINA_features = ['NaK', 'MgK', 'AlK', 'SiK', 'S K', 'ClK', 'K KA', 'K KB', 'CaKA', 'CaKB', 'TiK', 'CrK', 'MnK', 'FeK', 'CoKA', 'CuKA', 'ZnKA', 'SrK']
BINA = db[db["Source"] == "BINA"]
BINA[BINA_features] = BINA[BINA_features].apply(zscore)
BINA.shape



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



(92, 64)

IRB

In [419]:
IRB = db[db["Source"] == "IRB"]
IRB_features = ['NaK', 'MgK', 'AlK', 'SiK', 'S K',  'K KA', 'CaKA', 'CrK', 'MnK', 'FeK']
IRB[IRB_features] = IRB[IRB_features].apply(zscore)
IRB.shape

(96, 64)

HU

In [420]:
HU = db[db["Source"] == "HU_3"]
HU_features = ['AlK', 'SiK',  'K KA', 'CaKA', 'TiK', 'FeK']
HU[HU_features] = HU[HU_features].apply(zscore)
HU.shape

(96, 64)

DEB

In [421]:
DEB = db[db["Source"] == "DEB"]
DEB_features = ['NaK', 'MgK', 'AlK', 'SiK',  'K KA', 'CaKA', 'TiK', 'MnK' , 'FeK', 'CuKA', 'ZnKA', 'SrK', 'SnL1', 'Ba', 'Zr','Ce', 'Rb','Ga','B','La','Nd', 'Y','Pb','Th','Hf','Pr','As','Ge','U','Nb', 'Sm','Gd','Dy','Yb','Er','Cs','Sb','Ho','Eu','Tb','In','Lu','Tm', 'Bi']
DEB[DEB_features] = DEB[DEB_features].apply(zscore)
DEB.shape

(48, 64)

BARC

In [422]:
BARC = db[db["Source"] == "BARC"]
BARC_features = ['NaK', 'MgK', 'AlK', 'SiK', 'CaKA', 'CrK', 'FeK','CoKA', 'ZnKA',  'Ba', 'Zr','Ce', 'Rb','La','Th','Hf', 'Sm','Yb','Cs','Eu', 'Sc', 'Ta']
BARC[BARC_features] = BARC[BARC_features].apply(zscore)
BARC.shape

(48, 64)

EDS

In [423]:
EDS = db[db["Source"] == "SEM"]
EDS_features = ['NaK', 'MgK', 'AlK', 'SiK',  'K KA', 'CaKA', 'FeK']
EDS[EDS_features] = EDS[EDS_features].apply(zscore)
EDS.shape

(96, 64)

NARD

In [424]:
NARD = db[db["Source"] == "NARD"]
NARD_features = ['NaK', 'MgK', 'AlK', 'SiK', 'ClK', 'K KA', 'CaKA', 'TiK', 'MnK', 'FeK', 'SrK','B', 'Sm','Gd']
NARD[NARD_features] = NARD[NARD_features].apply(zscore)
NARD.shape

(20, 64)

# Visualization

## BINA

In [425]:
pca_bina = PCA(n_components=2)
X_embedded = pca_bina.fit_transform(BINA[BINA_features])

fig = go.Figure()
fig.add_trace(go.Scatter(x=X_embedded[BINA['model']=="Mazda",0], y=X_embedded[BINA['model']=="Mazda",1],mode='markers', name="MAZDA"))
fig.add_trace(go.Scatter(x=X_embedded[BINA['model']=="Fiat",0], y=X_embedded[BINA['model']=="Fiat",1],mode='markers', name="FIAT"))
fig.add_trace(go.Scatter(x=X_embedded[BINA['model']=="Hyundai",0], y=X_embedded[BINA['model']=="Hyundai",1],mode='markers', name="HYNDAI"))
fig.add_trace(go.Scatter(x=X_embedded[BINA['model']=="Honda",0], y=X_embedded[BINA['model']=="Honda",1],mode='markers', name="HUNDA"))
fig.add_trace(go.Scatter(x=X_embedded[BINA['model']=="Ford",0], y=X_embedded[BINA['model']=="Ford",1],mode='markers', name="FORD"))
fig.add_trace(go.Scatter(x=X_embedded[BINA['model']=="Daewoo",0], y=X_embedded[BINA['model']=="Daewoo",1],mode='markers', name="Daewoo"))
fig.add_trace(go.Scatter(x=X_embedded[BINA['model']=="Subaru",0], y=X_embedded[BINA['model']=="Subaru",1],mode='markers', name="Subaru"))
fig.add_trace(go.Scatter(x=X_embedded[BINA['model']=="Renault",0], y=X_embedded[BINA['model']=="Renault",1],mode='markers', name="Renault"))
fig.add_trace(go.Scatter(x=X_embedded[BINA['model']=="Peugeot",0], y=X_embedded[BINA['model']=="Peugeot",1],mode='markers', name="Peugeot"))
fig.add_trace(go.Scatter(x=X_embedded[BINA['model']=="Mitsubishi",0], y=X_embedded[BINA['model']=="Mitsubishi",1],mode='markers', name="Mitsubishi"))


## DEB

In [426]:
pca_db = DEB
pca_db_features = DEB_features

pca = PCA(n_components=2)
X_embedded = pca.fit_transform(pca_db[pca_db_features])

fig = go.Figure()
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Mazda",0], y=X_embedded[pca_db['model']=="Mazda",1],mode='markers', name="MAZDA"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Fiat",0], y=X_embedded[pca_db['model']=="Fiat",1],mode='markers', name="FIAT"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Hyundai",0], y=X_embedded[pca_db['model']=="Hyundai",1],mode='markers', name="HYNDAI"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Honda",0], y=X_embedded[pca_db['model']=="Honda",1],mode='markers', name="HUNDA"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Ford",0], y=X_embedded[pca_db['model']=="Ford",1],mode='markers', name="FORD"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Daewoo",0], y=X_embedded[pca_db['model']=="Daewoo",1],mode='markers', name="Daewoo"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Subaru",0], y=X_embedded[pca_db['model']=="Subaru",1],mode='markers', name="Subaru"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Renault",0], y=X_embedded[pca_db['model']=="Renault",1],mode='markers', name="Renault"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Peugeot",0], y=X_embedded[pca_db['model']=="Peugeot",1],mode='markers', name="Peugeot"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Mitsubishi",0], y=X_embedded[pca_db['model']=="Mitsubishi",1],mode='markers', name="Mitsubishi"))

In [427]:
pca_db = BARC
pca_db_features = BARC_features

pca = PCA(n_components=2)
X_embedded = pca.fit_transform(pca_db[pca_db_features])

fig = go.Figure()
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Mazda",0], y=X_embedded[pca_db['model']=="Mazda",1],mode='markers', name="MAZDA"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Fiat",0], y=X_embedded[pca_db['model']=="Fiat",1],mode='markers', name="FIAT"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Hyundai",0], y=X_embedded[pca_db['model']=="Hyundai",1],mode='markers', name="HYNDAI"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Honda",0], y=X_embedded[pca_db['model']=="Honda",1],mode='markers', name="HUNDA"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Ford",0], y=X_embedded[pca_db['model']=="Ford",1],mode='markers', name="FORD"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Daewoo",0], y=X_embedded[pca_db['model']=="Daewoo",1],mode='markers', name="Daewoo"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Subaru",0], y=X_embedded[pca_db['model']=="Subaru",1],mode='markers', name="Subaru"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Renault",0], y=X_embedded[pca_db['model']=="Renault",1],mode='markers', name="Renault"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Peugeot",0], y=X_embedded[pca_db['model']=="Peugeot",1],mode='markers', name="Peugeot"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Mitsubishi",0], y=X_embedded[pca_db['model']=="Mitsubishi",1],mode='markers', name="Mitsubishi"))

In [428]:
pca_db = NARD
pca_db_features = NARD_features

pca = PCA(n_components=2)
X_embedded = pca.fit_transform(pca_db[pca_db_features])

fig = go.Figure()
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Mazda",0], y=X_embedded[pca_db['model']=="Mazda",1],mode='markers', name="MAZDA"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Fiat",0], y=X_embedded[pca_db['model']=="Fiat",1],mode='markers', name="FIAT"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Hyundai",0], y=X_embedded[pca_db['model']=="Hyundai",1],mode='markers', name="HYNDAI"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Honda",0], y=X_embedded[pca_db['model']=="Honda",1],mode='markers', name="HUNDA"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Ford",0], y=X_embedded[pca_db['model']=="Ford",1],mode='markers', name="FORD"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Daewoo",0], y=X_embedded[pca_db['model']=="Daewoo",1],mode='markers', name="Daewoo"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Subaru",0], y=X_embedded[pca_db['model']=="Subaru",1],mode='markers', name="Subaru"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Renault",0], y=X_embedded[pca_db['model']=="Renault",1],mode='markers', name="Renault"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Peugeot",0], y=X_embedded[pca_db['model']=="Peugeot",1],mode='markers', name="Peugeot"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Mitsubishi",0], y=X_embedded[pca_db['model']=="Mitsubishi",1],mode='markers', name="Mitsubishi"))

In [429]:
pca_db = EDS
pca_db_features = EDS_features

pca = PCA(n_components=2)
X_embedded = pca.fit_transform(pca_db[pca_db_features])

fig = go.Figure()
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Mazda",0], y=X_embedded[pca_db['model']=="Mazda",1],mode='markers', name="MAZDA"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Fiat",0], y=X_embedded[pca_db['model']=="Fiat",1],mode='markers', name="FIAT"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Hyundai",0], y=X_embedded[pca_db['model']=="Hyundai",1],mode='markers', name="HYNDAI"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Honda",0], y=X_embedded[pca_db['model']=="Honda",1],mode='markers', name="HUNDA"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Ford",0], y=X_embedded[pca_db['model']=="Ford",1],mode='markers', name="FORD"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Daewoo",0], y=X_embedded[pca_db['model']=="Daewoo",1],mode='markers', name="Daewoo"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Subaru",0], y=X_embedded[pca_db['model']=="Subaru",1],mode='markers', name="Subaru"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Renault",0], y=X_embedded[pca_db['model']=="Renault",1],mode='markers', name="Renault"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Peugeot",0], y=X_embedded[pca_db['model']=="Peugeot",1],mode='markers', name="Peugeot"))
fig.add_trace(go.Scatter(x=X_embedded[pca_db['model']=="Mitsubishi",0], y=X_embedded[pca_db['model']=="Mitsubishi",1],mode='markers', name="Mitsubishi"))

# Model

In [486]:
iteration = 20
test_ratio = 0.33
num_tree = 500
class_rf = RandomForestClassifier(n_estimators=num_tree)

In [431]:
def make_a_model(X,y):
    results = []
    for i in range(iteration):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio)
        class_rf.fit(X_train, y_train)
        y_pred = class_rf.predict(X_test)
        results.append(pd.DataFrame(classification_report(y_test, y_pred, output_dict=True, zero_division=0)))
        
    precision = np.array([])
    recall = np.array([])
    f1score = np.array([])

    for i in range(iteration):
        precision = np.append(precision, results[i]["weighted avg"]["precision"])
        recall = np.append(recall, results[i]["weighted avg"]["recall"])
        f1score = np.append(f1score, results[i]["weighted avg"]["f1-score"])

    print("Recall mean:", recall.mean(), "Recall STD:", recall.std())
    print("Precision mean:", precision.mean(), "Precision STD:", precision.std())
    print("F1- Score mean:", f1score.mean(), "F1- Score STD:", f1score.std())

## PIXE

### Model BINA

In [432]:
make_a_model(BINA[BINA_features],BINA["model"])

Recall mean: 0.8532258064516128 Recall STD: 0.06409134350421138
Precision mean: 0.869676488386166 Precision STD: 0.0581783136207324
F1- Score mean: 0.8390815093879045 F1- Score STD: 0.06486303514989744


### Model IRB

In [433]:
make_a_model(IRB[IRB_features],IRB["model"])

Recall mean: 0.815625 Recall STD: 0.06695089151758923
Precision mean: 0.8225443176615052 Precision STD: 0.08859537921686132
F1- Score mean: 0.7975389444308272 F1- Score STD: 0.07252632108932823


### Model HU

In [434]:
make_a_model(HU[HU_features],HU["model"])

Recall mean: 0.715625 Recall STD: 0.06547602137424051
Precision mean: 0.7263386093073592 Precision STD: 0.09626679767551449
F1- Score mean: 0.6910683376843864 F1- Score STD: 0.07191229602350928


## Model DEB

In [435]:
make_a_model(DEB[DEB_features],DEB["model"])

Recall mean: 0.796875 Recall STD: 0.11158481919598204
Precision mean: 0.7634040178571428 Precision STD: 0.1395686365780337
F1- Score mean: 0.7643027907549966 F1- Score STD: 0.12934574725128173


## Model BARC

In [436]:
make_a_model(BARC[BARC_features],BARC["model"])

Recall mean: 0.896875 Recall STD: 0.10874820400815823
Precision mean: 0.8466046626984127 Precision STD: 0.15197591607300837
F1- Score mean: 0.8657811719530469 F1- Score STD: 0.13707338685704648


## Model EDS

In [437]:
make_a_model(EDS[EDS_features],EDS["model"])

Recall mean: 0.5390625 Recall STD: 0.09974332292815394
Precision mean: 0.49674908922266453 Precision STD: 0.12561651668089263
F1- Score mean: 0.4930868915888803 F1- Score STD: 0.11214012185009495


## Model NARD

In [438]:
make_a_model(NARD[NARD_features],NARD["model"])

Recall mean: 0.6214285714285712 Recall STD: 0.15169114701139502
Precision mean: 0.5499999999999999 Precision STD: 0.15827514147257385
F1- Score mean: 0.5742857142857142 F1- Score STD: 0.15466590429087262


# Create a unified DB

In [439]:
iteration = 20
test_ratio = 0.33
num_tree = 300

## All Vs ALL

In [440]:
WWDB = pd.concat([
    BINA,
    IRB,
    HU,
    EDS, 
    BARC, 
    DEB,
    NARD
    ], axis=0)

WWfeatures = list(
    set(BINA_features) & 
    set(IRB_features)& 
    set(HU_features)& 
    set(EDS_features)& 
    set(BARC_features)& 
    set(DEB_features)&
    set(NARD_features)
    )

print(WWfeatures)
print(WWDB.shape)
    
make_a_model(WWDB[WWfeatures],WWDB["model"])

['SiK', 'AlK', 'CaKA', 'FeK']
(496, 64)
Recall mean: 0.6704268292682928 Recall STD: 0.03542231615600592
Precision mean: 0.6763540340648999 Precision STD: 0.03296738759619958
F1- Score mean: 0.6634224566359713 F1- Score STD: 0.03255528660234485


Repeat without laboratories with individual low performances

In [467]:
WWDB = pd.concat([
    BINA,
    IRB,
    #HU,
    #EDS, 
    BARC, 
    DEB,
    #NARD
    ], axis=0)

WWfeatures = list(
    set(BINA_features) & 
    set(IRB_features)& 
    #set(HU_features)& 
    #set(EDS_features)& 
    set(BARC_features)& 
    set(DEB_features)#&
    #set(NARD_features)
    )

print(WWfeatures)
print(WWDB.shape)
    
make_a_model(WWDB[WWfeatures],WWDB["model"])

['MgK', 'AlK', 'NaK', 'FeK', 'SiK', 'CaKA']
(284, 64)
Recall mean: 0.7965957446808511 Recall STD: 0.03833567873891135
Precision mean: 0.820410238839891 Precision STD: 0.03749004570828338
F1- Score mean: 0.7901243393634269 F1- Score STD: 0.03957407913111199


## Individual models on subgroup feature list

### PGAA

#### BARC

In [468]:
WWDB = pd.concat([
    #BINA,
    #IRB,
    #HU,
    #EDS, 
    BARC, 
    #DEB
    ], axis=0)

WWfeatures = list(
    set(BINA_features) & 
    set(IRB_features)& 
    #set(HU_features)& 
    #set(EDS_features)& 
    set(BARC_features)& 
    set(DEB_features)
    )

print(WWfeatures)
    
make_a_model(WWDB[WWfeatures],WWDB["model"])

['MgK', 'AlK', 'NaK', 'FeK', 'SiK', 'CaKA']
Recall mean: 0.4465625 Recall STD: 0.11064397789193049
Precision mean: 0.37815926001082245 Precision STD: 0.1463453341749608
F1- Score mean: 0.3844834150224775 F1- Score STD: 0.1204012866867921


#### NARD

In [476]:
WWDB = pd.concat([
    #BINA,
    #IRB,
    #HU,
    #EDS, 
    #BARC, 
    #DEB,
    NARD
    ], axis=0)

WWfeatures = list(
    set(BINA_features) & 
    set(IRB_features)& 
    #set(HU_features)& 
    #set(EDS_features)#& 
    set(BARC_features)& 
    set(DEB_features)&
    set(NARD_features)
    )

print(WWfeatures)
    
make_a_model(WWDB[WWfeatures],WWDB["model"])

['MgK', 'AlK', 'NaK', 'FeK', 'SiK', 'CaKA']
Recall mean: 0.4707142857142857 Recall STD: 0.16951160335938478
Precision mean: 0.3928571428571429 Precision STD: 0.1852232519453875
F1- Score mean: 0.4122976190476191 F1- Score STD: 0.1766053873376044


#### BARC + NARD

In [443]:
WWDB = pd.concat([
    #BINA,
    #IRB,
    #HU,
    #EDS, 
    BARC, 
    #DEB,
    NARD
    ], axis=0)

WWfeatures = list(
    #set(BINA_features) & 
    #set(IRB_features)& 
    #set(HU_features)& 
    #set(EDS_features)& 
    set(BARC_features)& 
    #set(DEB_features)&
    set(NARD_features)
    )

print(WWfeatures)
    
make_a_model(WWDB[WWfeatures],WWDB["model"])

['MgK', 'Sm', 'AlK', 'NaK', 'FeK', 'SiK', 'CaKA']
Recall mean: 0.6782608695652173 Recall STD: 0.0905768115478246
Precision mean: 0.6549609448522491 Precision STD: 0.1300312559787689
F1- Score mean: 0.6317734730458515 F1- Score STD: 0.11364409205308494


### PIXE

In [444]:
WWDB = pd.concat([
    BINA,
    IRB,
    #HU,
    #EDS, 
    #BARC, 
    #DEB
    ], axis=0)

WWfeatures = list(
    set(BINA_features) & 
    set(IRB_features)& 
    #set(HU_features)& 
    set(EDS_features)& 
    set(BARC_features)& 
    set(DEB_features)
    )

print(WWfeatures)
    
make_a_model(WWDB[WWfeatures],WWDB["model"])

['MgK', 'AlK', 'NaK', 'FeK', 'SiK', 'CaKA']
Recall mean: 0.8341269841269842 Recall STD: 0.042436204397761426
Precision mean: 0.8661228664334777 Precision STD: 0.03065710250393438
F1- Score mean: 0.8252620102429564 F1- Score STD: 0.045334620698462495


### EDS

In [445]:
WWDB = pd.concat([
    #BINA,
    #IRB,
    #HU,
    EDS, 
    #BARC, 
    #DEB
    ], axis=0)

WWfeatures = list(
    set(BINA_features) & 
    set(IRB_features)& 
    #set(HU_features)& 
    set(EDS_features)& 
    set(BARC_features)& 
    set(DEB_features)
    )

print(WWfeatures)
    
make_a_model(WWDB[WWfeatures],WWDB["model"])

['MgK', 'AlK', 'NaK', 'FeK', 'SiK', 'CaKA']
Recall mean: 0.4453125 Recall STD: 0.05918967789023691
Precision mean: 0.40842744791757946 Precision STD: 0.08053449835477493
F1- Score mean: 0.4020279963191228 F1- Score STD: 0.06324389011709783


ICP

In [446]:
WWDB = pd.concat([
    #BINA,
    #IRB,
    #HU,
    #EDS, 
    #BARC, 
    DEB
    ], axis=0)

WWfeatures = list(
    set(BINA_features) & 
    set(IRB_features)& 
    #set(HU_features)& 
    set(EDS_features)& 
    set(BARC_features)& 
    set(DEB_features)
    )

print(WWfeatures)
    
make_a_model(WWDB[WWfeatures],WWDB["model"])

['MgK', 'AlK', 'NaK', 'FeK', 'SiK', 'CaKA']
Recall mean: 0.715625 Recall STD: 0.1272838240901019
Precision mean: 0.7051314484126985 Precision STD: 0.16498665967899842
F1- Score mean: 0.6824079811120252 F1- Score STD: 0.1491574545521411


## DB Combinations

### PIXE with PIXE

In [447]:
WWDB = pd.concat([
    BINA,
    IRB,
    #HU,
    #EDS, 
    #BARC, 
    #DEB
    ], axis=0)

WWfeatures = list(
    set(BINA_features) & 
    set(IRB_features)#& 
    #set(HU_features)#& 
    #set(EDS_features)#& 
    #set(BARC_features)#& 
    #set(DEB_features)
    )

print(WWfeatures)
    
make_a_model(WWDB[WWfeatures],WWDB["model"])

['K KA', 'CrK', 'MgK', 'AlK', 'NaK', 'MnK', 'SiK', 'FeK', 'CaKA', 'S K']
Recall mean: 0.8936507936507937 Recall STD: 0.04711372088779628
Precision mean: 0.9192896771128194 Precision STD: 0.03087463100591033
F1- Score mean: 0.8881122357018553 F1- Score STD: 0.051319444768233524


### PIXE with EDS

In [448]:
WWDB = pd.concat([
    BINA,
    IRB,
    #HU,
    EDS, 
    #BARC, 
    #DEB
    ], axis=0)

WWfeatures = list(
    set(BINA_features) & 
    set(IRB_features)& 
    #set(HU_features)& 
    set(EDS_features)#& 
    #set(BARC_features)#& 
    #set(DEB_features)
    )

print(WWfeatures)
    
make_a_model(WWDB[WWfeatures],WWDB["model"])

['K KA', 'MgK', 'AlK', 'NaK', 'FeK', 'SiK', 'CaKA']
Recall mean: 0.800531914893617 Recall STD: 0.03574135622836668
Precision mean: 0.8203762813684206 Precision STD: 0.033477690325361874
F1- Score mean: 0.7941341429784652 F1- Score STD: 0.03832375667044371


### PIXE with PGAA

#### PIXE with PGAA (BARC)

In [449]:
WWDB = pd.concat([
    BINA,
    IRB,
    #HU,
    #EDS, 
    BARC, 
    #DEB,
    #NARD
    ], axis=0)

WWfeatures = list(
    set(BINA_features) & 
    set(IRB_features)& 
    #set(HU_features)& 
    #set(EDS_features)#& 
    set(BARC_features)#& 
    #set(DEB_features)
    #set(NARD_features)
    )

print(WWfeatures)
    
make_a_model(WWDB[WWfeatures],WWDB["model"])

['CrK', 'MgK', 'AlK', 'NaK', 'FeK', 'SiK', 'CaKA']
Recall mean: 0.7858974358974359 Recall STD: 0.0413647841151325
Precision mean: 0.8142747454002427 Precision STD: 0.04791617793118439
F1- Score mean: 0.7734341047718329 F1- Score STD: 0.04731123800388816


### PIXE with ICP

In [450]:
WWDB = pd.concat([
    BINA,
    IRB,
    #HU,
    #EDS, 
    #BARC, 
    DEB,
    ], axis=0)

WWfeatures = list(
    set(BINA_features) & 
    set(IRB_features)& 
    #set(HU_features)& 
    #set(EDS_features)#& 
    #set(BARC_features)& 
    set(DEB_features)
    )
    
print(WWfeatures)

make_a_model(WWDB[WWfeatures],WWDB["model"])

['K KA', 'MgK', 'AlK', 'NaK', 'MnK', 'SiK', 'FeK', 'CaKA']
Recall mean: 0.9096153846153847 Recall STD: 0.032553753655370825
Precision mean: 0.9206332574419545 Precision STD: 0.03161156560382684
F1- Score mean: 0.9042962416996492 F1- Score STD: 0.03559445587677968


### PGAA with EDS

In [451]:
WWDB = pd.concat([
    #BINA,
    #IRB,
    #HU,
    EDS, 
    BARC, 
    #DEB,
    #NARD
    
    ], axis=0)

WWfeatures = list(
    #set(BINA_features) & 
    #set(IRB_features)& 
    #set(HU_features)& 
    set(EDS_features)& 
    set(BARC_features)#& 
    #set(DEB_features)
    #set(NARD_features)
    )
    
print(WWfeatures)

make_a_model(WWDB[WWfeatures],WWDB["model"])

['MgK', 'AlK', 'NaK', 'FeK', 'SiK', 'CaKA']
Recall mean: 0.475 Recall STD: 0.05128555677121322
Precision mean: 0.4599405002353071 Precision STD: 0.0806715536830451
F1- Score mean: 0.4328562272781927 F1- Score STD: 0.06198224730714611


### PGAA with ICP

In [452]:
WWDB = pd.concat([
    #BINA,
    #IRB,
    #HU,
    #EDS, 
    BARC, 
    DEB
    ], axis=0)

WWfeatures = list(
    #set(BINA_features) & 
    #set(IRB_features)& 
    #set(HU_features)& 
    #set(EDS_features)& 
    set(BARC_features)& 
    set(DEB_features)
    )
    
print(WWfeatures)

make_a_model(WWDB[WWfeatures],WWDB["model"])

['La', 'Zr', 'Hf', 'MgK', 'Sm', 'Yb', 'Th', 'Ce', 'AlK', 'NaK', 'FeK', 'SiK', 'Eu', 'ZnKA', 'Cs', 'CaKA', 'Ba', 'Rb']
Recall mean: 0.878125 Recall STD: 0.06242182611074431
Precision mean: 0.8798652952949828 Precision STD: 0.071314164176788
F1- Score mean: 0.8591085729638719 F1- Score STD: 0.07301427834773717


### ICP with EDS

In [453]:
WWDB = pd.concat([
    #BINA,
    #IRB,
    #HU,
    #EDS, 
    BARC, 
    DEB
    ], axis=0)

WWfeatures = list(
    #set(BINA_features) & 
    #set(IRB_features)& 
    #set(HU_features)& 
    #set(EDS_features)& 
    set(BARC_features)& 
    set(DEB_features)
    )
    
print(WWfeatures)

make_a_model(WWDB[WWfeatures],WWDB["model"])

['La', 'Zr', 'Hf', 'MgK', 'Sm', 'Yb', 'Th', 'Ce', 'AlK', 'NaK', 'FeK', 'SiK', 'Eu', 'ZnKA', 'Cs', 'CaKA', 'Ba', 'Rb']
Recall mean: 0.8734375 Recall STD: 0.04787985451888925
Precision mean: 0.8526749769327895 Precision STD: 0.07277908323500434
F1- Score mean: 0.8486593808401967 F1- Score STD: 0.059978072280265596


### PIXE, EDS and ICP

In [454]:
WWDB = pd.concat([
    BINA,
    IRB,
    #HU,
    EDS, 
    #BARC, 
    DEB
    ], axis=0)

WWfeatures = list(
    set(BINA_features) & 
    set(IRB_features)& 
    #set(HU_features)& 
    set(EDS_features)& 
    #set(BARC_features)& 
    set(DEB_features)
    )
    
print(WWfeatures)

make_a_model(WWDB[WWfeatures],WWDB["model"])

['K KA', 'MgK', 'AlK', 'NaK', 'FeK', 'SiK', 'CaKA']
Recall mean: 0.8395454545454546 Recall STD: 0.03476901346775114
Precision mean: 0.8555780463427445 Precision STD: 0.03244546529786464
F1- Score mean: 0.8347315593333056 F1- Score STD: 0.037223680655200986


# Exploration

## Feature importance

Finding our BARC most important features

BARC

In [455]:
features = BARC_features
Tested_db = BARC
X_train, X_test, y_train, y_test = train_test_split(Tested_db[features], Tested_db["model"], test_size=test_ratio)
class_rf.fit(X_train, y_train)
db_importance = pd.DataFrame(class_rf.feature_importances_, index =features )
db_importance.sort_values(0,ascending=False).head(10)

Unnamed: 0,0
Sc,0.078338
ZnKA,0.070762
Ce,0.069732
Eu,0.064318
CoKA,0.063435
Sm,0.061708
Rb,0.055854
Hf,0.054132
Zr,0.051299
Cs,0.050013


NARD

In [456]:
features = NARD_features
Tested_db = NARD
X_train, X_test, y_train, y_test = train_test_split(Tested_db[features], Tested_db["model"], test_size=test_ratio)
class_rf.fit(X_train, y_train)
db_importance = pd.DataFrame(class_rf.feature_importances_, index =features )
db_importance.sort_values(0,ascending=False).head(10)

Unnamed: 0,0
ClK,0.103926
TiK,0.088571
SiK,0.088176
MnK,0.083864
AlK,0.081207
B,0.077404
Sm,0.073098
K KA,0.071787
FeK,0.068424
MgK,0.064471


BINA

In [457]:
features = BINA_features
Tested_db = BINA
X_train, X_test, y_train, y_test = train_test_split(Tested_db[features], Tested_db["model"], test_size=test_ratio)
class_rf.fit(X_train, y_train)
db_importance = pd.DataFrame(class_rf.feature_importances_, index =features )
db_importance.sort_values(0,ascending=False).head(11)

Unnamed: 0,0
AlK,0.104397
FeK,0.099599
K KA,0.099347
TiK,0.074233
MnK,0.073843
CaKB,0.072762
CaKA,0.071282
K KB,0.060092
SiK,0.054259
ZnKA,0.050558


IRB

In [458]:
features = IRB_features
Tested_db = IRB
X_train, X_test, y_train, y_test = train_test_split(Tested_db[features], Tested_db["model"], test_size=test_ratio)
class_rf.fit(X_train, y_train)
db_importance = pd.DataFrame(class_rf.feature_importances_, index =features )
db_importance.sort_values(0,ascending=False).head(10)

Unnamed: 0,0
CaKA,0.202504
FeK,0.17364
MnK,0.131388
K KA,0.129175
AlK,0.124826
MgK,0.108951
SiK,0.036916
NaK,0.036712
S K,0.02856
CrK,0.027328


HU

In [459]:
features = HU_features
Tested_db = HU
X_train, X_test, y_train, y_test = train_test_split(Tested_db[features], Tested_db["model"], test_size=test_ratio)
class_rf.fit(X_train, y_train)
db_importance = pd.DataFrame(class_rf.feature_importances_, index =features )
db_importance.sort_values(0,ascending=False).head(10)

Unnamed: 0,0
FeK,0.243283
K KA,0.215782
TiK,0.168393
SiK,0.132932
AlK,0.131628
CaKA,0.107982


ICP

In [460]:
features = DEB_features
Tested_db = DEB
X_train, X_test, y_train, y_test = train_test_split(Tested_db[features], Tested_db["model"], test_size=test_ratio)
class_rf.fit(X_train, y_train)
db_importance = pd.DataFrame(class_rf.feature_importances_, index =features )
db_importance.sort_values(0,ascending=False).head(10)

Unnamed: 0,0
CaKA,0.070997
Bi,0.042062
ZnKA,0.037384
Ce,0.03489
Rb,0.033729
Pb,0.031014
U,0.030298
Ga,0.029401
Th,0.028684
Nd,0.028571


EDS

In [461]:
features = EDS_features
Tested_db = EDS
X_train, X_test, y_train, y_test = train_test_split(Tested_db[features], Tested_db["model"], test_size=test_ratio)
class_rf.fit(X_train, y_train)
db_importance = pd.DataFrame(class_rf.feature_importances_, index =features )
db_importance.sort_values(0,ascending=False).head(10)

Unnamed: 0,0
AlK,0.190487
CaKA,0.183078
K KA,0.161981
MgK,0.142705
FeK,0.108545
NaK,0.10786
SiK,0.105343


## Combining the best of both

In [489]:
best_of = IRB.copy()
best_of = best_of[best_of['type'] == "surface"]
best_features = ['ZnKA','Ce','Zr','Rb','Eu','Sc','CoKA','La']
best_of[best_features] = np.array(BARC[best_features])
best_of_features = IRB_features + best_features

In [490]:
make_a_model(best_of[best_of_features],best_of["model"])

Recall mean: 0.915625 Recall STD: 0.06924537439425105
Precision mean: 0.887079613095238 Precision STD: 0.09414603400170155
F1- Score mean: 0.8955014776889774 F1- Score STD: 0.083616152466353


## Identifying features to improve EDS

### USING RBI

In [None]:
perform2 = dict()
for EDS_f_1 in EDS_features:
    for EDS_f_2 in EDS_features:
        print("Replacing", EDS_f_1, "and", EDS_f_2)
        EDS_cpy = EDS.copy()
        EDS_cpy = EDS_cpy.sort_values(by=['type', 'sample_id'])
        EDS_cpy = EDS_cpy.reset_index()
        EDS_cpy[[EDS_f_1, EDS_f_2]] = IRB_cpy[[EDS_f_1, EDS_f_2]]
        results = []
        test_case = []
        for i in range(iteration):
            X_train, X_test, y_train, y_test = train_test_split(EDS_cpy[EDS_features], EDS_cpy["model"], test_size=test_ratio)
            class_rf.fit(X_train, y_train)
            y_pred = class_rf.predict(X_test)
            results.append(pd.DataFrame(classification_report(y_test, y_pred, output_dict=True, zero_division=0)))
         #   test_case.append(class_rf.predict(HU_3_w_unknown[HU_3_w_unknown["type"]=="unknown"][HU_3_w_features]))

        precision = np.array([])
        recall = np.array([])
        f1score = np.array([])

        for i in range(iteration):
            precision = np.append(precision, results[i]["weighted avg"]["precision"])
            recall = np.append(recall, results[i]["weighted avg"]["recall"])
            f1score = np.append(f1score, results[i]["weighted avg"]["f1-score"])

        print("Recall mean:", recall.mean(), "Recall STD:", recall.std())
        print("Precision mean:", precision.mean(), "Precision STD:", precision.std())
        print("F1- Score mean:", f1score.mean(), "F1- Score STD:", f1score.std())
        perform2[EDS_f_1+"_"+EDS_f_2] = f1score.mean()