In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [2]:
df = pd.read_csv("./avila/training_dataset.csv")

In [3]:
df_test = pd.read_csv("./avila/test_dataset.csv")

In [4]:
df.rename(columns={"F1": "intercolumnar distance",
"F2": "upper margin",
"F3": "lower margin",
"F4": "exploitation",
"F5": "row number",
"F6": "modular ratio",
"F7": "interlinear spacing",
"F8": "weight",
"F9": "peak number",
"F10": "modular ratio/ interlinear spacing"}, inplace=True)

In [5]:
df.head()

Unnamed: 0,id,intercolumnar distance,upper margin,lower margin,exploitation,row number,modular ratio,interlinear spacing,weight,peak number,modular ratio/ interlinear spacing,scribe
0,0,-0.091897,0.2976,0.079145,0.196496,0.261718,1.26996,0.446679,-0.751707,0.001721,0.998901,Philippus
1,1,-0.091897,0.226939,0.267634,0.024091,0.261718,-0.806282,0.597681,-0.601277,0.126447,-0.909619,Paithonius
2,2,0.167323,0.313302,0.168055,-0.383198,0.261718,0.190314,0.824183,0.55825,-0.247731,-0.148073,Marcus
3,3,-0.017834,-0.22843,0.37077,1.293671,0.17234,0.896237,0.182426,0.416867,1.373706,0.868284,Noaelius
4,4,0.043885,0.407516,-0.120014,0.281743,0.261718,-0.183409,0.106925,0.142896,0.531806,-0.101311,Marcus


In [6]:
df.isna().sum()

id                                    0
intercolumnar distance                0
upper margin                          0
lower margin                          0
exploitation                          0
row number                            0
modular ratio                         0
interlinear spacing                   0
weight                                0
peak number                           0
modular ratio/ interlinear spacing    0
scribe                                0
dtype: int64

In [7]:
df["scribe"].unique()

array(['Philippus', 'Paithonius', 'Marcus', 'Noaelius', 'Begonius',
       'Franciscus', 'Ubuntius', 'FerrumEffractarius'], dtype=object)

In [8]:
df["scribe"].replace(to_replace={"Philippus" : 0, "Paithonius" : 1, "Marcus" : 2, "Noaelius" : 3, "Begonius" : 4
                                    , "Franciscus" : 5, "Ubuntius" : 6, "FerrumEffractarius" : 7}, inplace=True)

In [9]:
X = df[["id", "intercolumnar distance", "upper margin", "lower margin", "exploitation", "row number", "modular ratio", "interlinear spacing", "weight", "peak number", "modular ratio/ interlinear spacing"]]

In [10]:
y = df["scribe"]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [12]:
clf = GradientBoostingClassifier().fit(X_train, y_train)

In [13]:
clf.score(X_test, y_test)

0.9463394342762064

In [14]:
models = {
    "GradientBoostingClassifier" : GradientBoostingClassifier() ,
    "RandomForestClassifier" : RandomForestClassifier(),
    "AdaBoostClassifier" : AdaBoostClassifier(),
    "SVC" : SVC()
}

for modelName, model in models.items():
    print(f"Training model: {modelName}")
    model.fit(X_train, y_train)

Training model: GradientBoostingClassifier
Training model: RandomForestClassifier
Training model: AdaBoostClassifier




Training model: SVC




In [15]:
d = {modelName:model.predict(X_test) for modelName, model in models.items()}
table = pd.DataFrame(d)
table["gt"] = y_test.reset_index(drop=True)
table

Unnamed: 0,GradientBoostingClassifier,RandomForestClassifier,AdaBoostClassifier,SVC,gt
0,3,3,2,2,3
1,4,4,4,2,4
2,5,2,2,2,5
3,4,4,4,2,4
4,0,0,2,2,0
...,...,...,...,...,...
2399,2,2,2,2,2
2400,2,2,2,2,2
2401,5,5,2,2,5
2402,0,0,2,2,0


In [16]:
for modelName, model in models.items():    
    print(f"Evaluating model [{modelName}]:")
    print("Accuracy", accuracy_score(table["gt"], table[modelName]))
    print("Precision", precision_score(table["gt"], table[modelName], average='weighted'))
    print("Recall", recall_score(table["gt"], table[modelName], average='weighted'))

Evaluating model [GradientBoostingClassifier]:
Accuracy 0.9463394342762064
Precision 0.9469894306924242
Recall 0.9463394342762064
Evaluating model [RandomForestClassifier]:
Accuracy 0.9542429284525791
Precision 0.9546880950472497
Recall 0.9542429284525791
Evaluating model [AdaBoostClassifier]:
Accuracy 0.4933444259567388
Precision 0.2976132226996867
Recall 0.4933444259567388
Evaluating model [SVC]:
Accuracy 0.39933444259567386
Precision 0.27312395457607563
Recall 0.39933444259567386


  'precision', 'predicted', average, warn_for)


In [17]:
clf = GradientBoostingClassifier(n_estimators=500, learning_rate=0.3).fit(X_train, y_train)

In [18]:
clf.score(X_test, y_test)

0.9979201331114809

In [19]:
clf_final = GradientBoostingClassifier(n_estimators=500, learning_rate=0.3).fit(X, y)

In [20]:
preds = clf.predict(df_test)

In [21]:
df_test

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10
0,0,-0.042522,0.007106,0.264078,-0.162546,0.172340,-1.055431,0.106925,0.680858,0.781258,-0.901193
1,1,0.315450,0.540986,0.029355,-0.395163,0.261718,-1.055431,0.182426,-0.643408,0.095265,-0.925038
2,2,-0.351118,-0.244132,0.594823,1.592678,0.797987,0.397939,-1.025587,1.157371,1.591976,1.278013
3,3,0.327793,0.336855,-0.020434,1.086893,0.261718,-0.307984,0.220177,0.449873,-0.528364,-0.276773
4,4,0.019197,-0.087108,0.384996,0.794958,0.261718,0.314889,-0.044076,-1.067421,-0.715453,0.440809
...,...,...,...,...,...,...,...,...,...,...,...
8007,8007,0.364825,0.368260,0.192950,-0.420406,0.261718,-1.096956,0.635431,0.086146,0.375899,-1.167768
8008,8008,0.154980,0.234790,0.210732,0.643738,0.172340,0.896237,0.257927,0.537379,0.126447,0.772337
8009,8009,0.080916,0.101320,0.104040,0.140490,0.261718,-0.100360,0.220177,-0.992982,-1.339082,-0.102733
8010,8010,-0.017834,-0.330495,0.488131,-0.183313,0.172340,-0.515608,0.333428,-0.079255,-0.278912,-0.521155


In [22]:
submit = df_test["id"]
preds

array([5, 6, 3, ..., 2, 0, 5])

In [23]:
submit = df_test["id"]
predicted = {"scribe" : preds}

In [24]:
predicted = pd.DataFrame(predicted)

In [25]:
submit = pd.DataFrame(submit)

In [26]:
submit["scribe"] = predicted

In [27]:
submit["scribe"].replace(to_replace={0 : "Philippus", 1 : "Paithonius", 2 : "Marcus", 3 : "Noaelius", 4 : "Begonius"
                                    , 5 : "Franciscus", 6 : "Ubuntius", 7 : "FerrumEffractarius"}, inplace=True)

In [31]:
submit.set_index("id", inplace=True)

In [32]:
submit.to_csv("./avila/predictions.csv")