# Machine learning based apprpoach for achitecture detection from GitHub repositories [Coach Dataset]

```
Author: Gcinizwe Dlamini
```
<hr>

```
The notebook is dedicated to data extraction, EDA and classisification

Main libraries used :     
- catboost
- pandas
- sklearn
- interpret
```

If catboost not installed run  `!pip install catboost`

## Extract data from files

In [1]:
import os, glob
import pandas as pd
import numpy as np 

import warnings
warnings.filterwarnings("ignore")

In [2]:
def get_data_from_csv(train_path='../Data/coach_data.csv', print_stat=False):
    full_data = pd.read_csv(train_path)

    if print_stat:
        pass
    return full_data

In [3]:
full_data  = get_data_from_csv()
vals = np.unique(full_data.label).tolist()
mapping_labels = dict(zip(vals,np.arange(len(vals)))) 
mapping_labels

{'MVC': 0, 'MVP': 1, 'MVVM': 2, 'NONE': 3}

In [4]:
full_data['label'].apply(lambda x : mapping_labels.get(x) )

0     3
1     0
2     0
3     2
4     1
     ..
64    0
65    3
66    1
67    0
68    0
Name: label, Length: 69, dtype: int64

In [5]:
full_data['label'] = full_data['label'].apply(lambda x : mapping_labels.get(x) )

Xtrain, ytrain = full_data.drop('label',axis=1).values, full_data.label.values

## Define, train and Evaluate ML models

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.model_selection import cross_val_score

In [9]:
## Cross Validation 
dt = DecisionTreeClassifier()
scores = cross_val_score(dt, Xtrain, ytrain, cv=5, scoring='f1_weighted')
print("%0.3f F1 with a standard deviation of %0.2f \n" % (scores.mean(), scores.std()))


## Cross Validation 
rf = RandomForestClassifier()
scores = cross_val_score(rf, Xtrain, ytrain, cv=5, scoring='f1_weighted')
print("%0.3f F1 with a standard deviation of %0.2f \n" % (scores.mean(), scores.std()))


## Cross Validation CatBoost Classifier
cb = CatBoostClassifier(verbose=False)
scores = cross_val_score(cb, Xtrain, ytrain, cv=5, scoring='f1_weighted')
print("%0.3f F1 with a standard deviation of %0.2f \n" % (scores.mean(), scores.std()))


## Cross Validation 
ebm = ExplainableBoostingClassifier(random_state=1)
scores = cross_val_score(ebm, Xtrain, ytrain, cv=5, scoring='f1_weighted')
print("%0.3f F1 with a standard deviation of %0.2f \n" % (scores.mean(), scores.std()))

0.434 F1 with a standard deviation of 0.10 

0.529 F1 with a standard deviation of 0.10 

0.544 F1 with a standard deviation of 0.04 

0.575 F1 with a standard deviation of 0.14 



In [10]:
from sklearn.pipeline import make_pipeline
scaler = StandardScaler()
X_train_std = scaler.fit_transform(Xtrain)


## Cross Validation LR
clf = make_pipeline(StandardScaler(), LogisticRegression())
scores = cross_val_score(clf, Xtrain, ytrain, cv=5, scoring='f1_weighted')
print("%0.3f F1 with a standard deviation of %0.2f \n" % (scores.mean(), scores.std()))


print("neural network")
from sklearn.neural_network import MLPClassifier
## Cross Validation nn
nn_clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                hidden_layer_sizes=(5, 2), random_state=1)
clf = make_pipeline(StandardScaler(), nn_clf)
scores = cross_val_score(clf, Xtrain, ytrain, cv=5, scoring='f1_weighted')
print("%0.3f F1 with a standard deviation of %0.2f \n" % (scores.mean(), scores.std()))


print("KNN")
## Cross Validation 
clf = make_pipeline(StandardScaler(), KNeighborsClassifier())
scores = cross_val_score(clf, Xtrain, ytrain, cv=5, scoring='f1_weighted')
print("%0.3f F1 with a standard deviation of %0.2f \n" % (scores.mean(), scores.std()))

print("SVM")
## Cross Validation SVM
clf = make_pipeline(StandardScaler(), SVC())
scores = cross_val_score(clf, Xtrain, ytrain, cv=5, scoring='f1_weighted')
print("%0.3f F1 with a standard deviation of %0.2f \n" % (scores.mean(), scores.std()))

print("GaussianNB")
## Cross Validation NB
clf = make_pipeline(StandardScaler(), GaussianNB())
scores = cross_val_score(clf, Xtrain, ytrain, cv=5, scoring='f1_weighted')
print("%0.3f F1 with a standard deviation of %0.2f \n" % (scores.mean(), scores.std()))

0.539 F1 with a standard deviation of 0.18 

neural network
0.338 F1 with a standard deviation of 0.11 

KNN
0.583 F1 with a standard deviation of 0.11 

SVM
0.523 F1 with a standard deviation of 0.08 

GaussianNB
0.512 F1 with a standard deviation of 0.15 

