# Machine learning based approach for architecture detection from GitHub repositories [Coach Dataset]

```
Author: Gcinizwe Dlamini
```
<hr>

```
The notebook is dedicated to data extraction, EDA and classification

Main libraries used :     
- catboost
- pandas
- sklearn
- interpret
```

If catboost is not installed, run `%pip install catboost`

## Extract data from files

In [1]:
import os, glob
import pandas as pd
import numpy as np 

import warnings
warnings.filterwarnings("ignore")

In [2]:
def get_data_from_csv(train_path='../Data/coach_data.csv', print_stat=False):
    """Load a dataset from a single CSV file.

    Parameters
    ----------
    train_path : str
        Path of the CSV file to read.
    print_stat : bool, default False
        When True, print the number of rows and columns and, if the frame
        has a ``label`` column, the number of rows per label value.

    Returns
    -------
    pandas.DataFrame
        The file contents exactly as parsed by ``pd.read_csv``.
    """
    full_data = pd.read_csv(train_path)

    if print_stat:
        # Previously this flag was accepted but did nothing.
        print("rows: %d, columns: %d" % full_data.shape)
        if 'label' in full_data.columns:
            print(full_data['label'].value_counts().to_string())
    return full_data

In [3]:
# Read the Coach dataset, then assign each distinct label name an integer
# code following the sorted order returned by np.unique.
full_data = get_data_from_csv()
vals = np.unique(full_data.label).tolist()
mapping_labels = {name: code for name, code in zip(vals, np.arange(len(vals)))}
mapping_labels

{'MVC': 0, 'MVP': 1, 'MVVM': 2, 'NONE': 3}

In [4]:
# Encode the string labels as integer codes. A value that is not a key of
# mapping_labels (e.g. a label already encoded by an earlier run of this cell)
# is passed through unchanged, so re-running the cell no longer replaces
# every label with None.
full_data['label'] = full_data['label'].apply(lambda x : mapping_labels.get(x, x))

# Feature matrix (every column except 'label') and target vector.
Xtrain, ytrain = full_data.drop('label',axis=1).values, full_data.label.values

## Define, train and Evaluate ML models

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.model_selection import cross_val_score

In [6]:
## Cross Validation (5-fold, weighted F1) of the tree-based models.
# DecisionTree and RandomForest are seeded like the EBM so that a
# Restart & Run All reproduces the printed scores.
dt = DecisionTreeClassifier(random_state=1)
rf = RandomForestClassifier(random_state=1)
cb = CatBoostClassifier(verbose=False)
ebm = ExplainableBoostingClassifier(random_state=1)

# One score line per model, printed in the order: dt, rf, cb, ebm.
for model in (dt, rf, cb, ebm):
    scores = cross_val_score(model, Xtrain, ytrain, cv=5, scoring='f1_weighted')
    print("%0.3f F1 with a standard deviation of %0.2f \n" % (scores.mean(), scores.std()))

0.496 F1 with a standard deviation of 0.18 

0.582 F1 with a standard deviation of 0.11 

0.544 F1 with a standard deviation of 0.04 

0.575 F1 with a standard deviation of 0.14 



In [7]:
from sklearn.pipeline import make_pipeline
from sklearn.neural_network import MLPClassifier

# Standardised copy of the whole feature matrix. The pipelines below do their
# own scaling inside each CV fold and are fed the unscaled Xtrain.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(Xtrain)

nn_clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                hidden_layer_sizes=(5, 2), random_state=1)

# (header printed before the score, estimator). The first entry has no header.
scaled_models = [
    (None, LogisticRegression()),
    ("neural network", nn_clf),
    ("KNN", KNeighborsClassifier()),
    ("SVM", SVC()),
    ("GaussianNB", GaussianNB()),
]

## Cross Validation: StandardScaler + estimator, 5 folds, weighted F1
for header, estimator in scaled_models:
    if header is not None:
        print(header)
    clf = make_pipeline(StandardScaler(), estimator)
    scores = cross_val_score(clf, Xtrain, ytrain, cv=5, scoring='f1_weighted')
    print("%0.3f F1 with a standard deviation of %0.2f \n" % (scores.mean(), scores.std()))

0.539 F1 with a standard deviation of 0.18 

neural network
0.338 F1 with a standard deviation of 0.11 

KNN
0.583 F1 with a standard deviation of 0.11 

SVM
0.523 F1 with a standard deviation of 0.08 

GaussianNB
0.512 F1 with a standard deviation of 0.15 



## Coach + our approach

In [6]:
def get_data_from_csv(train_path='../Data/train.csv', test_path='../Data/test.csv', print_stat=False):
    """Load the train and test splits from two CSV files.

    NOTE: this definition reuses the name of the single-file loader defined
    earlier in the notebook and returns a tuple instead of one frame, so
    re-running the earlier loading cell after this one will break.

    Parameters
    ----------
    train_path : str
        Path of the training CSV file.
    test_path : str
        Path of the test CSV file.
    print_stat : bool, default False
        When True, print for each split its number of rows and columns and,
        if it has a ``label`` column, the number of rows per label value.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)`` exactly as parsed by ``pd.read_csv``.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    if print_stat:
        # Previously this flag was accepted but did nothing.
        for split_name, split_df in (("train", train_df), ("test", test_df)):
            print("%s rows: %d, columns: %d" % (split_name, split_df.shape[0], split_df.shape[1]))
            if 'label' in split_df.columns:
                print(split_df['label'].value_counts().to_string())
    return train_df, test_df

In [7]:
# Display the label-name -> integer-code mapping for reference.
mapping_labels

{'MVC': 0, 'MVP': 1, 'MVVM': 2, 'NONE': 3}

In [13]:
# Keep only the MVP / MVVM rows of the Coach data as the hold-out set.
# .copy() makes test_data an independent frame, so the label assignment below
# is not a chained assignment on a slice of full_data (the resulting
# SettingWithCopyWarning was hidden by warnings.filterwarnings("ignore")).
test_data = full_data.query('label == @mapping_labels.get("MVP") or label == @mapping_labels.get("MVVM")').copy()

# NOTE(review): mapping_labels is {'MVC': 0, 'MVP': 1, 'MVVM': 2, 'NONE': 3},
# so the table below relabels Coach code 2 (MVVM) as 'mvp' and code 1 (MVP) as
# 'mvvm'. That looks inverted relative to the names -- confirm against the
# label convention of train.csv / test.csv before trusting the hold-out
# scores. Behaviour is intentionally left unchanged here.
mapping_labels2 = {2:'mvp', 1: 'mvvm'}
mapping_labels3 = {"mvp":0,"mvvm":1}

# Coach integer code -> 'mvp'/'mvvm' name -> binary code, in a single pass.
test_data['label'] = test_data['label'].apply(lambda x : mapping_labels3.get(mapping_labels2.get(x)))

In [21]:
# Sanity check: (rows, columns) of the MVP / MVVM subset of full_data.
full_data.query('label == @mapping_labels.get("MVP") or label == @mapping_labels.get("MVVM")').shape

(21, 83)

In [14]:
# Load both CSV splits and encode their string labels as 0 / 1.
train_df, test_df = get_data_from_csv()
mapping_labels3 = {"mvp":0,"mvvm":1}

for split_df in (train_df, test_df):
    split_df['label'] = split_df.label.apply(lambda x : mapping_labels3.get(x))

# Both splits are pooled into a single training set: features are stacked
# row-wise, targets are joined end to end (train rows first, then test rows).
Xtrain = np.concatenate(
    [train_df.drop('label',axis=1).values, test_df.drop('label',axis=1).values], axis=0)
ytrain = np.concatenate([train_df.label.values, test_df.label.values])

# The hold-out set is the Coach MVP/MVVM subset prepared in an earlier cell.
# NOTE(review): .values discards column names, so this assumes test_data has
# its feature columns in the same order as train_df -- TODO confirm.
Xtest = test_data.drop('label',axis=1).values
ytest = test_data.label.values

In [17]:
# Feature names in the column order of Xtrain. They are obtained by dropping
# 'label' (the same way Xtrain was built) rather than with columns[:-1], which
# is only correct when 'label' happens to be the last column.
feature_names = train_df.columns.drop('label')

print("DecisionTreeClassifier")
dt = DecisionTreeClassifier(random_state=1)  # seeded for reproducible reports
dt.fit(Xtrain, ytrain)
ypred = dt.predict(Xtest)
print(classification_report(ytest, ypred))

## Cross Validation
dt = DecisionTreeClassifier(random_state=1)
scores = cross_val_score(dt, Xtrain, ytrain, cv=5, scoring='f1_weighted')
print("%0.3f F1 with a standard deviation of %0.2f \n" % (scores.mean(), scores.std()))


print("RandomForestClassifier")
rf = RandomForestClassifier(random_state=1)  # seeded for reproducible reports
rf.fit(Xtrain, ytrain)
ypred = rf.predict(Xtest)
print(classification_report(ytest, ypred))

# feature importance from Random Forest
rf_feature_importance = pd.DataFrame({'feature_importance_rf': rf.feature_importances_,
                                      'feature_names': feature_names}).sort_values(by=['feature_importance_rf'],
                                        ascending=False)

## Cross Validation
rf = RandomForestClassifier(random_state=1)
scores = cross_val_score(rf, Xtrain, ytrain, cv=5, scoring='f1_weighted')
print("%0.3f F1 with a standard deviation of %0.2f \n" % (scores.mean(), scores.std()))

print("CatBoostClassifier")
cb = CatBoostClassifier(verbose=False)
cb.fit(Xtrain, ytrain)
ypred = cb.predict(Xtest)
print(classification_report(ytest, ypred))

# feature importance from CatBoost Classifier
feature_importance = pd.DataFrame({'feature_importance_cb': cb.feature_importances_,
                                      'feature_names': feature_names}).sort_values(by=['feature_importance_cb'],
                                        ascending=False)

## Cross Validation CatBoost Classifier
cb = CatBoostClassifier(verbose=False)
scores = cross_val_score(cb, Xtrain, ytrain, cv=5, scoring='f1_weighted')
print("%0.3f F1 with a standard deviation of %0.2f \n" % (scores.mean(), scores.std()))

print("ExplainableBoostingClassifier")
ebm = ExplainableBoostingClassifier(random_state=1)
# Fit on the pooled train_df + test_df rows, i.e. the same rows as
# Xtrain / ytrain used by every other model in this cell (the EBM used to be
# fitted on train_df alone). The data is still passed as a DataFrame so the
# column names remain available to the model.
ebm_train_df = pd.concat([train_df, test_df], ignore_index=True)
ebm.fit(ebm_train_df.drop('label',axis=1), ebm_train_df['label'].values)
pred = ebm.predict(Xtest)
print(classification_report(ytest, pred))

# feature importance from Explainable Boosting Classifier
ebm_feature_importance = pd.DataFrame({'feature_importance_ebm': ebm.feature_importances_,
                                      'feature_names': ebm.feature_names}).sort_values(by=['feature_importance_ebm'],
                                        ascending=False)

## Cross Validation
ebm = ExplainableBoostingClassifier(random_state=1)
scores = cross_val_score(ebm, Xtrain, ytrain, cv=5, scoring='f1_weighted')
print("%0.3f F1 with a standard deviation of %0.2f \n" % (scores.mean(), scores.std()))

DecisionTreeClassifier
              precision    recall  f1-score   support

           0       0.67      0.67      0.67         9
           1       0.75      0.75      0.75        12

    accuracy                           0.71        21
   macro avg       0.71      0.71      0.71        21
weighted avg       0.71      0.71      0.71        21

0.702 F1 with a standard deviation of 0.01 

RandomForestClassifier
              precision    recall  f1-score   support

           0       0.89      0.89      0.89         9
           1       0.92      0.92      0.92        12

    accuracy                           0.90        21
   macro avg       0.90      0.90      0.90        21
weighted avg       0.90      0.90      0.90        21

0.801 F1 with a standard deviation of 0.01 

CatBoostClassifier
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      1.00      1.00        12

    accuracy               

In [18]:
from sklearn.pipeline import make_pipeline
from sklearn.neural_network import MLPClassifier


def _holdout_then_cv(title, holdout_model, cv_model):
    """Print a hold-out report for one model, then its 5-fold CV score.

    `holdout_model` is fitted on X_train_std / ytrain and evaluated on
    X_test_std / ytest. `cv_model` is wrapped in a StandardScaler pipeline and
    cross-validated (weighted F1) on the unscaled Xtrain / ytrain.

    Returns (hold-out predictions, CV pipeline, per-fold scores).
    """
    print(title)
    predictions = holdout_model.fit(X_train_std, ytrain).predict(X_test_std)
    print(classification_report(ytest, predictions))

    pipeline = make_pipeline(StandardScaler(), cv_model)
    fold_scores = cross_val_score(pipeline, Xtrain, ytrain, cv=5, scoring='f1_weighted')
    print("%0.3f F1 with a standard deviation of %0.2f \n" % (fold_scores.mean(), fold_scores.std()))
    return predictions, pipeline, fold_scores


# Scale with statistics of the training rows only, then apply to the hold-out rows.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(Xtrain)
X_test_std = scaler.transform(Xtest)

lr = LogisticRegression()
ypred, clf, scores = _holdout_then_cv("LogisticRegression", lr, LogisticRegression())

# Two MLPs with identical settings: one is fitted for the hold-out report,
# the other (kept as nn_clf afterwards) goes into the CV pipeline.
mlp_kwargs = dict(solver='lbfgs', alpha=1e-5,
                  hidden_layer_sizes=(5, 2), random_state=1)
nn_clf = MLPClassifier(**mlp_kwargs)
ypred, clf, scores = _holdout_then_cv("neural network", MLPClassifier(**mlp_kwargs), nn_clf)

knn = KNeighborsClassifier()
ypred, clf, scores = _holdout_then_cv("KNN", knn, KNeighborsClassifier())

svmclf = SVC()
ypred, clf, scores = _holdout_then_cv("SVM", svmclf, SVC())

gnb = GaussianNB()
y_pred, clf, scores = _holdout_then_cv("GaussianNB", gnb, GaussianNB())

LogisticRegression
              precision    recall  f1-score   support

           0       0.80      0.89      0.84         9
           1       0.91      0.83      0.87        12

    accuracy                           0.86        21
   macro avg       0.85      0.86      0.86        21
weighted avg       0.86      0.86      0.86        21

0.819 F1 with a standard deviation of 0.01 

neural network
              precision    recall  f1-score   support

           0       1.00      0.89      0.94         9
           1       0.92      1.00      0.96        12

    accuracy                           0.95        21
   macro avg       0.96      0.94      0.95        21
weighted avg       0.96      0.95      0.95        21

0.815 F1 with a standard deviation of 0.00 

KNN
              precision    recall  f1-score   support

           0       0.88      0.78      0.82         9
           1       0.85      0.92      0.88        12

    accuracy                           0.86        21
