In [1]:
### Load the packages and data we need. This part uses Yimeng's data because it already contains the labels.
import pandas as pd
import xgboost as xgb

# Location of the labelled domain list, relative to this notebook.
path = '../../data/initial_data.csv'

# header=0: the first CSV row supplies the column names.
# index_col=False: no CSV column is used as the row index, so the file's
# unnamed first column stays in the frame as an ordinary data column.
df = pd.read_csv(
    path,
    index_col=False,
    header=0,
)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,domain,domain_type
0,wisuolycossttqrj.com,dga
1,wi-wamss.org,benign
2,qcxfurnkbqidxxcl.biz,dga
3,192-168-1-1-admin.ru,benign
4,dblsiobnkjxomkmh.ru,dga


In [4]:
### Feel free to customize this part yourself; the feature below is just an example.
def generate_feature(df):
    """Return a copy of ``df`` with a ``feature`` column holding each domain's length.

    The input frame is not modified. Returning a new frame keeps the notebook
    cell safe to re-run and avoids silently changing a frame that an earlier
    cell has already displayed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``domain`` column of strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with every original column plus ``feature``, the number
        of characters in ``domain`` (NaN where ``domain`` is missing). An
        existing ``feature`` column is overwritten in the returned copy.

    Raises
    ------
    KeyError
        If ``df`` has no ``domain`` column.
    """
    # The length of the domain name is the (single) example feature.
    # .str.len() is the vectorized form of calling len() on every value.
    return df.assign(feature=df['domain'].str.len())
# Attach the example feature column, then preview the first five rows.
df = df.pipe(generate_feature)
df.head(n=5)

Unnamed: 0,domain,domain_type,feature
0,wisuolycossttqrj.com,dga,20
1,wi-wamss.org,benign,12
2,qcxfurnkbqidxxcl.biz,dga,20
3,192-168-1-1-admin.ru,benign,20
4,dblsiobnkjxomkmh.ru,dga,19


In [5]:
### Machine learning models normally cannot handle text labels, so we map the label to {0, 1} (1 = benign, 0 = everything else)
def data_transformation(df):
    """Return a copy of ``df`` with ``domain_type`` encoded as integers.

    ``'benign'`` becomes 1 and every other value becomes 0. A value that is
    already 1 is kept as 1, so applying the function to its own output is a
    no-op. Without that, re-running the notebook cell would compare the
    integers against ``'benign'`` and silently turn every label into 0.

    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``domain_type`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame whose ``domain_type`` column is int64 with values in {0, 1}.

    Raises
    ------
    KeyError
        If ``df`` has no ``domain_type`` column.
    """
    def encode(label):
        # `label == 1` recognises a label that was already encoded as benign.
        return 1 if (label == 'benign' or label == 1) else 0

    # astype pins the dtype even for an empty column, where map() cannot
    # infer an integer type from the results.
    encoded = df['domain_type'].map(encode).astype('int64')
    return df.assign(domain_type=encoded)
# Encode the text labels as integers, then preview the first five rows.
df = df.pipe(data_transformation)
df.head(n=5)

Unnamed: 0,domain,domain_type,feature
0,wisuolycossttqrj.com,0,20
1,wi-wamss.org,1,12
2,qcxfurnkbqidxxcl.biz,0,20
3,192-168-1-1-admin.ru,1,20
4,dblsiobnkjxomkmh.ru,0,19


In [6]:
### Create the feature set X and the label vector y; the original domain string is not part of the feature set
# X is the single engineered feature column; y is the encoded label column.
X, y = df['feature'], df['domain_type']

In [18]:
### random_state controls the randomness of the train/test split; fixing it makes the split reproducible, so you can otherwise ignore it
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X.values,
    y.values,
    random_state=42,
    test_size=0.2,
)

In [21]:
### Train our model. This works much like the models in sklearn; xgboost provides other interfaces too, but I chose the one you are familiar with
import numpy as np
# The split produced 1-D feature arrays; reshape each into a single-column
# 2-D matrix (n_samples, 1) before handing it to the classifier.
X_train, X_test = (np.reshape(part, (-1, 1)) for part in (X_train, X_test))

# Fit an XGBoost classifier with default settings on the training split.
xgbmodel = xgb.XGBClassifier().fit(
    X_train,
    y_train,
)

# Predicted labels for the held-out rows.
predictions = xgbmodel.predict(X_test)

In [23]:
from sklearn.metrics import precision_recall_fscore_support
# average=None reports the scores per class instead of one averaged number:
# the result is a tuple of (precision, recall, f-score, support) arrays.
# NOTE(review): entries should follow sorted label order, i.e. index 0 is
# label 0 (non-benign) and index 1 is label 1 (benign) - confirm.
precision_recall_fscore_support(
    y_true=y_test,
    y_pred=predictions,
    average=None,
)

(array([0.92582418, 1.        ]),
 array([1.        , 0.91809909]),
 array([0.96148359, 0.957301  ]),
 array([2022, 1978], dtype=int64))