## Import dataset

In [None]:
import pandas as pd

# Open the workbook once and read both sheets through the same handle.
with pd.ExcelFile('./QICC-Bots-DataSet.xlsx') as workbook:
    train_data_df = pd.read_excel(workbook, sheet_name='Labelled_Data')
    test_data_df = pd.read_excel(workbook, sheet_name='Test_Data')

# Labelled rows first, then the test rows, renumbered 0..n-1 so the feature
# cells below can process both sheets in a single frame.
all_data_df = pd.concat([train_data_df, test_data_df], ignore_index=True)

In [None]:
# (rows, columns) of each frame, in the order: test, train, combined.
for frame in (test_data_df, train_data_df, all_data_df):
    print(frame.shape)

In [None]:
# Show the first labelled row as a Series to see which columns the sheet provides.
train_data_df.iloc[0, :]

## Features

In [None]:
import emoji

In [None]:
# Replace rare App values (seen fewer than twice) with the word 'other'.
# `d` maps each distinct App value to True when it occurs at least twice.
d = all_data_df['App'].value_counts() >= 2
# Look the flag up with .map rather than d[i]: value_counts() skips missing
# values, so indexing `d` with a NaN App raised KeyError. Rows without a match
# map to NaN, and .eq(True) turns those into False so they also become 'other'.
is_common_app = all_data_df['App'].map(d).eq(True)
all_data_df['App'] = all_data_df['App'].where(is_common_app, 'other')

In [None]:
# Gap between the tweet timestamp ("Date") and the account creation timestamp
# ("User Since"). The previous chained assignment (a = b = c) copied
# "User Since" into both columns, overwriting "Date" and never subtracting.
# pd.to_datetime lets the subtraction work even if a column was read as text.
tweet_time = pd.to_datetime(all_data_df["Date"])
account_created = pd.to_datetime(all_data_df["User Since"])
# Stored as float seconds so the column is plain numeric for .corr() and the estimators.
all_data_df["Diff_tweetdate_createddate"] = (tweet_time - account_created).dt.total_seconds()

In [None]:
# 1 when the "Link(s)" cell holds a value, else 0. Empty spreadsheet cells are
# loaded by pandas as NaN rather than '', and NaN == '' is False, so the old
# check flagged every empty cell as present. Treat both NaN and '' as missing.
link_values = all_data_df["Link(s)"]
all_data_df["has_link"] = (link_values.notna() & link_values.ne('')).astype(int)

In [None]:
# 1 when the "Media" cell holds a value, else 0. Empty spreadsheet cells are
# loaded by pandas as NaN rather than '', and NaN == '' is False, so the old
# check flagged every empty cell as present. Treat both NaN and '' as missing.
media_values = all_data_df["Media"]
all_data_df["has_media"] = (media_values.notna() & media_values.ne('')).astype(int)

In [None]:
# 1 when the "Verfied" cell (column name as spelled in the code's dataset
# lookup) holds a value, else 0. Empty spreadsheet cells are loaded by pandas
# as NaN rather than '', and NaN == '' is False, so the old check flagged every
# empty cell as verified. Treat both NaN and '' as missing.
# NOTE(review): any non-empty value counts as verified, including a literal
# False/0 if the sheet stores one - confirm how the column is encoded.
verified_values = all_data_df["Verfied"]
all_data_df["is_verified"] = (verified_values.notna() & verified_values.ne('')).astype(int)

In [None]:
# 1 when the "Location_1" cell holds a value, else 0. Empty spreadsheet cells
# are loaded by pandas as NaN rather than '', and NaN == '' is False, so the old
# check flagged every empty cell as present. Treat both NaN and '' as missing.
location_values = all_data_df["Location_1"]
all_data_df["has_location"] = (location_values.notna() & location_values.ne('')).astype(int)

In [None]:
# 1 when the "Bio" cell holds a value, else 0. Empty spreadsheet cells are
# loaded by pandas as NaN rather than '', and NaN == '' is False, so the old
# check flagged every empty cell as present. Treat both NaN and '' as missing.
bio_values = all_data_df["Bio"]
all_data_df["has_bio"] = (bio_values.notna() & bio_values.ne('')).astype(int)

In [None]:
# 1 when the "Website" cell holds a value, else 0. Empty spreadsheet cells are
# loaded by pandas as NaN rather than '', and NaN == '' is False, so the old
# check flagged every empty cell as present. Treat both NaN and '' as missing.
website_values = all_data_df["Website"]
all_data_df["has_website"] = (website_values.notna() & website_values.ne('')).astype(int)

In [None]:
import re
# Number of non-overlapping "#word" matches in each tweet.
hashtag_pattern = re.compile(r"#(\w+)")
all_data_df["num_hashtags"] = all_data_df["Tweet Text"].map(
    lambda tweet: len(hashtag_pattern.findall(tweet))
)

In [None]:
# True when the tweet contains an Arabic letter (ء-ي) followed by '.', with a
# ':' somewhere later on the same line - the regex looks for text between them.
quote_pattern = re.compile(r"(?<=[ء-ي]\.)(.*)(?=:)")
all_data_df["has_quote"] = all_data_df["Tweet Text"].map(
    lambda tweet: quote_pattern.search(tweet) is not None
)

In [None]:
# True when the tweet text contains at least one ':' character.
all_data_df["has_:"] = all_data_df["Tweet Text"].map(
    lambda tweet: re.search(r":", tweet) is not None
)

In [None]:
def count_emojis(string):
    """Return how many characters of ``string`` are emojis on their own.

    Every character is tested individually with ``emoji.is_emoji``, so the
    result is a per-character tally.
    """
    return sum(1 for character in string if emoji.is_emoji(character))


# Per-tweet emoji tally. The column keeps the spelling "num_enojis" because the
# display cell and the feature selection refer to it by that name.
all_data_df["num_enojis"] = all_data_df["Tweet Text"].map(count_emojis)

In [None]:
# Display the emoji-count column to sanity-check the values.
all_data_df.loc[:, "num_enojis"]

In [None]:
# Followers divided by Follows, as the column name states. The previous
# expression divided the other way round (Follows / Followers), so the stored
# values were the inverse of what the column label advertises.
# A zero "Follows" yields inf (or NaN for 0/0), so sentinel handling is still needed downstream.
all_data_df["FollowerstoFollows_Ratio"] = all_data_df["Followers"] / all_data_df["Follows"]

In [None]:
import numpy as np
# Swap the non-finite ratios for sentinels: +inf becomes -2, missing (NaN) becomes -1.
ratio = all_data_df["FollowerstoFollows_Ratio"]
all_data_df["FollowerstoFollows_Ratio"] = ratio.replace(np.inf, -2).fillna(-1)

In [None]:
# Columns that form the candidate feature matrix: the engineered flags/counts
# followed by raw numeric columns taken straight from the sheet.
# NOTE(review): 0.181742536 is used as a column *label* here - presumably a
# spreadsheet column whose header cell holds a number; confirm against the dataset.
feature_columns = [
    "Diff_tweetdate_createddate",
    "has_link",
    "has_media",
    "is_verified",
    "has_location",
    "has_bio",
    "has_website",
    "num_hashtags",
    "has_quote",
    "has_:",
    "num_enojis",
    "FollowerstoFollows_Ratio",
    "Retweets",
    "Favorites",
    "Listed",
    0.181742536,
    "Followers",
    "Follows",
]
all_data_features = all_data_df.loc[:, feature_columns]

In [None]:
# List the selected feature column labels.
all_data_features.columns

In [None]:
# Attach the bot/human label as the last column (aligned on the row index) so
# it shows up in the correlation matrix alongside the features.
label_column = "Is_Bot ( 1 for Bot / 0 for Human)"
all_data_features_with_labels = all_data_features.join(all_data_df[[label_column]])

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Annotated correlation heatmap of every feature plus the label.
fig, ax = plt.subplots(figsize=(16, 8))
correlations = all_data_features_with_labels.corr()
sns.heatmap(correlations, cmap='coolwarm', annot=True, ax=ax)
fig.tight_layout()
plt.show()

In [None]:
# Columns left out of the modelling matrix (presumably chosen from the
# correlation heatmap - the reasoning is not recorded in the notebook).
columns_to_drop = ["has_location", "Follows", 0.181742536, "FollowerstoFollows_Ratio"]
all_data_features_after_corr = all_data_features.drop(columns_to_drop, axis=1)

In [None]:
# Split the combined frame back into its two sheets by position: the first
# len(train_data_df) rows came from 'Labelled_Data', the rest from 'Test_Data'.
# .iloc is used because .loc slices by label and includes BOTH endpoints:
# .loc[0:n] returned n + 1 rows, so the first test row leaked into the training
# set and appeared in both splits.
n_train_rows = len(train_data_df)

train_features = all_data_features_after_corr.iloc[:n_train_rows]
test_features = all_data_features_after_corr.iloc[n_train_rows:]

all_labels = all_data_df["Is_Bot ( 1 for Bot / 0 for Human)"]
train_label = all_labels.iloc[:n_train_rows]
test_label = all_labels.iloc[n_train_rows:]

In [None]:
import os

# Create the output folder from Python instead of `!mkdir data`: this does not
# depend on a shell being available and does not complain when the folder
# already exists on a re-run.
os.makedirs("./data", exist_ok=True)

# Persist every intermediate frame/series so later runs can reload them.
csv_exports = (
    ("./data/all_data.csv", all_data_df),
    ("./data/all_data_features_after_corr.csv", all_data_features_after_corr),
    ("./data/all_data_features.csv", all_data_features),
    ("./data/train_features.csv", train_features),
    ("./data/test_features.csv", test_features),
    ("./data/train_label.csv", train_label),
    ("./data/test_label.csv", test_label),
)
for csv_path, table in csv_exports:
    table.to_csv(csv_path)

## Training

In [None]:
from sklearn.metrics import precision_score, recall_score , f1_score , classification_report,precision_recall_fscore_support,accuracy_score

def train_model(classifier, feature_vector_train, label, feature_vector_valid,valid_y, epochs=1, show_report=True, print_pred=False):
    """Fit ``classifier`` on the training data and score it on the validation data.

    Args:
        classifier: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        feature_vector_train: training features handed to ``fit``.
        label: training targets handed to ``fit``.
        feature_vector_valid: validation features handed to ``predict``.
        valid_y: true targets for the validation features.
        epochs: accepted but not used by this function.
        show_report: when true, print ``classification_report`` for the validation set.
        print_pred: when true, print the raw predictions.

    Returns:
        ``(accuracy, prfs)``: the ``accuracy_score`` value and the result of
        ``precision_recall_fscore_support`` for the validation predictions.
    """
    classifier.fit(feature_vector_train, label)

    # Score the fitted estimator on the held-out rows.
    predicted = classifier.predict(feature_vector_valid)

    if print_pred:
        print(predicted)
    if show_report:
        print(classification_report(valid_y, predicted))

    accuracy = accuracy_score(valid_y, predicted)
    per_class_scores = precision_recall_fscore_support(valid_y, predicted)
    return accuracy, per_class_scores

In [None]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn import decomposition, ensemble
import xgboost

## Logistic Regression

In [None]:
# Baseline: logistic regression with scikit-learn's default settings.
accuracy, prfs = train_model(
    linear_model.LogisticRegression(),
    train_features, train_label,
    test_features, test_label,
    print_pred=False,
)
# The label previously read "NB, Count Vectors", which describes a different
# model and feature set than the one trained here.
print("Logistic Regression accuracy: ", accuracy)

## Random Forest

In [None]:
# Random forest over all features per split (max_features=None).
# random_state=17 matches the seed used by the forests in the ensemble section,
# so re-running the cell reproduces the same model; n_jobs=-1 only parallelises
# tree building across cores.
model = ensemble.RandomForestClassifier(
    n_estimators=10000,
    criterion='gini',
    max_features=None,
    n_jobs=-1,
    random_state=17,
)
accuracy, prfs = train_model(
    model,
    train_features, train_label,
    test_features, test_label,
    print_pred=False,
)
# The label previously read "NB, Count Vectors", which describes a different
# model and feature set than the one trained here.
print("Random Forest accuracy: ", accuracy)

In [None]:
import pickle
# Persist the fitted forest. The `with` block closes (and flushes) the file
# handle; the bare open() used before was never closed.
with open("RandomForest.model", 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Reload the forest saved above; the `with` block closes the file handle that
# the bare open() used to leave open.
# Security: unpickling runs arbitrary code - only load model files you created yourself.
with open("RandomForest.model", 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Check that the reloaded model still scores the test split as expected.
predictions = loaded_model.predict(test_features)
reloaded_report = classification_report(test_label, predictions)
print(reloaded_report)

In [None]:
# Support vector machine with an RBF kernel and a 1024 MB kernel cache.
svm_classifier = svm.SVC(kernel='rbf', decision_function_shape='ovo', cache_size=1024)
Saccuracy, prfs = train_model(
    svm_classifier,
    train_features, train_label,
    test_features, test_label,
    print_pred=False,
)


## Ensemble search

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

# Five tree-based learners combined by soft voting (averaged class probabilities).
rf1 = RandomForestClassifier(n_estimators=10000, criterion='entropy', n_jobs=-1, random_state=17, max_features=None)
rf2 = RandomForestClassifier(n_estimators=10000, criterion='gini', n_jobs=-1, random_state=17, max_features=None)
# AdaBoost was the only member left unseeded, which made the whole ensemble
# non-reproducible between runs; seed it like the others.
adb = AdaBoostClassifier(n_estimators=10000, random_state=17)
gbc = GradientBoostingClassifier(random_state=17)
xgb = XGBClassifier(seed=17)

eclf = VotingClassifier(
    estimators=[('rf1', rf1), ('rf2', rf2), ('adb', adb), ('gbc', gbc), ('xgb', xgb)],
    voting='soft',
    weights=[1, 1, 1, 1, 1],  # equal say for every member
)

eclf.fit(train_features, train_label)



In [None]:
# Class probabilities from the soft-voting ensemble; the predicted class is the
# column index with the highest probability, computed once and reused below.
y_val_pred = eclf.predict_proba(test_features)
ensemble_classes = np.argmax(y_val_pred, axis=1)

print('\nensemble performance:')
print('f1 score: ', f1_score(test_label, ensemble_classes, average="macro"))
print('precision score:', precision_score(test_label, ensemble_classes, average="macro"))
print('recall score:', recall_score(test_label, ensemble_classes, average="macro"))
print('accuracy score:', accuracy_score(test_label, ensemble_classes))
print(classification_report(test_label, ensemble_classes, target_names=['human', 'bot']))

In [None]:
import pickle
# Persist the fitted ensemble. The `with` block closes (and flushes) the file
# handle; the bare open() used before was never closed.
with open("VotingClassifier.model", 'wb') as model_file:
    pickle.dump(eclf, model_file)

In [None]:
# Reload the ensemble saved above; the `with` block closes the file handle that
# the bare open() used to leave open.
# Security: unpickling runs arbitrary code - only load model files you created yourself.
with open("VotingClassifier.model", 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Hard class predictions from the reloaded model for the test split.
y_val_pred2 = loaded_model.predict(X=test_features)

In [None]:
# These scores come from y_val_pred2, i.e. the model reloaded from
# "VotingClassifier.model"; the header previously said "Random Forest Only",
# which mislabelled the output.
print('\n Reloaded Voting Classifier performance:')
print('f1 score: ', f1_score(test_label, y_val_pred2, average="macro"))
print('precision score:', precision_score(test_label, y_val_pred2, average="macro"))
print('recall score:', recall_score(test_label, y_val_pred2, average="macro"))
print('accuracy score:', accuracy_score(test_label, y_val_pred2))
print(classification_report(test_label, y_val_pred2, target_names=['human', 'bot']))

## Exhaustive Feature Subset Search


In [None]:
from itertools import combinations
# tqdm_notebook is deprecated; tqdm.auto selects the notebook widget when one is available.
from tqdm.auto import tqdm

# Exhaustive search: for every subset of columns to DROP, train a gini and an
# entropy random forest on the remaining columns and remember the model with
# the best F1 score.
# NOTE(review): subsets are scored on the test split itself, so the winning
# score is optimistic - consider scoring on a separate validation split.
count = 0  # not read by the search below
best_features = []          # columns KEPT by the best model so far
best_dropped_columns = ()   # columns DROPPED to obtain best_features
best_f1_score = 0
best_model = None           # stays None if no model scores an F1 above 0

# Positional split: .loc[0:n] is label-based and includes row n, which put the
# first test row into the training data as well.
n_train_rows = len(train_data_df)
all_labels = all_data_df["Is_Bot ( 1 for Bot / 0 for Human)"]
train_label = all_labels.iloc[:n_train_rows]
test_label = all_labels.iloc[n_train_rows:]

candidate_columns = list(all_data_features_after_corr.columns)

for j in tqdm(range(0, len(candidate_columns))):
    for dropped in tqdm(list(combinations(candidate_columns, j))):
        selected_datafeatures = all_data_features_after_corr.drop(columns=list(dropped))

        # Loop-local names: reusing train_features/test_features here used to
        # leave those globals holding the last (smallest) subset after the search.
        subset_train = selected_datafeatures.iloc[:n_train_rows]
        subset_test = selected_datafeatures.iloc[n_train_rows:]

        # gini first, then entropy - the same order in which the two models
        # were compared against the running best before.
        for criterion in ('gini', 'entropy'):
            # Fixed seed so score differences come from the feature subset,
            # not from forest randomness.
            candidate = ensemble.RandomForestClassifier(
                n_estimators=1000,
                criterion=criterion,
                max_features=None,
                n_jobs=8,
                random_state=17,
            )
            candidate.fit(subset_train, train_label)
            candidate_predictions = candidate.predict(subset_test)
            candidate_f1 = f1_score(test_label, candidate_predictions)

            if candidate_f1 > best_f1_score:
                best_f1_score = candidate_f1
                # Previously best_features held the dropped columns; store the
                # kept ones so the name matches the content.
                best_features = list(selected_datafeatures.columns)
                best_dropped_columns = dropped
                best_model = candidate
                print(selected_datafeatures.keys())
                print(classification_report(test_label, candidate_predictions))
    
    

