# Decision Tree with K-fold cross validation

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

Defining functions that are used within the notebook.

In [14]:
# Feature importance and selection
def plot_feature_importance(df_train, df_test, max_tree_depth, title='Feature importance', random_state=None):
    '''Fit a decision tree and plot the importance of every feature it used.

    Parameters
    ----------
    df_train: pandas.DataFrame
        Feature matrix; its column names are used as the feature names.
    df_test: array-like
        Target labels, passed as ``y`` when fitting. Despite the name this is
        not a test set.
    max_tree_depth: int
        ``max_depth`` of the fitted DecisionTreeClassifier.
    title: str, default='Feature importance'
        Title of the bar chart.
    random_state: int or None, default=None
        Forwarded to DecisionTreeClassifier. Pass an integer to make the
        fitted tree, and therefore the reported importances, reproducible.

    Returns
    -------
    list
        Names of the features whose importance in the fitted tree is exactly 0.
        Nothing is removed from ``df_train``; the caller decides what to drop.
    '''
    clf = DecisionTreeClassifier(max_depth=max_tree_depth, random_state=random_state)
    clf.fit(df_train, df_test)

    importance_df = pd.DataFrame({
        "feature_importance": clf.feature_importances_,
        "feature_name": list(df_train.columns),
    }).sort_values(by='feature_importance', ascending=False)

    # Features the tree never split on carry no information for this model;
    # they are reported separately and left out of the (log-scaled) chart.
    is_unused = importance_df['feature_importance'] == 0
    useless_features = list(importance_df.loc[is_unused, 'feature_name'])
    used_df = importance_df[~is_unused]

    fig = px.bar(used_df, x="feature_name", y="feature_importance", log_y=True, title=title)
    fig.show(width=900, height=500)

    print("The following features were dropped:")
    print(useless_features)
    return useless_features
   

# K-fold cross validation
def cross_validation(model, _X, _y, _cv=10):
    '''Run K-fold cross-validation and collect per-fold and mean scores.

    Parameters
    ----------
    model: estimator
        The machine learning algorithm to be used for training.
    _X: array
        The matrix of features.
    _y: array
        The target variable.
    _cv: int, default=10
        Number of folds, forwarded as ``cv`` to ``cross_validate``.

    Returns
    -------
    dict
        Per-fold score arrays and their means for 'accuracy', 'precision',
        'recall' and 'f1', for both the training and the validation split.
        Note that the two mean accuracy entries are multiplied by 100
        (percentages), while every other mean is left as a fraction.
    '''
    _scoring = ['accuracy', 'precision', 'recall', 'f1']
    results = cross_validate(estimator=model,
                             X=_X,
                             y=_y,
                             cv=_cv,
                             scoring=_scoring,
                             return_train_score=True)

    train_accuracy = results['train_accuracy']
    train_precision = results['train_precision']
    train_recall = results['train_recall']
    train_f1 = results['train_f1']
    # cross_validate reports the held-out fold under the 'test_' prefix.
    val_accuracy = results['test_accuracy']
    val_precision = results['test_precision']
    val_recall = results['test_recall']
    val_f1 = results['test_f1']

    return {"Training Accuracy scores": train_accuracy,
            "Mean Training Accuracy": train_accuracy.mean()*100,
            "Training Precision scores": train_precision,
            "Mean Training Precision": train_precision.mean(),
            "Training Recall scores": train_recall,
            "Mean Training Recall": train_recall.mean(),
            "Training F1 scores": train_f1,
            "Mean Training F1 Score": train_f1.mean(),
            "Validation Accuracy scores": val_accuracy,
            "Mean Validation Accuracy": val_accuracy.mean()*100,
            "Validation Precision scores": val_precision,
            "Mean Validation Precision": val_precision.mean(),
            "Validation Recall scores": val_recall,
            "Mean Validation Recall": val_recall.mean(),
            "Validation F1 scores": val_f1,
            "Mean Validation F1 Score": val_f1.mean()
            }

# Grouped Bar Chart for both training and validation data
def _ordinal(number):
    '''Return ``number`` with its English ordinal suffix, e.g. 1 -> '1st'.'''
    # 11th, 12th and 13th are the exceptions to the last-digit rule.
    if 10 <= number % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(number % 10, 'th')
    return '{}{}'.format(number, suffix)


def plot_result(x_label, y_label, plot_title, train_data, val_data):
    '''Plot a grouped bar chart of the training and validation score per fold.

    Parameters
    ----------
    x_label: str
        Label of the x axis, e.g. the name of the algorithm ('Decision Tree').
    y_label: str
        Label of the y axis, e.g. the name of the metric ('Accuracy').
    plot_title: str
        Title of the plot, e.g. 'Accuracy Plot'.
    train_data: list, array
        One training score per fold.
    val_data: list, array
        One validation score per fold; must have the same length as
        ``train_data``.

    Returns
    -------
    None
        The chart is shown with ``plt.show()``.

    Raises
    ------
    ValueError
        If ``train_data`` and ``val_data`` do not contain the same number of
        folds.
    '''
    n_folds = len(train_data)
    if len(val_data) != n_folds:
        raise ValueError(
            "train_data and val_data must have one score per fold, got {} and {}".format(
                n_folds, len(val_data)))

    # One tick per fold, whatever K was used for the cross-validation.
    labels = ['{} Fold'.format(_ordinal(fold)) for fold in range(1, n_folds + 1)]
    X_axis = np.arange(n_folds)

    # Set size of plot
    plt.figure(figsize=(12, 6))
    plt.ylim(0.40000, 1)
    # The two bars of each fold sit side by side around the tick position.
    plt.bar(X_axis - 0.2, train_data, 0.4, color='blue', label='Training')
    plt.bar(X_axis + 0.2, val_data, 0.4, color='red', label='Validation')
    plt.title(plot_title, fontsize=30)
    plt.xticks(X_axis, labels)
    plt.xlabel(x_label, fontsize=14)
    plt.ylabel(y_label, fontsize=14)
    plt.legend()
    plt.grid(True)
    plt.show()

# putting it all together
def fold_classify_and_plot(df_train, df_test, max_tree_depth, title='Feature importance'):
    '''Fit a decision tree, plot its feature importances and report unused features.

    This is a thin wrapper around ``plot_feature_importance``; it previously
    carried a line-for-line copy of that function's body.

    NOTE(review): the name suggests K-fold classification plus plotting of the
    fold results, but no cross-validation is performed here. Confirm whether
    this was meant to combine ``cross_validation`` and ``plot_result``.

    Parameters
    ----------
    df_train: pandas.DataFrame
        Feature matrix; its column names are used as the feature names.
    df_test: array-like
        Target labels, passed as ``y`` when fitting.
    max_tree_depth: int
        ``max_depth`` of the fitted DecisionTreeClassifier.
    title: str, default='Feature importance'
        Title of the bar chart.

    Returns
    -------
    list
        Names of the features whose importance in the fitted tree is exactly 0.
    '''
    return plot_feature_importance(df_train, df_test, max_tree_depth, title=title)

### Data Pre-processing

First we load our dataset, change the categorical "lang" column to numbers, then take a look at our columns.

In [3]:
# Load the cleaned per-user dataset (path is relative to the notebook's working directory).
df = pd.read_csv(
    filepath_or_buffer="dataset/users_df_dataset_cleaned_with_indicators.csv",
)

In [4]:
# Distinct values of the "lang" column in sorted order.
var = list(df['lang'].unique())
var.sort()
var

['Select Language...',
 'ar',
 'da',
 'de',
 'el',
 'en',
 'en-AU',
 'en-GB',
 'es',
 'fil-PH',
 'fr',
 'id',
 'it',
 'ja',
 'ko',
 'nl',
 'pl',
 'pt',
 'ru',
 'sv',
 'tr',
 'xx-lc',
 'zh-CN',
 'zh-TW']

In [5]:
# Encode every language code as its position in the sorted list `var`.
mapping = {lang_code: position for position, lang_code in enumerate(var)}
df['lang_num'] = df['lang'].map(mapping).astype(int)
# The original string column is no longer needed once it has been encoded.
df = df.drop(columns='lang')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11109 entries, 0 to 11108
Data columns (total 33 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   user_id                                   11109 non-null  int64  
 1   name                                      11108 non-null  object 
 2   bot                                       11109 non-null  int64  
 3   created_at                                11109 non-null  object 
 4   statuses_count                            11109 non-null  int64  
 5   account_age_in_days                       11109 non-null  int64  
 6   number_of_tweets                          11109 non-null  int64  
 7   account_average_tweets_per_day            11109 non-null  float64
 8   avg_tweets_per_actual_day                 11109 non-null  float64
 9   day_with_most_tweets                      11109 non-null  object 
 10  max_number_of_tweets_in_a_day     

We drop the columns that are neither numerical nor categorical from our dataset, as they are of less interest to us when using the Decision Tree classification model.

In [6]:
# Separate the target ("bot") from the features first ...
label = df.pop("bot")
# ... then remove the columns that are not used as model inputs.
columns_to_drop = ["user_id", "name", "created_at", "day_with_most_tweets"]
df = df.drop(columns=columns_to_drop)

Let's take a look at our values.

In [7]:
# Summary statistics with one row per feature.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
statuses_count,11109.0,486.128364,1240.441,0.0,41.0,68.0,81.0,7226.0
account_age_in_days,11109.0,1827.599874,555.0895,891.0,1312.0,1702.0,2080.0,3901.0
number_of_tweets,11109.0,941.636241,1199.367,1.0,67.0,132.0,1703.0,3668.0
account_average_tweets_per_day,11109.0,0.262958,0.6808833,0.0,0.02247191,0.03508772,0.058267,6.792821
avg_tweets_per_actual_day,11109.0,13.376648,37.97705,1.0,1.846154,4.777778,8.058511,1759.5
max_number_of_tweets_in_a_day,11109.0,52.172383,87.94356,1.0,6.0,18.0,75.0,2131.0
entropy_for_day,11109.0,2.795355,1.460038,0.0,1.426345,2.921107,3.710199,5.706424
entropy_for_hour,11109.0,1.995941,1.086827,0.0,1.032784,2.234517,2.789275,5.705947
entropy_for_minute,11109.0,1.017305,0.6785934,0.0,0.4854608,0.7215676,1.856961,4.017732
avg_hashtags,11109.0,0.156578,0.2333411,0.0,0.05,0.08571429,0.173913,6.44105


Since some of our values differ by many orders of magnitude, we must standardize our data. We use scikit-learn's `StandardScaler` for this job and scale only the non-categorical columns.

In [8]:
# Pick the categorical column by name instead of relying on it being the last
# column of the frame; every other remaining column is treated as numeric.
categoric_col_names = ["lang_num"]
assert set(categoric_col_names) <= set(df.columns), "expected the encoded 'lang_num' column in df"
numeric_col_names = [col for col in df.columns if col not in categoric_col_names]

In [9]:
# Fit the scaler on the numeric columns and replace them with their
# standardized values in one step; the fitted scaler is kept for later use.
standard_scaler = StandardScaler()
df[numeric_col_names] = standard_scaler.fit_transform(df[numeric_col_names])

In [10]:
# Summary statistics of the features after scaling, one row per feature.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
statuses_count,11109.0,-1.3431810000000001e-17,1.000045,-0.391917,-0.358863,-0.337096,-0.326615,5.433691
account_age_in_days,11109.0,-9.434247000000001e-17,1.000045,-1.687371,-0.928901,-0.22628,0.454722,3.735422
number_of_tweets,11109.0,-2.750323e-17,1.000045,-0.784313,-0.729281,-0.675083,0.634833,2.273271
account_average_tweets_per_day,11109.0,6.3961e-18,1.000045,-0.386219,-0.353213,-0.334684,-0.300639,9.590714
avg_tweets_per_actual_day,11109.0,6.8758070000000005e-18,1.000045,-0.325913,-0.303631,-0.226433,-0.140042,45.980453
max_number_of_tweets_in_a_day,11109.0,-7.67532e-18,1.000045,-0.581904,-0.525047,-0.388589,0.259583,23.639265
entropy_for_day,11109.0,8.826618000000001e-17,1.000045,-1.914663,-0.937696,0.086134,0.626618,1.993922
entropy_for_hour,11109.0,-1.624609e-16,1.000045,-1.836567,-0.88625,0.219526,0.729987,3.413765
entropy_for_minute,11109.0,-7.291554000000001e-17,1.000045,-1.499206,-0.783781,-0.43583,1.237403,4.421738
avg_hashtags,11109.0,-2.6863620000000002e-17,1.000045,-0.671058,-0.456769,-0.303706,0.074293,26.933767


### Feature Selection / Dimensionality reduction

Now that our data is standardized, we can take a look at which features provide the most information to our decision-making process.

In [11]:
# Feature importances of a tree that may grow up to 10 levels deep.
useless_features = plot_feature_importance(
    df_train=df,
    df_test=label,
    max_tree_depth=10,
)

The following features were dropped:
['mean_inactive_period_length_in_seconds']


['mean_inactive_period_length_in_seconds']

In [12]:
# Same analysis with a shallower tree (at most 5 levels).
useless_features = plot_feature_importance(
    df_train=df,
    df_test=label,
    max_tree_depth=5,
)

The following features were dropped:
['total_replies', 'total_likes', 'avg_text_length', 'account_discussion_creation_ratio', 'entropy_for_minute', 'entropy_for_day', 'max_number_of_tweets_in_a_day', 'avg_tweets_per_actual_day', 'mean_inactive_period_length_in_seconds', 'mode_inactive_period_length_in_seconds', 'mode_count', 'statuses_count']


With a shallower decision tree, we can see that more features end up with zero importance and will be dropped.

## Applying the Decision tree classification method and the K-fold cross validation

We split the dataset into K folds, starting with a typical K of 10.

['total_replies',
 'total_likes',
 'avg_text_length',
 'account_discussion_creation_ratio',
 'entropy_for_minute',
 'entropy_for_day',
 'max_number_of_tweets_in_a_day',
 'avg_tweets_per_actual_day',
 'mean_inactive_period_length_in_seconds',
 'mode_inactive_period_length_in_seconds',
 'mode_count',
 'statuses_count']