In [31]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import ADASYN
data = pd.read_csv('./Data/featuresfloatv3.csv')

# x=data.loc[:,['statuses' , 'date_joined' , 'most_recent_post' , 'following' , 'followers' , 'likes', 'retweet' , 'retweeted_count'  ,'avg_tweets_by_hour_of_day', 'avg_tweets_by_day_of_week']]
x=data.iloc[:, :-1]
y = data.account_type.values.tolist()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42,stratify=y)

sm = ADASYN(random_state=42)
x_train,y_train = sm.fit_resample(x_train,y_train)

# scale features
# Create an instance of StandardScaler
scaler = StandardScaler()
# Fit the scaler to your data
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)


# RandomizedSearchCV

In [23]:
RandomForestClassifier().get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

Specify the range of values for some important hyperparameters

In [24]:
n_estimators_list = list(range(10,220,50))
criterion_list = ['gini', 'entropy']
max_depth_list = list(range(5,41,10))
max_depth_list.append(None)
min_samples_split_list = [x/1000 for x in list(range(5, 41, 10))]
min_samples_leaf_list = [x/1000 for x in list(range(5, 41, 10))]
max_features_list = ['sqrt', 'log2']
params_grid = {
    'n_estimators': n_estimators_list,
    'criterion': criterion_list,
    'max_depth': max_depth_list,
    'min_samples_split': min_samples_split_list,
    'min_samples_leaf': min_samples_leaf_list,
    'max_features': max_features_list
}

num_combinations = 1
for k in params_grid.keys(): num_combinations *= len(params_grid[k])

print('Number of combinations = ', num_combinations)
params_grid

Number of combinations =  1600


{'n_estimators': [10, 60, 110, 160, 210],
 'criterion': ['gini', 'entropy'],
 'max_depth': [5, 15, 25, 35, None],
 'min_samples_split': [0.005, 0.015, 0.025, 0.035],
 'min_samples_leaf': [0.005, 0.015, 0.025, 0.035],
 'max_features': ['sqrt', 'log2']}

In [25]:
def my_roc_auc_score(model, x, y): return metrics.roc_auc_score(y, model.predict(x))

model_rf = RandomizedSearchCV(estimator=RandomForestClassifier(class_weight='balanced'),
                              param_distributions=params_grid,
                              n_iter=100,
                              cv=3,
                              scoring=my_roc_auc_score,
                              return_train_score=True,
                              verbose=2)

model_rf.fit(x,y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END criterion=gini, max_depth=5, max_features=sqrt, min_samples_leaf=0.015, min_samples_split=0.005, n_estimators=160; total time=   2.2s
[CV] END criterion=gini, max_depth=5, max_features=sqrt, min_samples_leaf=0.015, min_samples_split=0.005, n_estimators=160; total time=   2.2s
[CV] END criterion=gini, max_depth=5, max_features=sqrt, min_samples_leaf=0.015, min_samples_split=0.005, n_estimators=160; total time=   2.2s
[CV] END criterion=entropy, max_depth=35, max_features=log2, min_samples_leaf=0.035, min_samples_split=0.005, n_estimators=160; total time=   2.5s
[CV] END criterion=entropy, max_depth=35, max_features=log2, min_samples_leaf=0.035, min_samples_split=0.005, n_estimators=160; total time=   2.3s
[CV] END criterion=entropy, max_depth=35, max_features=log2, min_samples_leaf=0.035, min_samples_split=0.005, n_estimators=160; total time=   2.2s
[CV] END criterion=gini, max_depth=5, max_features=log2, min_sample

In [26]:
model_rf.best_params_

{'n_estimators': 110,
 'min_samples_split': 0.005,
 'min_samples_leaf': 0.005,
 'max_features': 'sqrt',
 'max_depth': 35,
 'criterion': 'gini'}

In [27]:


df_cv_results = pd.DataFrame(model_rf.cv_results_)
df_cv_results = df_cv_results[['rank_test_score','mean_test_score','mean_train_score',
                           'param_n_estimators', 'param_min_samples_split','param_min_samples_leaf',
                           'param_max_features', 'param_max_depth','param_criterion']]
df_cv_results.sort_values('rank_test_score', inplace=True)
df_cv_results[:20]


Unnamed: 0,rank_test_score,mean_test_score,mean_train_score,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_features,param_max_depth,param_criterion
53,1,0.845814,0.851212,110,0.005,0.005,sqrt,35.0,gini
68,2,0.845319,0.851127,160,0.005,0.005,sqrt,15.0,entropy
84,3,0.841956,0.846489,210,0.015,0.005,log2,15.0,entropy
87,4,0.841898,0.84814,10,0.005,0.005,log2,,entropy
95,5,0.841268,0.845466,210,0.015,0.005,log2,,gini
59,6,0.84125,0.846532,10,0.005,0.005,log2,35.0,entropy
79,7,0.839963,0.844945,210,0.015,0.005,log2,15.0,gini
54,8,0.83744,0.842544,10,0.015,0.005,log2,15.0,gini
63,9,0.833966,0.83735,210,0.025,0.005,log2,,entropy
74,10,0.833829,0.837065,160,0.025,0.005,sqrt,35.0,entropy


In [28]:

df_cv_results.to_csv('./Data/random_search.csv', index=False)

In [33]:
# {'n_estimators': 110,
#  'min_samples_split': 0.005,
#  'min_samples_leaf': 0.005,
#  'max_features': 'sqrt',
#  'max_depth': 35,
#  'criterion': 'gini'}
model_rf_fin = RandomForestClassifier(class_weight='balanced',
                                      criterion='gini',
                                      max_depth=35,
                                      max_features='sqrt',
                                      min_samples_leaf=0.005,
                                      min_samples_split=0.005,
             
                                      n_estimators=110)
model_rf_fin.fit(x_train,y_train)


y_pred_train = model_rf_fin.predict(x_train)
y_pred_test = model_rf_fin.predict(x_test)
print("\n Training Accuracy score:",metrics.accuracy_score(y_train, y_pred_train))
print("Testing Accuracy score:",metrics.accuracy_score(y_test, y_pred_test))




 Training Accuracy score: 0.8495012175005892
Testing Accuracy score: 0.8532824203260391
