In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/twitter-human-bots-dataset/twitter_human_bots_dataset.csv


In [2]:
# Prepare your file
parent_dir: str = os.path.join('/kaggle', 'input', 'twitter-human-bots-dataset')
dataset_name: str = "twitter_human_bots_dataset.csv"
dataset_path: str = os.path.join(parent_dir, dataset_name)
print(f"Dataset directory: {dataset_path}")

# Generate a Pandas DataFrame
dataset: pd.DataFrame = pd.read_csv(dataset_path, index_col=0)
print(f"Dataset shape {dataset.shape}")

# Take a look to the Data
print(f"Dataset columns: {dataset.columns}")
dataset.head()



Dataset directory: /kaggle/input/twitter-human-bots-dataset/twitter_human_bots_dataset.csv
Dataset shape (37438, 19)
Dataset columns: Index(['created_at', 'default_profile', 'default_profile_image', 'description',
       'favourites_count', 'followers_count', 'friends_count', 'geo_enabled',
       'id', 'lang', 'location', 'profile_background_image_url',
       'profile_image_url', 'screen_name', 'statuses_count', 'verified',
       'average_tweets_per_day', 'account_age_days', 'account_type'],
      dtype='object')


Unnamed: 0,created_at,default_profile,default_profile_image,description,favourites_count,followers_count,friends_count,geo_enabled,id,lang,location,profile_background_image_url,profile_image_url,screen_name,statuses_count,verified,average_tweets_per_day,account_age_days,account_type
0,2016-10-15 21:32:11,False,False,"Blame @xaiax, Inspired by @MakingInvisible, us...",4,1589,4,False,787405734442958848,en,unknown,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/7874121826...,best_in_dumbest,11041,False,7.87,1403,bot
1,2016-11-09 05:01:30,False,False,Photographing the American West since 1980. I ...,536,860,880,False,796216118331310080,en,Estados Unidos,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/8023296328...,CJRubinPhoto,252,False,0.183,1379,human
2,2017-06-17 05:34:27,False,False,Scruffy looking nerf herder and @twitch broadc...,3307,172,594,True,875949740503859204,en,"Los Angeles, CA",http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/1278890453...,SVGEGENT,1001,False,0.864,1159,human
3,2016-07-21 13:32:25,True,False,Wife.Godmother.Friend.Feline Fanatic! Assistan...,8433,517,633,True,756119643622735875,en,"Birmingham, AL",,http://pbs.twimg.com/profile_images/1284884924...,TinkerVHELPK5,1324,False,0.889,1489,human
4,2012-01-15 16:32:35,False,False,Loan coach at @mancity & Aspiring DJ,88,753678,116,True,464781334,en,"England, United Kingdom",http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/9952566258...,JoleonLescott,4202,True,1.339,3138,human


In [3]:
dataset.dropna(subset=['location','profile_image_url'], axis=0, inplace=True)

dataset['description']=dataset['description'].notnull().astype(int)
dataset['profile_background_image_url']= dataset['profile_background_image_url'].notnull().astype(int)

In [4]:
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split



In [5]:
# Feature Scaling
scaler = MinMaxScaler()
numerical_features = ['favourites_count', 'followers_count', 'friends_count', 'statuses_count', 'account_age_days']
dataset[numerical_features] = scaler.fit_transform(dataset[numerical_features])

In [6]:
dataset.columns

Index(['created_at', 'default_profile', 'default_profile_image', 'description',
       'favourites_count', 'followers_count', 'friends_count', 'geo_enabled',
       'id', 'lang', 'location', 'profile_background_image_url',
       'profile_image_url', 'screen_name', 'statuses_count', 'verified',
       'average_tweets_per_day', 'account_age_days', 'account_type'],
      dtype='object')

In [7]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
dataset['default_profile'] = label_encoder.fit_transform(dataset['default_profile'])
dataset['default_profile_image'] = label_encoder.fit_transform(dataset['default_profile_image'])
dataset['geo_enabled'] = label_encoder.fit_transform(dataset['geo_enabled'])
dataset['verified'] = label_encoder.fit_transform(dataset['verified'])


In [8]:
relevant_features = ['default_profile', 'default_profile_image', 'description',
       'favourites_count', 'followers_count', 'friends_count', 'geo_enabled',
       'profile_background_image_url',
       'profile_image_url', 'statuses_count', 'verified',
       'average_tweets_per_day', 'account_age_days', 'account_type'] # Add more features based on your analysis
data_selected = dataset[relevant_features]

In [9]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, BatchNormalization, LeakyReLU
from tensorflow.keras.models import Model

# Define the input shape and noise dimension
noise_dim=126
input_shape = (noise_dim,)  # noise_dim is the dimension of the noise vector
num_features = len(relevant_features)-2  # Number of selected features

# Define the generator architecture
def build_generator(input_shape):
    input_layer = Input(shape=input_shape)
    
    # Fully connected layers
    x = Dense(128)(input_layer)
    x = LeakyReLU(alpha=0.2)(x)
    x = BatchNormalization()(x)
    
    x = Dense(256)(x)
    x = LeakyReLU(alpha=0.2)(x)
    x = BatchNormalization()(x)
    
    x = Dense(512)(x)
    x = LeakyReLU(alpha=0.2)(x)
    x = BatchNormalization()(x)
    
    # Output layer
    output_layer = Dense(num_features, activation='linear')(x)  # Linear activation for numerical features
    
    return Model(input_layer, output_layer)

# Build the generator model
generator = build_generator(input_shape)
generator.summary()


caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 126)]             0         
                                                                 
 dense (Dense)               (None, 128)               16256     
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 128)               0         
                                                                 
 batch_normalization (BatchN  (None, 128)              512       
 ormalization)                                                   
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 leaky_re_lu_1 (LeakyReLU)   (None, 256)               0         
                                                             

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LeakyReLU

input_shape_dis=(12,)
# Define the discriminator architecture
def build_discriminator(input_shape):
    model = Sequential()
    model.add(Dense(128, input_shape=input_shape))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(64))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(1, activation='sigmoid'))  # Binary classification
    
    return model

# Build the discriminator model
discriminator = build_discriminator(input_shape_dis)
discriminator.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
discriminator.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 128)               1664      
                                                                 
 leaky_re_lu_3 (LeakyReLU)   (None, 128)               0         
                                                                 
 dense_5 (Dense)             (None, 64)                8256      
                                                                 
 leaky_re_lu_4 (LeakyReLU)   (None, 64)                0         
                                                                 
 dense_6 (Dense)             (None, 1)                 65        
                                                                 
Total params: 9,985
Trainable params: 9,985
Non-trainable params: 0
_________________________________________________________________


In [11]:
data_selected

Unnamed: 0,default_profile,default_profile_image,description,favourites_count,followers_count,friends_count,geo_enabled,profile_background_image_url,profile_image_url,statuses_count,verified,average_tweets_per_day,account_age_days,account_type
0,0,0,1,0.000005,1.306298e-05,9.210096e-07,0,1,http://pbs.twimg.com/profile_images/7874121826...,0.003983,0,7.870,0.196791,bot
1,0,0,1,0.000606,7.069957e-06,2.026221e-04,0,1,http://pbs.twimg.com/profile_images/8023296328...,0.000091,0,0.183,0.191658,human
2,0,0,1,0.003736,1.413991e-06,1.367699e-04,1,1,http://pbs.twimg.com/profile_images/1278890453...,0.000361,0,0.864,0.144599,human
3,1,0,1,0.009527,4.250195e-06,1.457498e-04,1,0,http://pbs.twimg.com/profile_images/1284884924...,0.000478,0,0.889,0.215187,human
4,0,0,1,0.000099,6.195897e-03,2.670928e-05,1,1,http://pbs.twimg.com/profile_images/9952566258...,0.001516,1,1.339,0.567914,human
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37433,1,0,1,0.000735,1.142702e-06,2.544289e-04,0,1,http://pbs.twimg.com/profile_images/5844684414...,0.000123,0,0.084,0.758289,human
37434,0,0,1,0.009986,9.219602e-03,1.393027e-04,1,1,http://pbs.twimg.com/profile_images/9511349456...,0.009008,1,8.976,0.491765,human
37435,1,0,1,0.000451,7.039540e-04,4.374796e-05,0,1,http://pbs.twimg.com/profile_images/1174733822...,0.002227,1,2.226,0.489840,human
37436,0,0,1,0.001093,1.134481e-06,3.822190e-05,1,1,http://pbs.twimg.com/profile_images/1265807908...,0.000354,0,0.339,0.516791,human


In [12]:
data_selected.drop(columns='account_type', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_selected.drop(columns='account_type', inplace=True)


In [13]:
X_train=data_selected


In [14]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
dataset['account_type'] = label_encoder.fit_transform(dataset['account_type'])

In [15]:
y_train=dataset['account_type']

In [16]:
X_train.drop(columns='profile_image_url', inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.drop(columns='profile_image_url', inplace=True)


In [17]:
batch_size = 38  # You can adjust based on your memory capacity
num_epochs = 50  # Start with a reasonable number and adjust as needed

# Train the discriminator
discriminator.fit(X_train, y_train, batch_size=batch_size, epochs=num_epochs)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7da2caefbac0>

In [18]:
X_train.values[np.random.randint(0, X_train.shape[0], batch_size)]

array([[0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 9.35802143e-03,
        2.30842321e-05, 4.63728339e-04, 1.00000000e+00, 1.00000000e+00,
        1.78050514e-02, 0.00000000e+00, 1.19940000e+01, 7.76898396e-01],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.09674475e-03,
        1.59485080e-06, 1.18119483e-04, 1.00000000e+00, 1.00000000e+00,
        1.52854891e-03, 0.00000000e+00, 1.49800000e+00, 5.01604278e-01],
       [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 1.55910534e-04,
        3.28835216e-08, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        9.59627116e-05, 0.00000000e+00, 1.84000000e-01, 2.05775401e-01],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 1.11227479e-02,
        3.91757834e-04, 1.60025420e-04, 1.00000000e+00, 1.00000000e+00,
        5.65855313e-03, 1.00000000e+00, 7.35000000e+00, 3.53155080e-01],
       [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 6.93688900e-04,
        2.98417958e-06, 3.99487919e-04, 1.00000000e+00, 1.00

In [19]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LeakyReLU
from tensorflow.keras.optimizers import Adam

# Define GAN hyperparameters
noise_dim = 126  # Dimension of the noise vector
batch_size = 64
num_epochs = 700


# Build the GAN
discriminator.trainable = False  # Freeze discriminator during GAN training
gan_input = tf.keras.layers.Input(shape=(noise_dim,))
gan_output = discriminator(generator(gan_input))
gan = Model(gan_input, gan_output)
gan.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.0002, beta_1=0.5))

# Training loop
for epoch in range(num_epochs):
    for _ in range(batch_size):
        # Train discriminator
        real_data = X_train.values[np.random.randint(0, X_train.shape[0], batch_size)]
        generated_data = generator.predict(np.random.rand(batch_size, noise_dim))
        discriminator_loss_real = discriminator.train_on_batch(real_data, np.ones((batch_size, 1)))
        discriminator_loss_generated = discriminator.train_on_batch(generated_data, np.zeros((batch_size, 1)))
        discriminator_loss = 0.5 * np.add(discriminator_loss_real, discriminator_loss_generated)
        
        # Train generator
        noise = np.random.rand(batch_size, noise_dim)
        generator_loss = gan.train_on_batch(noise, np.ones((batch_size, 1)))
        
        print(f"Epoch: {epoch}, Disc Loss: {discriminator_loss[0]}, Gen Loss: {generator_loss}")


Epoch: 0, Disc Loss: 2.295705407857895, Gen Loss: 166.45785522460938
Epoch: 0, Disc Loss: 1.2856257259845734, Gen Loss: 150.31248474121094
Epoch: 0, Disc Loss: 1.9466807544231415, Gen Loss: 151.85206604003906
Epoch: 0, Disc Loss: 2.4633578956127167, Gen Loss: 128.17047119140625
Epoch: 0, Disc Loss: 1.5512919425964355, Gen Loss: 137.60372924804688
Epoch: 0, Disc Loss: 1.1148696541786194, Gen Loss: 111.75584411621094
Epoch: 0, Disc Loss: 0.4785310924053192, Gen Loss: 121.10173034667969
Epoch: 0, Disc Loss: 0.1496475338935852, Gen Loss: 112.0515365600586
Epoch: 0, Disc Loss: 0.16490855533629656, Gen Loss: 100.10647583007812
Epoch: 0, Disc Loss: 0.10122919408604503, Gen Loss: 89.88333892822266
Epoch: 0, Disc Loss: 0.08012364199385047, Gen Loss: 107.33030700683594
Epoch: 0, Disc Loss: 0.08552390546537936, Gen Loss: 115.37974548339844
Epoch: 0, Disc Loss: 0.10641582624521106, Gen Loss: 94.74995422363281
Epoch: 0, Disc Loss: 0.084164765663445, Gen Loss: 107.47816467285156
Epoch: 0, Disc Loss:

In [20]:
# Generate synthetic data
num_samples_to_generate = 1000  # Number of synthetic data samples to generate

# Generate noise
noise = np.random.rand(num_samples_to_generate, noise_dim)

# Generate synthetic data using the generator
generated_data = generator.predict(noise)

# Post-process the generated data if needed (e.g., scale back to original range)
# generated_data = ...  # Apply reverse preprocessing

# Now you have the generated synthetic data ready to be used




In [21]:
generated_data[0]

array([2.0750068e-02, 3.9901361e-03, 9.8506689e-01, 2.1108121e-02,
       3.8021253e-03, 1.3242653e-03, 6.1940634e-01, 9.9158931e-01,
       1.4155250e-02, 9.8078048e-01, 1.1347284e+01, 6.1491144e-01],
      dtype=float32)