In [1]:
import pandas as pd

# Load the engineered feature table produced by the earlier pipeline stage.
df = pd.read_csv('twitter_bot_final_features.csv')

# NOTE(review): the emoji/bullet characters in the strings below look
# mis-encoded (UTF-8 bytes decoded as Mac Roman). They are kept unchanged so the
# recorded output still matches; confirm against the original notebook before
# repairing them.

print("="*70)
print("üîç CHECKING WHICH COLUMNS NEED ENCODING")
print("="*70)

# Report the real column count rather than a hard-coded number, so the header
# stays correct if the CSV gains or loses columns.
print(f"\nYour {len(df.columns)} columns:\n")

# Columns flagged as text are collected here so the summary below is driven by
# exactly the same test as the per-column listing.
text_cols = []

for i, col in enumerate(df.columns, 1):
    dtype = df[col].dtype
    unique = df[col].nunique()

    # is_string_dtype is true for `object` columns and for pandas' dedicated
    # string dtypes, so text is recognised regardless of how read_csv stored it.
    if pd.api.types.is_string_dtype(dtype):  # Text columns
        status = "üìù TEXT - Needs encoding"
        text_cols.append(col)
    elif pd.api.types.is_bool_dtype(dtype):  # Boolean (True/False)
        status = "‚úÖ Already numbers (True=1, False=0)"
    else:  # Numbers
        status = "‚úÖ Already numbers"

    print(f"{i:2d}. {col:30s} {str(dtype):10s} ({unique:>6,} unique) {status}")

print("\n" + "="*70)
print("SUMMARY")
print("="*70)

print(f"\nüìù Columns that need encoding: {len(text_cols)}")
for col in text_cols:
    print(f"   ‚Ä¢ {col}")

üîç CHECKING WHICH COLUMNS NEED ENCODING

Your 18 columns:

 1. default_profile                bool       (     2 unique) ‚úÖ Already numbers (True=1, False=0)
 2. default_profile_image          bool       (     2 unique) ‚úÖ Already numbers (True=1, False=0)
 3. description                    object     (29,124 unique) üìù TEXT - Needs encoding
 4. favourites_count               float64    (10,000 unique) ‚úÖ Already numbers
 5. followers_count                int64      ( 5,040 unique) ‚úÖ Already numbers
 6. friends_count                  float64    ( 2,116 unique) ‚úÖ Already numbers
 7. geo_enabled                    bool       (     2 unique) ‚úÖ Already numbers (True=1, False=0)
 8. screen_name                    object     (37,373 unique) üìù TEXT - Needs encoding
 9. statuses_count                 int64      (14,775 unique) ‚úÖ Already numbers
10. verified                       bool       (     2 unique) ‚úÖ Already numbers (True=1, False=0)
11. average_tweets_per_day       

In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load data. Reloading here keeps this cell re-runnable on its own: `df` always
# starts from the CSV contents before the encoded column is added.
df = pd.read_csv('twitter_bot_final_features.csv')


# ============================================================
# STEP 1: ENCODE TARGET VARIABLE (account_type)
# ============================================================
print("\n" + "="*70)
print("STEP 1: Encoding Target Variable 'account_type'")
print("="*70)

# Show before
print("\nBefore encoding:")
print(df['account_type'].value_counts())

# Encode: bot=0, human=1
label_encoder = LabelEncoder()
df['account_type_encoded'] = label_encoder.fit_transform(df['account_type'])

# LabelEncoder numbers classes in sorted order, so bot=0 / human=1 only holds
# while the column contains exactly these two labels. Fail loudly if an extra or
# misspelled label would silently shift the codes that later cells rely on.
expected_classes = ['bot', 'human']
learned_classes = list(label_encoder.classes_)
if learned_classes != expected_classes:
    raise ValueError(
        f"Unexpected account_type labels {learned_classes}; "
        f"expected {expected_classes} so that bot=0 and human=1"
    )

# Show after: one row per distinct label with the code it received.
print("\nAfter encoding:")
print(df[['account_type', 'account_type_encoded']].drop_duplicates().sort_values('account_type'))

print("\nValue counts:")
print(df['account_type_encoded'].value_counts())



STEP 1: Encoding Target Variable 'account_type'

Before encoding:
account_type
human    25005
bot      12420
Name: count, dtype: int64

After encoding:
  account_type  account_type_encoded
0          bot                     0
1        human                     1

Value counts:
account_type_encoded
1    25005
0    12420
Name: count, dtype: int64


In [4]:
# ============================================================
# STEP 2: DROP TEXT COLUMNS (features already extracted)
# ============================================================
# Builds `df_encoded` from `df` without the two free-text columns and without
# the raw string label (its numeric copy, `account_type_encoded`, stays).
print("\n" + "="*70, "STEP 2: Dropping Text Columns", "="*70, sep="\n")

# NOTE(review): the message below names `has_description`, but no column of that
# name shows up in the feature list printed at the end of this notebook —
# confirm whether the wording is still accurate.
print(
    "\nReason: We already extracted features from these columns:",
    "  ‚Ä¢ description ‚Üí has_description, description_has_url",
    "  ‚Ä¢ screen_name ‚Üí name_has_numbers",
    sep="\n",
)

columns_to_drop = ['description', 'screen_name', 'account_type']

# Show how many distinct values each column held before it is removed.
print(f"\nDropping {len(columns_to_drop)} columns:")
for col in columns_to_drop:
    print(f"  ‚ùå {col:20s} ({df[col].nunique():>6,} unique values)")

# `drop` returns a new frame; `df` itself is left untouched.
df_encoded = df.drop(labels=columns_to_drop, axis="columns")

print("\n‚úì Dropped text columns")


STEP 2: Dropping Text Columns

Reason: We already extracted features from these columns:
  ‚Ä¢ description ‚Üí has_description, description_has_url
  ‚Ä¢ screen_name ‚Üí name_has_numbers

Dropping 3 columns:
  ‚ùå description          (29,124 unique values)
  ‚ùå screen_name          (37,373 unique values)
  ‚ùå account_type         (     2 unique values)

‚úì Dropped text columns


In [5]:
# ============================================================
# STEP 3: CONVERT BOOLEAN TO INTEGER (0/1)
# ============================================================
# Every bool-typed column of `df_encoded` is replaced by its integer form.
print("\n" + "="*70, "STEP 3: Converting Boolean Columns to Integers", "="*70, sep="\n")

bool_cols = list(df_encoded.select_dtypes(include='bool').columns)

print(f"\nFound {len(bool_cols)} boolean columns:")
for col in bool_cols:
    print(f"  ‚Ä¢ {col}")

# Cast all boolean columns in one assignment (True/False become 1/0).
df_encoded[bool_cols] = df_encoded[bool_cols].astype(int)

print("\n‚úì Converted: True ‚Üí 1, False ‚Üí 0")


STEP 3: Converting Boolean Columns to Integers

Found 4 boolean columns:
  ‚Ä¢ default_profile
  ‚Ä¢ default_profile_image
  ‚Ä¢ geo_enabled
  ‚Ä¢ verified

‚úì Converted: True ‚Üí 1, False ‚Üí 0


In [6]:
# ============================================================
# STEP 4: VERIFY ALL COLUMNS ARE NUMERIC
# ============================================================
# Lists every column of `df_encoded` with its dtype and distinct-value count,
# flags anything that is not an integer or float column, and reports the total
# number of missing cells.
print("\n" + "="*70)
print("STEP 4: FINAL VERIFICATION")
print("="*70)

print(f"\nFinal shape: {df_encoded.shape}")
print("\nAll columns and their types:\n")

numeric_count = 0
non_numeric = []

for i, col in enumerate(df_encoded.columns, 1):
    dtype = df_encoded[col].dtype
    unique = df_encoded[col].nunique()

    # Accept any integer or float dtype (int8..int64, unsigned, float16..64,
    # nullable Int64/Float64) instead of a fixed whitelist of four dtype names,
    # which would misreport e.g. an int16 column as non-numeric. Boolean columns
    # are still reported as non-numeric, as before.
    is_numeric = (
        pd.api.types.is_integer_dtype(dtype)
        or pd.api.types.is_float_dtype(dtype)
    )

    if is_numeric:
        status = "‚úÖ Numeric"
        numeric_count += 1
    else:
        status = "‚ùå NOT NUMERIC!"
        non_numeric.append(col)

    print(f"{i:2d}. {col:30s} {str(dtype):10s} ({unique:>6,} unique) {status}")

print("\n" + "="*70)
print("VERIFICATION SUMMARY")
print("="*70)
print(f"‚úÖ Numeric columns: {numeric_count}/{len(df_encoded.columns)}")
print(f"‚ùå Non-numeric columns: {len(non_numeric)}")

if not non_numeric:
    print("\nüéâ ALL COLUMNS ARE NUMERIC - READY FOR MACHINE LEARNING!")
else:
    print(f"\n‚ö†Ô∏è Warning: {len(non_numeric)} columns still need encoding:")
    for col in non_numeric:
        print(f"   ‚Ä¢ {col}")

# Check for missing values (total count of null cells across the whole frame).
missing = df_encoded.isnull().sum().sum()
print(f"\nMissing values: {missing}")


STEP 4: FINAL VERIFICATION

Final shape: (37425, 16)

All columns and their types:

 1. default_profile                int64      (     2 unique) ‚úÖ Numeric
 2. default_profile_image          int64      (     2 unique) ‚úÖ Numeric
 3. favourites_count               float64    (10,000 unique) ‚úÖ Numeric
 4. followers_count                int64      ( 5,040 unique) ‚úÖ Numeric
 5. friends_count                  float64    ( 2,116 unique) ‚úÖ Numeric
 6. geo_enabled                    int64      (     2 unique) ‚úÖ Numeric
 7. statuses_count                 int64      (14,775 unique) ‚úÖ Numeric
 8. verified                       int64      (     2 unique) ‚úÖ Numeric
 9. average_tweets_per_day         float64    (32,159 unique) ‚úÖ Numeric
10. account_age_days               int64      ( 4,158 unique) ‚úÖ Numeric
11. follower_friend_ratio          float64    (20,691 unique) ‚úÖ Numeric
12. tweets_per_follower            float64    (30,438 unique) ‚úÖ Numeric
13. profile_completeness   

In [7]:
# ============================================================
# STEP 5: SAVE ENCODED DATASET
# ============================================================
# Writes `df_encoded` to a CSV in the working directory (no index column) and
# echoes the file name together with the frame's dimensions.
print("\n" + "="*70, "STEP 5: SAVING ENCODED DATASET", "="*70, sep="\n")

output_file = 'twitter_bot_encoded.csv'
df_encoded.to_csv(path_or_buf=output_file, index=False)

row_count, column_count = df_encoded.shape
print(
    f"\n‚úÖ Saved to: {output_file}",
    f"‚úÖ Rows: {row_count:,}",
    f"‚úÖ Columns: {column_count}",
    sep="\n",
)


STEP 5: SAVING ENCODED DATASET

‚úÖ Saved to: twitter_bot_encoded.csv
‚úÖ Rows: 37,425
‚úÖ Columns: 16


In [8]:
# ============================================================
# STEP 6: CREATE FEATURE/TARGET SPLIT PREVIEW
# ============================================================
# Names the target column, treats every other column of `df_encoded` as a
# feature, and prints the class counts plus a numbered feature list.
print("\n" + "="*70, "STEP 6: PREPARING FOR MODELING", "="*70, sep="\n")

# Identify target and features
target_col = 'account_type_encoded'
feature_cols = df_encoded.columns[df_encoded.columns != target_col].tolist()

print("\nüéØ TARGET VARIABLE (what we predict):")
print(f"   ‚Ä¢ {target_col}")

# Count rows per encoded class; the 0/1 meaning follows the Step 1 encoding.
target_values = df_encoded[target_col]
bot_count = target_values.eq(0).sum()
human_count = target_values.eq(1).sum()
print(f"     ‚Üí 0 = bot ({bot_count:,} samples)")
print(f"     ‚Üí 1 = human ({human_count:,} samples)")

print(f"\nüìä FEATURE COLUMNS (what we use to predict): {len(feature_cols)}")
for i, col in enumerate(feature_cols, start=1):
    print(f"   {i:2d}. {col}")


STEP 6: PREPARING FOR MODELING

üéØ TARGET VARIABLE (what we predict):
   ‚Ä¢ account_type_encoded
     ‚Üí 0 = bot (12,420 samples)
     ‚Üí 1 = human (25,005 samples)

üìä FEATURE COLUMNS (what we use to predict): 15
    1. default_profile
    2. default_profile_image
    3. favourites_count
    4. followers_count
    5. friends_count
    6. geo_enabled
    7. statuses_count
    8. verified
    9. average_tweets_per_day
   10. account_age_days
   11. follower_friend_ratio
   12. tweets_per_follower
   13. profile_completeness
   14. description_has_url
   15. name_has_numbers
