In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


# ============================================================
# STEP 1: LOAD ENCODED DATA
# ============================================================
# Reads 'twitter_bot_encoded.csv' (path relative to the working directory)
# into `df` and reports its dimensions.
print("\nSTEP 1: Loading encoded data...")

df = pd.read_csv('twitter_bot_encoded.csv')

# The status marker had been stored as mojibake (the UTF-8 bytes of the check
# mark decoded as cp1252); it is restored to the intended character here.
print("✓ Data loaded")
print(f"  Shape: {df.shape}")
print(f"  Rows: {len(df):,}")
print(f"  Columns: {len(df.columns)}")

# ============================================================
# STEP 2: SEPARATE FEATURES (X) AND TARGET (y)
# ============================================================
# Splits `df` into the label series `y` and the feature frame `X`
# (every column except the target).
print("\n" + "="*70)
print("STEP 2: Separating Features and Target")
print("="*70)

# Target variable (what we want to predict)
target_column = 'account_type_encoded'

# Fail early with an explicit message if the expected label column is absent;
# the exception type (KeyError) is the same one the lookup below would raise.
if target_column not in df.columns:
    raise KeyError(f"Target column {target_column!r} not found in the loaded data")

y = df[target_column]

# Features (everything except target)
X = df.drop(columns=[target_column])

# Heading emoji were stored as mojibake; restored to the intended characters.
print("\n🎯 Target variable (y):")
print(f"  Column: {target_column}")
print(f"  Shape: {y.shape}")
# NOTE: this label mapping is a hard-coded description, not derived from the data.
print("  Values: 0 (bot), 1 (human)")

print("\n📊 Features (X):")
print(f"  Shape: {X.shape}")
print(f"  Number of features: {X.shape[1]}")

# Numbered listing of every feature column, starting at 1.
print("\n  Feature columns:")
for i, col in enumerate(X.columns, 1):
    print(f"    {i:2d}. {col}")

# ============================================================
# STEP 3: CHECK CLASS DISTRIBUTION
# ============================================================
# Reports how many rows of `y` carry each label, both as raw counts and as
# percentages of the full dataset, plus the human-to-bot ratio.
print("\n" + "=" * 70)
print("STEP 3: Checking Class Distribution")
print("=" * 70)

print("\nOriginal dataset distribution:")
print(y.value_counts().sort_index())

# Per-class counts: label 0 is reported as "Bot", label 1 as "Human".
bot_count = y.eq(0).sum()
human_count = y.eq(1).sum()

# Share of each class relative to all rows, expressed in percent.
bot_percent = (bot_count / len(y)) * 100
human_percent = (human_count / len(y)) * 100

# Assemble the three report lines first, then emit them in one write.
distribution_report = [
    f"\n  Bot (0):   {bot_count:>6,} ({bot_percent:>5.2f}%)",
    f"  Human (1): {human_count:>6,} ({human_percent:>5.2f}%)",
    f"  Ratio:     {human_count/bot_count:.2f}:1 (Human:Bot)",
]
print("\n".join(distribution_report))

# ============================================================
# STEP 4: PERFORM TRAIN-TEST SPLIT
# ============================================================
# Produces X_train / X_test / y_train / y_test from X and y and reports the
# shape and share of each partition.
print("\n" + "="*70)
print("STEP 4: Splitting Data (80% Train, 20% Test)")
print("="*70)

# Split: 80% training, 20% testing
# stratify=y ensures same bot/human ratio in both sets
# random_state=42 makes results reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,      # 20% for testing
    random_state=42,     # for reproducibility
    stratify=y           # keep same class ratio
)

# Report the shares actually produced by the split instead of hard-coded
# "80%" / "20%" labels, so the printout stays truthful if test_size changes.
train_pct = len(X_train) / len(X) * 100
test_pct = len(X_test) / len(X) * 100

# Status and heading glyphs were stored as mojibake; restored here.
print("\n✓ Split complete")
print("\n📊 Training Set:")
print(f"  Features (X_train): {X_train.shape}")
print(f"  Target (y_train):   {y_train.shape}")
print(f"  Total samples:      {len(X_train):,} ({train_pct:.0f}%)")

print("\n📊 Test Set:")
print(f"  Features (X_test):  {X_test.shape}")
print(f"  Target (y_test):    {y_test.shape}")
print(f"  Total samples:      {len(X_test):,} ({test_pct:.0f}%)")

# ============================================================
# STEP 5: VERIFY STRATIFICATION (same ratio in train/test)
# ============================================================
# Prints the per-class counts and percentages of y_train and y_test, then
# compares the two distributions before claiming they match.
print("\n" + "="*70)
print("STEP 5: Verifying Stratification")
print("="*70)

print("\nTraining set distribution:")
train_bot = (y_train == 0).sum()
train_human = (y_train == 1).sum()
train_bot_pct = train_bot / len(y_train) * 100
train_human_pct = train_human / len(y_train) * 100
print(f"  Bot (0):   {train_bot:>6,} ({train_bot_pct:>5.2f}%)")
print(f"  Human (1): {train_human:>6,} ({train_human_pct:>5.2f}%)")

print("\nTest set distribution:")
test_bot = (y_test == 0).sum()
test_human = (y_test == 1).sum()
test_bot_pct = test_bot / len(y_test) * 100
test_human_pct = test_human / len(y_test) * 100
print(f"  Bot (0):   {test_bot:>6,} ({test_bot_pct:>5.2f}%)")
print(f"  Human (1): {test_human:>6,} ({test_human_pct:>5.2f}%)")

# Previously the success line was printed unconditionally, so this step never
# verified anything. Compare the class shares and only report success when
# they agree to within one percentage point.
ratio_tolerance_pct = 1.0
ratio_gap_pct = max(
    abs(train_bot_pct - test_bot_pct),
    abs(train_human_pct - test_human_pct),
)

if ratio_gap_pct <= ratio_tolerance_pct:
    print("\n✓ Class ratios are preserved in both sets!")
else:
    print(
        f"\n⚠ Class ratios differ by {ratio_gap_pct:.2f} percentage points "
        "between train and test sets - check the stratify argument of the split."
    )

# ============================================================
# STEP 6: SAVE SPLIT DATASETS (OPTIONAL)
# ============================================================
# Re-attaches the label column to each feature partition and writes the two
# partitions to 'train_data.csv' and 'test_data.csv' in the working directory.
print("\n" + "="*70)
print("STEP 6: Saving Split Datasets")
print("="*70)

# Use `target_column` rather than re-typing the literal column name, so the
# saved files stay consistent with the column chosen when X and y were built.
# The assignment aligns on the row index, which each label series shares with
# its feature frame because both come from the same split call.

# Save train set
train_df = X_train.copy()
train_df[target_column] = y_train
train_df.to_csv('train_data.csv', index=False)
print(f"✓ Saved: train_data.csv ({len(train_df):,} rows)")

# Save test set
test_df = X_test.copy()
test_df[target_column] = y_test
test_df.to_csv('test_data.csv', index=False)
print(f"✓ Saved: test_data.csv ({len(test_df):,} rows)")

# ============================================================
# FINAL SUMMARY
# ============================================================
# Recaps the sizes of the original data and both partitions, the files
# written, and the variables left in the kernel namespace.
# All bullets, check marks and emoji below had been stored as mojibake
# (UTF-8 decoded as cp1252) and are restored to the intended characters.
print("\n" + "="*70)
print("✅ TRAIN-TEST SPLIT COMPLETE!")
print("="*70)

# Shares are derived from the actual partition sizes rather than hard-coded,
# so the summary cannot drift from the split that was really performed.
summary_train_pct = len(X_train) / len(df) * 100
summary_test_pct = len(X_test) / len(df) * 100

print("\nSummary:")
print(f"  • Original data:     {len(df):>6,} samples")
print(f"  • Training set:      {len(X_train):>6,} samples ({summary_train_pct:.0f}%)")
print(f"  • Test set:          {len(X_test):>6,} samples ({summary_test_pct:.0f}%)")
print(f"  • Features:          {X_train.shape[1]:>6,} columns")
print("  • Stratification:    ✓ Class ratios preserved")

print("\nFiles created:")
print("  ✓ train_data.csv - For training models")
print("  ✓ test_data.csv  - For evaluating models")

print("\nVariables in memory:")
print("  • X_train - Training features")
print("  • X_test  - Test features")
print("  • y_train - Training labels")
print("  • y_test  - Test labels")

print("\n🚀 READY FOR MODEL TRAINING!")
print("="*70)


STEP 1: Loading encoded data...
✓ Data loaded
  Shape: (37425, 16)
  Rows: 37,425
  Columns: 16

STEP 2: Separating Features and Target

🎯 Target variable (y):
  Column: account_type_encoded
  Shape: (37425,)
  Values: 0 (bot), 1 (human)

📊 Features (X):
  Shape: (37425, 15)
  Number of features: 15

  Feature columns:
     1. default_profile
     2. default_profile_image
     3. favourites_count
     4. followers_count
     5. friends_count
     6. geo_enabled
     7. statuses_count
     8. verified
     9. average_tweets_per_day
    10. account_age_days
    11. follower_friend_ratio
    12. tweets_per_follower
    13. profile_completeness
    14. description_has_url
    15. name_has_numbers

STEP 3: Checking Class Distribution

Original dataset distribution:
account_type_encoded
0    12420
1    25005
Name: count, dtype: int64

  Bot (0):   12,420 (33.19%)
  Human (1): 25,005 (66.81%)
  Ratio:     2.01:1 (Human:Bot)

STEP 4: Splitting Data (80% Train, 20% Test)

âœ“ Split com