# **<span style='color:Yellow'>Getting Data into format </span>**

In [11]:
import pandas as pd
import random
import re
from faker import Faker

fake = Faker()

# A large variety of dummy posts with clean and suspicious content
posts = [
    "Looking to score some 420?", "Got some weed?", "Life is amazing!", "Selling pure MDMA!",
    "Hit me up for some herbs!", "Anyone need Xanax?", "Just chilling at home", 
    "Contact me for top-notch products!", "Fresh herbs available", "I have a variety of pills for sale",
    "Feeling great today!", "CBD oils in stock", "Let's party with some molly!", "Buy legal herbs",
    "Feeling amazing!", "Need some painkillers?", "2 pills ready for delivery", 
    "Chillin' with friends!", "Selling weed in bulk", "Holla at me for the goods!", 
    "Grab some pills from me", "I'm the plug for 420", "Order your goodies now!", 
    "Pure MDMA available", "Get your hands on some Xanax!", "Happy vibes only", 
    "Medicinal herbs on sale", "Hook me up with some CBD", "Feeling the high!", "Need some Molly?", 
    "Best deals on party pills", "Hit me up for the stash", "Stocking up on legal weed!", 
    "Available for quick delivery", "Let's roll with some ecstasy", "Anyone need party stuff?", 
    "Legal CBD products in stock", "Herbs for sale, DM for prices", "Good times ahead!", 
    "Hit me up for premium products", "2 pills left!", "Order 420 now", 
    "Feel the vibe with these pills", "Best prices for medicinal herbs", "Selling CBD oil", 
    "Best place to get your stash!", "420 delivery in 30 mins", "Top quality ecstasy available!", 
    "Best deal on CBD oils", "Buy some pills, message me", "Enjoy your day with 420!", 
    "Hit me up for MDMA!", "Selling weed at low prices", "Need Xanax for the weekend?", 
    "Chillin' with 420!", "Best deals on herbs", "Molly available in bulk", "Contact for good prices!"
]

# Function to generate random IPs (VPN or real)
def generate_ip(vpn=False):
    if vpn:
        return fake.ipv4_private()
    else:
        return fake.ipv4_public()

# Generate a larger dummy dataset
data = []
for i in range(1000):  # Creating 1000 posts
    post = random.choice(posts)
    user_id = fake.uuid4()
    account_creation_date = fake.date_this_decade()
    ip_address = generate_ip(vpn=random.choice([True, False]))  # 50% chance of VPN
    location = fake.city() if not re.match(r'10\.\d+\.\d+\.\d+|192\.\d+\.\d+\.\d+', ip_address) else "Unknown"
    post_datetime = fake.date_time_this_year()
    label = 1 if re.search(r'weed|pills|herbs|mdma|420|xanax|molly|ecstasy|cbd', post, re.I) else 0
    
    data.append([i, user_id, post, account_creation_date, ip_address, location, post_datetime, label])

# Create a DataFrame
df = pd.DataFrame(data, columns=["Post ID", "User ID", "Post Text", "Account Creation Date", "IP Address", "Location", "Post Date/Time", "Label"])

df.head()  # Show the first few rows


Unnamed: 0,Post ID,User ID,Post Text,Account Creation Date,IP Address,Location,Post Date/Time,Label
0,0,d5d1ef80-2e4b-407f-bd3d-af625c8a0903,Selling weed in bulk,2022-02-06,192.168.223.231,Unknown,2024-01-08 09:38:51,1
1,1,d79385db-a2c9-44fb-b78b-5084af18235d,Chillin' with friends!,2023-06-28,10.147.91.174,Unknown,2024-05-10 22:56:53,0
2,2,642dc020-9dce-4662-92c8-71e72d67fd8f,Got some weed?,2021-08-25,172.30.143.35,Wongfort,2024-03-10 04:15:36,1
3,3,96058e51-2841-4f8e-89bc-b25845e06012,Pure MDMA available,2020-12-05,171.125.141.105,Chapmanville,2024-06-08 05:20:45,1
4,4,ba172d71-59b8-441d-ac81-8d964f7fcd0f,Selling CBD oil,2020-12-30,142.96.54.234,East Jeremy,2024-03-22 23:09:17,1


# **<span style='color:Yellow'>Cleaning the Data </span>**

In [12]:
import string

# Cleaning text data
def clean_text(text):
    text = text.lower()  # Lowercase all text
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    return text

df['Cleaned Post Text'] = df['Post Text'].apply(clean_text)

# Handling missing locations for VPN users
df['Location'] = df['Location'].replace('Unknown', np.nan)

# Convert dates to pandas datetime
df['Account Creation Date'] = pd.to_datetime(df['Account Creation Date'])
df['Post Date/Time'] = pd.to_datetime(df['Post Date/Time'])

# Check for missing or null values
df.isnull().sum()


Post ID                    0
User ID                    0
Post Text                  0
Account Creation Date      0
IP Address                 0
Location                 350
Post Date/Time             0
Label                      0
Cleaned Post Text          0
dtype: int64

# **<span style='color:Yellow'>Feature Engineering </span>**

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Feature Engineering: Vectorize text data
vectorizer = CountVectorizer(stop_words='english', max_features=1000)  # Limit to top 1000 words
X_text = vectorizer.fit_transform(df['Cleaned Post Text'])

# Adding IP address length as a feature (proxy for VPN vs. real IP)
df['IP Length'] = df['IP Address'].apply(lambda x: len(x))

# Create feature matrix (combining text features + other features)
X = pd.concat([pd.DataFrame(X_text.toarray()), df[['IP Length']]], axis=1)
y = df['Label']  # Target variable

# Convert column names to strings
X.columns = X.columns.astype(str)

# **<span style='color:Yellow'>Machine Learning Model </span>**

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Random Forest Model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predictions and Evaluation
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

           0       1.00      1.00      1.00        75
           1       1.00      1.00      1.00       225

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300



# **<span style='color:Yellow'>Hyperparameter Tuning</span>**

In [15]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30]
}

# Grid Search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters
print(grid_search.best_params_)


Fitting 3 folds for each of 12 candidates, totalling 36 fits
{'max_depth': None, 'n_estimators': 50}


# **<span style='color:Yellow'>Model Refinement</span>**

In [16]:
# Refining the model with the best parameters
best_rf = RandomForestClassifier(**grid_search.best_params_)
best_rf.fit(X_train, y_train)

# Final evaluation
y_pred_final = best_rf.predict(X_test)
print(classification_report(y_test, y_pred_final))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        75
           1       1.00      1.00      1.00       225

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300



# **<span style='color:Yellow'>Dumping Model For Web Deployment</span>**

In [17]:
# Save the model and vectorizer
import joblib 
joblib.dump(best_rf, 'drug_detection_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')


['vectorizer.pkl']