In [None]:
import os
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Location of the labelled dataset (example file name, adjust as needed)
DATASET_PATH = 'social_media_phishing.csv'

# Fetch the NLTK stop-word corpus that the text preprocessing relies on
nltk.download('stopwords')

# Read the dataset into a DataFrame
df = pd.read_csv(DATASET_PATH)

# Preprocess text data
# English stop-word set, loaded from the NLTK corpus on first use and then reused
_ENGLISH_STOP_WORDS = None


def preprocess_text(text):
    """Normalise a raw message into a space-separated string of lowercase tokens.

    Steps, in order: strip ``http...`` URLs, drop every character that is not
    an ASCII letter or whitespace, lowercase, and remove English stop words.

    Args:
        text: Raw message text. Any non-string value (for example ``None`` or
            the float NaN that pandas uses for a missing cell) is treated as
            an empty message.

    Returns:
        The cleaned text, or ``''`` when ``text`` is not a string.
    """
    global _ENGLISH_STOP_WORDS

    # re.sub raises TypeError on non-strings, so bail out before touching them
    if not isinstance(text, str):
        return ''

    # Build the stop-word set once instead of re-reading the corpus per call
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords
    return ' '.join(word for word in text.split() if word not in _ENGLISH_STOP_WORDS)

# Apply preprocessing to the 'message' column.
# Missing messages (NaN) are replaced with '' and every value is coerced to str
# first, because the regex calls in preprocess_text only accept strings.
df['cleaned_message'] = df['message'].fillna('').astype(str).apply(preprocess_text)

# Split data into features and labels
X = df['cleaned_message']
y = df['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text to TF-IDF features (vocabulary capped at 5000 terms).
# The vectorizer is fitted on the training split only and then reused to
# transform the test split.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest on the TF-IDF training matrix (seeded for repeatability)
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_tfidf, y_train)

# Predict labels for the held-out test matrix
y_pred = model.predict(X_test_tfidf)

# Score the predictions against the true test labels
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Emit every metric in one call; sep="\n" puts each item on its own line
print(
    f"Accuracy: {accuracy * 100:.2f}%",
    "Confusion Matrix:",
    conf_matrix,
    "Classification Report:",
    class_report,
    sep="\n",
)

In [None]:
import tweepy

# Set up Twitter API authentication.
# Credentials are read from environment variables so real secrets never have
# to be saved inside the notebook; the placeholder strings are kept as
# fallbacks for when a variable is not set.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'your_consumer_key')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'your_consumer_secret')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'your_access_token')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', 'your_access_token_secret')

# Build the OAuth handler from the four credentials and wrap it in an API client
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# Function to clean and predict phishing in real-time tweets
def clean_and_predict(tweet):
    """Classify a single tweet with the trained model.

    The tweet is cleaned with ``preprocess_text``, vectorised with the fitted
    ``tfidf`` vectorizer and passed to ``model``.

    Args:
        tweet: Raw tweet text.

    Returns:
        ``"Phishing Alert!"`` when the predicted label equals 1, otherwise
        ``"Safe"``.
    """
    cleaned_tweet = preprocess_text(tweet)
    # transform takes a collection of documents, hence the one-element list
    tweet_tfidf = tfidf.transform([cleaned_tweet])
    # predict returns an array with one label per input row; pull out the
    # single label instead of relying on the truth value of a 1-element array
    predicted_label = model.predict(tweet_tfidf)[0]
    # NOTE(review): assumes label 1 marks phishing in the training data - confirm
    return "Phishing Alert!" if predicted_label == 1 else "Safe"

# Stream tweets based on a keyword or hashtag
# NOTE(review): tweepy.StreamListener appears to exist only in Tweepy 3.x
# (merged into tweepy.Stream in 4.0) - confirm the installed version.
class MyStreamListener(tweepy.StreamListener):
    """Stream listener that prints each incoming tweet and its phishing verdict."""

    def on_status(self, status):
        """Print the tweet text and the result of ``clean_and_predict`` for it.

        Args:
            status: The status object delivered by the stream.
        """
        # Long tweets arrive with a truncated ``text``; when the payload
        # carries an ``extended_tweet`` section, classify its full text instead
        extended = getattr(status, 'extended_tweet', None)
        tweet = extended['full_text'] if extended else status.text
        print(f"New tweet: {tweet}")
        result = clean_and_predict(tweet)
        print(f"Prediction: {result}")

    def on_error(self, status_code):
        """Report a stream error and stop streaming.

        Args:
            status_code: HTTP status code reported by the stream.

        Returns:
            ``False``, which tells the stream to disconnect.
        """
        # Make the failure visible rather than letting the stream end silently
        print(f"Stream error: HTTP {status_code}")
        return False

# Keywords and languages the stream is filtered on
TRACKED_KEYWORDS = ["password", "login", "account", "urgent"]
TRACKED_LANGUAGES = ["en"]

# Create the listener, attach it to a stream and start filtering on the tracked keywords
my_listener = MyStreamListener()
my_stream = tweepy.Stream(auth=api.auth, listener=my_listener)
my_stream.filter(track=TRACKED_KEYWORDS, languages=TRACKED_LANGUAGES)