Name: Anurag Mishra


Batch : 05 February Artificial Intelligence Batch


Project conditions:
1. Collecting and preprocessing a dataset of cyberbully tweets.
2. Implementing feature extraction techniques, such as Bag of Words or TF-IDF.
3. Selecting and training a machine learning model for tweet classification.
4. Evaluating the model's performance using appropriate metrics.
5. Optionally exploring real-time detection capabilities and addressing ethical considerations.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [None]:
# Path of the cyberbullying tweets CSV (presumably a mounted Google Drive folder).
file_path = "/content/drive/MyDrive/Blend Vidya Internship Assignment/cyberbullying_tweets.csv"
# Read the entire CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Show a heading followed by the leading rows of the DataFrame.
print("First few rows of the dataset:", data.head(), sep="\n")

First few rows of the dataset:
                                          tweet_text cyberbullying_type
0  In other words #katandandre, your food was cra...  not_cyberbullying
1  Why is #aussietv so white? #MKR #theblock #ImA...  not_cyberbullying
2  @XochitlSuckkks a classy whore? Or more red ve...  not_cyberbullying
3  @Jason_Gio meh. :P  thanks for the heads up, b...  not_cyberbullying
4  @RudhoeEnglish This is an ISIS account pretend...  not_cyberbullying


In [None]:
# Report the number of null entries in each column.
print("\nMissing values:", data.isna().sum(), sep="\n")


Missing values:
tweet_text            0
cyberbullying_type    0
dtype: int64


In [None]:
# Count how many tweets carry each label to check the class balance.
label_column = data['cyberbullying_type']
print("\nDistribution of cyberbullying_type:")
print(label_column.value_counts())


Distribution of cyberbullying_type:
cyberbullying_type
religion               7998
age                    7992
gender                 7973
ethnicity              7961
not_cyberbullying      7945
other_cyberbullying    7823
Name: count, dtype: int64


In [None]:
# Drop every character that is not an ASCII letter or whitespace, then lowercase.
# regex=True is required: since pandas 2.0, Series.str.replace treats the pattern
# as a literal string by default, which would leave the tweets untouched.
# The raw string keeps "\s" from being parsed as an (invalid) Python escape.
data['clean_text'] = (
    data['tweet_text']
    .str.replace(r'[^a-zA-Z\s]', '', regex=True)
    .str.lower()
)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state=42 makes the split repeatable.
# Inputs are the cleaned tweet text, targets are the cyberbullying_type labels.
X_train, X_test, y_train, y_test = train_test_split(data['clean_text'], data['cyberbullying_type'], test_size=0.2, random_state=42)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Limit the TF-IDF vocabulary to at most 1000 terms.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
# Fit on the training split only, then apply the same fitted vectorizer to the
# test split so no test data influences the learned vocabulary.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Print (rows, features) for both TF-IDF matrices.
print("\nShape of feature vectors:")
for split_label, matrix in (("Training set:", X_train_tfidf), ("Testing set:", X_test_tfidf)):
    print(split_label, matrix.shape)


Shape of feature vectors:
Training set: (38153, 1000)
Testing set: (9539, 1000)


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Baseline classifier: Multinomial Naive Bayes constructed with no arguments.
model = MultinomialNB()

First, we try Multinomial Naive Bayes and look at its accuracy score.

In [None]:
# Train the classifier on the TF-IDF features and labels of the training split.
model.fit(X_train_tfidf, y_train)

In [None]:
# Predict a label for every row of the test-split TF-IDF matrix.
y_pred = model.predict(X_test_tfidf)

In [None]:
# Per-class precision/recall/F1 for the test predictions, under a heading.
report_text = classification_report(y_test, y_pred)
print("Classification Report:", report_text, sep="\n")


Classification Report:
                     precision    recall  f1-score   support

                age       0.80      0.96      0.87      1603
          ethnicity       0.90      0.89      0.89      1603
             gender       0.81      0.79      0.80      1531
  not_cyberbullying       0.60      0.45      0.51      1624
other_cyberbullying       0.59      0.53      0.56      1612
           religion       0.79      0.95      0.86      1566

           accuracy                           0.76      9539
          macro avg       0.75      0.76      0.75      9539
       weighted avg       0.75      0.76      0.75      9539



In [None]:
# Confusion matrix of true labels against predicted labels on the test split.
print("\nConfusion Matrix:")
print(confusion_matrix(y_true=y_test, y_pred=y_pred))


Confusion Matrix:
[[1537    1    6   34   20    5]
 [  38 1433    7   20   49   56]
 [  23   27 1206  143   98   34]
 [ 166   55  110  729  396  168]
 [ 163   75  152  242  847  133]
 [   6    9   12   40   16 1483]]


In [None]:
# Fraction of test tweets whose predicted label matches the true label.
print("\nAccuracy Score:")
print(accuracy_score(y_true=y_test, y_pred=y_pred))


Accuracy Score:
0.7584652479295524


Since Multinomial Naive Bayes reached an accuracy of only about 0.76, we test other algorithms as well to see whether we can achieve better results.

In [None]:
# Rebind tfidf_vectorizer to a fresh, unfitted vectorizer (same 1000-term cap);
# it is used as the first step of the pipelines built in the loop below.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

In [None]:
# Candidate classifiers keyed by display name, each constructed with no
# arguments. Insertion order is the order in which they are iterated later.
models = dict([
    ('Multinomial Naive Bayes', MultinomialNB()),
    ('Support Vector Machine', SVC()),
    ('Logistic Regression', LogisticRegression()),
    ('Random Forest', RandomForestClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier()),
])

In [None]:
# Fit and score every candidate inside the loop so that each model is actually
# trained and evaluated, not only the last pipeline left bound after the loop.
# Per-model scores are kept in `results` for later comparison.
results = {}
for name, model in models.items():
    print(f"Training and evaluating {name}...")
    # All pipelines share the one `tfidf_vectorizer` object; it is refitted on
    # the same training text each iteration, so `tfidf_vectorizer` and the last
    # `model` remain fitted and usable by later cells.
    pipeline = Pipeline([
        ('tfidf', tfidf_vectorizer),
        ('model', model)
    ])
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)

    scores = {
        'Accuracy': accuracy_score(y_test, predictions),
        'Precision': precision_score(y_test, predictions, average='weighted'),
        'Recall': recall_score(y_test, predictions, average='weighted'),
        'F1-score': f1_score(y_test, predictions, average='weighted'),
    }
    # ROC AUC needs class probabilities; estimators without predict_proba
    # (e.g. SVC() built without probability=True) are scored without it.
    if hasattr(pipeline, 'predict_proba'):
        scores['ROC AUC'] = roc_auc_score(
            y_test, pipeline.predict_proba(X_test), multi_class='ovr'
        )
    results[name] = scores

    for metric_name, metric_value in scores.items():
        print(f"{metric_name}: {metric_value:.4f}")
    print('-' * 50)

Training and evaluating Multinomial Naive Bayes...
Training and evaluating Support Vector Machine...
Training and evaluating Logistic Regression...
Training and evaluating Random Forest...
Training and evaluating Gradient Boosting...


In [None]:
# `pipeline` is whatever the loop cell above bound last; given the order of the
# `models` dict that is the Gradient Boosting pipeline. Fit it on the raw
# training text (the pipeline's own TF-IDF step vectorizes it).
pipeline.fit(X_train, y_train)

In [None]:
# Predict labels for the raw test text with the fitted pipeline (overwrites the
# earlier Naive Bayes predictions stored in y_pred).
y_pred = pipeline.predict(X_test)

In [None]:
# ROC AUC needs class probabilities; compute them once up front.
class_probabilities = pipeline.predict_proba(X_test)

accuracy = accuracy_score(y_test, y_pred)
# Precision, recall and F1 are each averaged across classes, weighted by support.
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)
# One-vs-rest ROC AUC over the predicted class probabilities.
roc_auc = roc_auc_score(y_test, class_probabilities, multi_class='ovr')


In [None]:
# Print each metric to four decimal places, followed by a separator rule.
for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
    ("ROC AUC", roc_auc),
):
    print(f"{metric_label}: {metric_value:.4f}")
print('-' * 50)

Accuracy: 0.8247
Precision: 0.8433
Recall: 0.8247
F1-score: 0.8241
ROC AUC: 0.9665
--------------------------------------------------


Accuracy: 0.8247
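Accuracy measures the proportion of correctly classified instances out of the total number of instances. An accuracy of 0.8247 indicates that approximately 82.47% of the instances were correctly classified by the model. Note that these figures belong to the last pipeline built in the loop above (Gradient Boosting), because that is the `pipeline` object the evaluation cells operate on.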


Precision: 0.8433
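Precision measures the proportion of true positive predictions out of all positive predictions made by the model. Because this is a six-class problem, the value reported here is the support-weighted average of the per-class precisions (`average='weighted'`). A precision of 0.8433 indicates that, averaged this way, approximately 84.33% of the tweets assigned to a class truly belonged to that class.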


Recall: 0.8247
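Recall, also known as sensitivity or true positive rate, measures the proportion of true positive predictions out of all actual positive instances. As with precision, the value reported here is the support-weighted average of the per-class recalls, which is why it equals the accuracy. A recall of 0.8247 indicates that, averaged this way, approximately 82.47% of the tweets belonging to a class were correctly identified as that class.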


F1-score: 0.8241
F1-score is the harmonic mean of precision and recall, providing a balanced measure of a model's performance. A higher F1-score indicates better overall performance. An F1-score of 0.8241 suggests a good balance between precision and recall.


ROC AUC: 0.9665
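ROC AUC (Receiver Operating Characteristic Area Under the Curve) measures the area under the ROC curve, which represents the trade-off between true positive rate (sensitivity) and false positive rate. For this multi-class task it is computed one-vs-rest (`multi_class='ovr'`): each class in turn is treated as the positive class against all others, and the per-class scores are averaged. A higher ROC AUC score indicates better discrimination between positive and negative classes. A score of 0.9665 suggests that the model has excellent discriminatory power.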

In [None]:
# None of the cells above imports joblib, so import it here; otherwise this
# cell raises NameError when the notebook is run top to bottom.
import joblib

# Persist the classifier and the TF-IDF vectorizer as separate files; the
# serving code further down loads them back under these exact names.
joblib.dump(model, "trained_model.pkl")
joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']

In [None]:
!apt install redis


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libjemalloc2 liblua5.1-0 liblzf1 lua-bitop lua-cjson redis-server redis-tools
Suggested packages:
  ruby-redis
The following NEW packages will be installed:
  libjemalloc2 liblua5.1-0 liblzf1 lua-bitop lua-cjson redis redis-server redis-tools
0 upgraded, 8 newly installed, 0 to remove and 45 not upgraded.
Need to get 1,276 kB of archives.
After this operation, 5,793 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libjemalloc2 amd64 5.2.1-4ubuntu1 [240 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 liblua5.1-0 amd64 5.1.5-8.1build4 [99.9 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 liblzf1 amd64 3.6-3 [7,444 B]
Get:4 http://archive.ubuntu.com/ubuntu jammy/universe amd64 lua-bitop amd64 1.0.2-5 [6,680 B]
Get:5 http://archive.ubuntu.com/ubuntu jammy/u

In [None]:
!pip install tweepy




In [None]:
import tweepy
print(tweepy.__version__)


4.14.0


In [None]:
from flask import Flask
import tweepy
import joblib

import os

# Load the trained model and TF-IDF vectorizer
# NOTE: joblib.load unpickles its input; only load files this notebook wrote.
model = joblib.load("trained_model.pkl")
tfidf_vectorizer = joblib.load("tfidf_vectorizer.pkl")

# Set up Twitter API credentials
# Read them from the environment so real secrets never have to be written into
# the notebook; the original placeholder strings remain as fallbacks.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'your_consumer_key')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'your_consumer_secret')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'your_access_token')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', 'your_access_token_secret')

# Authenticate with Twitter API
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# wait_on_rate_limit=True makes the client wait instead of failing on rate limits.
api = tweepy.API(auth, wait_on_rate_limit=True)

# Create a Flask app
app = Flask(__name__)

# Landing page served at the site root
@app.route('/')
def home():
    """Return the plain-text greeting shown on the home page."""
    greeting = 'Welcome to Real-Time Cyberbully Detection!'
    return greeting

# Streaming handler used by the real-time detection route
class _CyberbullyStream(tweepy.Stream):
    """Classify each streamed tweet and alert on any cyberbullying label.

    Tweepy 4.x has no ``StreamListener``; callbacks such as ``on_status`` are
    overridden directly on a ``tweepy.Stream`` subclass instead.
    """

    def on_status(self, status):
        """Run one incoming status through the model and alert if flagged."""
        tweet_text = status.text
        # Preprocess the tweet text
        clean_text = preprocess_tweet(tweet_text)
        # Vectorize the tweet text
        vectorized_text = tfidf_vectorizer.transform([clean_text])
        # Make a prediction using the model
        prediction = model.predict(vectorized_text)[0]
        # The model's labels are the dataset's cyberbullying_type values (age,
        # ethnicity, gender, religion, other_cyberbullying, not_cyberbullying).
        # There is no bare 'cyberbullying' label, so alert on every label
        # except the negative class.
        if prediction != 'not_cyberbullying':
            alert_moderator(tweet_text)


# Define a route for real-time detection
@app.route('/realtime-detection', methods=['GET'])
def realtime_detection():
    """Start a background tweet stream and return a confirmation message.

    Returns:
        A plain-text string confirming that detection has started.
    """
    # Tweepy 4.x streams take the four OAuth 1.0a credentials directly.
    my_stream = _CyberbullyStream(
        consumer_key, consumer_secret, access_token, access_token_secret
    )
    # threaded=True (formerly is_async=True) runs the stream in its own thread
    # so this request can return immediately.
    my_stream.filter(track=['cyberbully', 'cyberbullying'], threaded=True)
    return 'Real-time detection has started.'

def preprocess_tweet(tweet_text):
    """Clean a raw tweet with the same rule used to build the training text.

    The training column ``clean_text`` is produced by removing every character
    that is not an ASCII letter or whitespace and lowercasing the rest; text
    seen at prediction time must go through the same transformation.

    Args:
        tweet_text: Raw tweet text, or ``None``.

    Returns:
        The cleaned, lowercased text; an empty string when ``tweet_text`` is
        ``None``.
    """
    import re  # local import keeps this function self-contained

    if tweet_text is None:
        return ''
    return re.sub(r'[^a-zA-Z\s]', '', tweet_text).lower()

def alert_moderator(tweet_text):
    """Print a one-line notice containing the flagged tweet's text."""
    notice_prefix = "Cyberbullying detected:"
    print(notice_prefix, tweet_text)

# Run the Flask app
if __name__ == '__main__':
    # debug=True would also enable the stat reloader, which restarts the
    # interpreter and does not work from inside a notebook kernel. Keep debug
    # mode but switch the reloader off.
    # NOTE(review): port 903 is below 1024 and normally needs elevated
    # privileges; kept unchanged so the service URL stays the same.
    app.run(debug=True, use_reloader=False, port=903)



 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:903
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat
