In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from tkinter import filedialog
from tkinter import Tk
import PySimpleGUI as sg





# Load the labelled tweets from a CSV file located in the current working directory
df = pd.read_csv('cyberbullying_tweets.csv')

# Print the first five rows as a quick sanity check of the loaded columns
print(df.head())

                                          tweet_text cyberbullying_type
0  In other words #katandandre, your food was cra...  not_cyberbullying
1  Why is #aussietv so white? #MKR #theblock #ImA...  not_cyberbullying
2  @XochitlSuckkks a classy whore? Or more red ve...  not_cyberbullying
3  @Jason_Gio meh. :P  thanks for the heads up, b...  not_cyberbullying
4  @RudhoeEnglish This is an ISIS account pretend...  not_cyberbullying


In [3]:
# Split the frame into the model input (raw tweet text) and the target (category label)
X, y = df['tweet_text'], df['cyberbullying_type']


In [4]:
# Map the string class labels to integer codes. The fitted encoder is kept in
# `label_encoder` so predictions can be mapped back to label names later on.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

In [5]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Convert text to numerical TF-IDF features using unigrams and bigrams.
# The vocabulary is capped at 2000 terms and values are stored as float32
# to keep the feature matrices small.
# The dtype is passed as the NumPy type rather than the string 'float32':
# scikit-learn checks it against NumPy float types, and a string can trigger a
# spurious "will be converted to np.float64" UserWarning.
tfidf = TfidfVectorizer(max_features=2000, ngram_range=(1, 2), dtype=np.float32)
X_train_tfidf = tfidf.fit_transform(X_train)  # vocabulary and IDF weights come from the training split only
X_test_tfidf = tfidf.transform(X_test)  # reuse the fitted vocabulary for the test split




In [7]:
# Make sure both feature matrices are held in sparse CSR format
X_train_tfidf, X_test_tfidf = X_train_tfidf.tocsr(), X_test_tfidf.tocsr()

In [8]:
# Train an XGBoost classifier on the TF-IDF training features.
# Only the random seed is set explicitly; every other hyperparameter is left
# at the library default.
model = XGBClassifier(random_state=42)
model.fit(X_train_tfidf, y_train)



In [9]:
# Predict on the held-out split, then map the integer class codes of both the
# ground truth and the predictions back to their original label names
y_test_decoded = label_encoder.inverse_transform(y_test)
y_pred = model.predict(X_test_tfidf)
y_pred_decoded = label_encoder.inverse_transform(y_pred)

In [10]:
# Report overall accuracy followed by the per-class precision / recall / F1 table
test_accuracy = accuracy_score(y_test_decoded, y_pred_decoded)
test_report = classification_report(y_test_decoded, y_pred_decoded)
print("Accuracy:", test_accuracy)
print("Classification Report:\n", test_report)

Accuracy: 0.8306950414089527
Classification Report:
                      precision    recall  f1-score   support

                age       0.98      0.98      0.98      1603
          ethnicity       1.00      0.98      0.99      1603
             gender       0.91      0.81      0.86      1531
  not_cyberbullying       0.64      0.48      0.55      1624
other_cyberbullying       0.57      0.80      0.67      1612
           religion       0.96      0.94      0.95      1566

           accuracy                           0.83      9539
          macro avg       0.84      0.83      0.83      9539
       weighted avg       0.84      0.83      0.83      9539



In [10]:
eJytJtMVaRWRNPljbKn1NclyVIHXlNw9ZcScIq6uIWksR2lkd1m8VmsGba3XBolZchiUIWs7IPkwxxpzYZ27VauCcH25VSJaR5CeIs6QM7Tocoz6ObT6cp3YMNDTQz53NBCewbi3TWG9ltj2ZgWn54zPZ7UgRWlKcfGexqv4ebW11JlAbHnhRmWaZMXOJqzMaUWV9UuGIjjVo9xNL7CEJsOGYRWp1glzRkmDlQyBcM3dQUiGO0iVJdOkY5Ww5hkBaBW45spWITi5wCi7TEmdF1tPZGU2xhhtcg3UQXitOFiUJBQrYVXDRyo7YWWvsyizLtC4JPDPbb2B1zwZYbWj5q50IAjzojijIYi1wgi8Qj3eVFzrdcGh9NtPZWXFJaJXR7CdII6GIljgUA2wOdDmIayeIxiiwaifRQG2F70fZVUdlmzcc73rVPldZ3CYIw6YIpjPAUyYLMzcEt3JLpzgIywjMnjLUSiTLuC0JPESYLXqRwl1RVXHhhwPakXWJ0lhcNysI76fIajnAPySLEzhE93wLNztIxw4MMj4YjiSLDCJJ3FfbbWlFcpzbQEYFlklZaHUJYlHcq3NM4iQOQiSJOujYZWP54kJaGWo5aprcdGLFn0iaOGfFKrrNITIUd5jQnGkdVtCYhWUl8sPLxmUNZvzb4SHIZsFIvkGlFQKQhWKRdklcHm4VYzIcbylIM6HIcj6E2w7Nmi04lxiO6TpMnujMWTLg1xzLejKUXivfoQ8=V=r0c40a65e140af43d854dd8a314ea6cf03c83b34f713f57173ffaf454068a92b09f38c91aa1124353149a818b82195c9a2dcac180177895b976550cd261bee8b8c4907512dd74fb8e53d0d48a66d1781e0d21c03e0ab96ee60e272024eb78f1432fa9eee4b00f24d34254875b353bbb53c1a1c5e5c05f0449504d9e9a2ebee6b2261c6e40463dbcc6271bd117d32c096a843514b45e50a7448afe1bf85c24947fbc138cd319b064e9ed1a37d899f67ab0205799a1b25c80da527d8501ea4783bc10a5a0c9eb0ef38c741962851d2b0dcfbf61ce78fde7910b7109654ab6ddc326cbf1a86a1aa26f7e9ff4de2457002bf96353cb9812ea647ff137334eb662be6562a40b4f52070a72b64d7bd3e3fe07e2eb173bbefd137d0deff53914166d0dc9ff73bd02bc5339b96dd7872ac0758975fc9f86b4e3ab31dd2e1ddd0b04bcba4786c4ef57d238f35da7d63ab7c9b7a4c6d61353771e047c73fe25f502ef222f283357231996675347a872f9c91e020d46c96c0e5f2e01636c9ce4fc76b48e01f52a7dc61ed30eef5de7e7a24c7aabf25b964c84064db0e3bb3254a1e7b474080e7ce60c1984db29097148d041b0e4f13c6a2ed817655e05794e8d558619fa51a0766e0140ed4dc535137575e44ed92179e115e0a61060184650dab7417cddaa33ff8dd9b49eec727ff78d49bad9839b00d3c16d80c2a9f5b5c81fd7d6383012f7

In [1]:
pip install pysimplegui

Collecting pysimplegui
  Downloading PySimpleGUI-5.0.8-py3-none-any.whl.metadata (6.6 kB)
Collecting rsa (from pysimplegui)
  Downloading rsa-4.9-py3-none-any.whl.metadata (4.2 kB)
Collecting pyasn1>=0.1.3 (from rsa->pysimplegui)
  Downloading pyasn1-0.6.1-py3-none-any.whl.metadata (8.4 kB)
Downloading PySimpleGUI-5.0.8-py3-none-any.whl (1.1 MB)
   ---------------------------------------- 0.0/1.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.1 MB ? eta -:--:--
   --------- ------------------------------ 0.3/1.1 MB ? eta -:--:--
   ------------------- -------------------- 0.5/1.1 MB 1.2 MB/s eta 0:00:01
   ---------------------------- ----------- 0.8/1.1 MB 1.1 MB/s eta 0:00:01
   ---------------------------------------- 1.1/1.1 MB 1.0 MB/s eta 0:00:00
Downloading rsa-4.9-py3-none-any.whl (34 kB)
Downloading pyasn1-0.6.1-py3-none-any.whl (83 kB)
Installing collected packages: pyasn1, rsa, pysimplegui
Successfully installed pyasn1-0.6.1 pysimplegui-5.0.8 rsa-4.9
Note

In [15]:
# Function to create a circular bar chart with labels and percentages
def create_circular_bar_chart(data, labels):
    """Show a ring-shaped pie chart of ``data`` with one boxed label per wedge.

    Each wedge displays its percentage share in white text, and its label is
    placed outside the ring in a boxed annotation connected to the wedge by a
    line. The figure is displayed with ``plt.show()``.

    Parameters
    ----------
    data : sequence of numbers
        Wedge sizes, passed straight to ``Axes.pie``.
    labels : sequence
        One label per entry of ``data``, in the same order. Any iterable is
        accepted (list, pandas Index, pandas Series, ...).

    Returns
    -------
    None
    """
    # Materialise the labels so the lookup below is always positional. Indexing
    # a pandas Series with ``labels[i]`` would be label-based and could fail.
    labels = list(labels)

    fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(aspect="equal"))
    wedges, _, _ = ax.pie(
        data,
        wedgeprops=dict(width=0.5),  # width < radius turns the pie into a ring
        startangle=-40,
        autopct='%1.1f%%',
        textprops=dict(color="w"),
    )

    # Add labels
    bbox_props = dict(boxstyle="square,pad=0.3", fc="w", ec="k", lw=0.72)
    for i, p in enumerate(wedges):
        # Mid-angle of the wedge (degrees) and the matching point on the unit circle
        ang = (p.theta2 - p.theta1) / 2. + p.theta1
        y = np.sin(np.deg2rad(ang))
        x = np.cos(np.deg2rad(ang))

        # Labels go to the left or right of the chart depending on which half
        # the wedge sits in; a wedge exactly on the vertical axis falls back to
        # the right-hand side instead of raising a KeyError.
        side = 1 if x >= 0 else -1
        horizontalalignment = "left" if side > 0 else "right"

        # Build the arrow style per wedge rather than mutating a shared dict
        arrowprops = dict(
            arrowstyle="-",
            connectionstyle=f"angle,angleA=0,angleB={ang}",
        )
        ax.annotate(labels[i], xy=(x, y), xytext=(1.35 * side, 1.4 * y),
                    xycoords='data', textcoords='data',
                    horizontalalignment=horizontalalignment,
                    bbox=bbox_props, arrowprops=arrowprops)

    ax.set_title("Distribution of Predicted Cyberbullying Types")
    plt.show()

In [11]:
# Interactive loop for user input.
# Relies on objects created by other cells: the fitted `tfidf` vectorizer, the
# trained `model`, the fitted `label_encoder` and `create_circular_bar_chart`.
while True:
    # Strip surrounding whitespace so that inputs such as " 3" still match an option
    user_choice = input("Choose an option:\n1. Predict cyberbullying type for a single tweet\n2. Analyze a dataset\n3. Exit\nEnter your choice: ").strip()

    if user_choice == '1':
        # Ask the user to input text
        user_input = input("Enter a tweet to predict its cyberbullying type (or type 'exit' to quit): ")

        # Exit the loop if the user types 'exit'
        if user_input.strip().lower() == 'exit':
            print("Exiting the program. Goodbye!")
            break

        # Blank input carries no text to classify, so go back to the menu
        # instead of reporting a prediction for it.
        if not user_input.strip():
            print("No text entered. Returning to main menu.")
            continue

        # Transform the user input into TF-IDF features
        user_input_tfidf = tfidf.transform([user_input])

        # Predict the cyberbullying type
        prediction = model.predict(user_input_tfidf)

        # Decode the prediction
        predicted_label = label_encoder.inverse_transform(prediction)[0]

        # Print the result
        print(f"Predicted cyberbullying type: {predicted_label}\n")

    elif user_choice == '2':
        # Create a file dialog to select the dataset file
        dataset_path = sg.popup_get_file("Select Dataset File", file_types=(("CSV Files", "*.csv"),))

        if not dataset_path:
            print("No file selected. Returning to main menu.")
            continue

        # Load the dataset with specified data types.
        # Any failure is reported and the loop returns to the menu, so a bad
        # file never terminates the interactive session.
        try:
            dtype_spec = {'tweet_text': str}
            new_df = pd.read_csv(dataset_path, dtype=dtype_spec)

            # Fail with a readable message when the expected column is absent
            if 'tweet_text' not in new_df.columns:
                raise ValueError("the dataset has no 'tweet_text' column")

            # Define new_X; rows with missing text are dropped because the
            # vectorizer rejects missing values and would fail the whole file.
            new_X = new_df['tweet_text'].dropna()
            skipped_rows = len(new_df) - len(new_X)
            if skipped_rows:
                print(f"Skipping {skipped_rows} row(s) with missing tweet text.")
            if new_X.empty:
                raise ValueError("the dataset contains no tweet text to analyze")

            # Transform the new dataset into TF-IDF features
            new_X_tfidf = tfidf.transform(new_X)

            # Predict the cyberbullying types for the new dataset
            new_predictions = model.predict(new_X_tfidf)
            new_predictions_decoded = label_encoder.inverse_transform(new_predictions)

            # Count the occurrences of each predicted cyberbullying type
            prediction_counts = pd.Series(new_predictions_decoded).value_counts()

            # Create a circular bar chart with labels and percentages
            create_circular_bar_chart(prediction_counts.values, prediction_counts.index)

        except Exception as e:
            print(f"Error loading or processing the dataset: {e}")

    elif user_choice == '3':
        print("Exiting the program. Goodbye!")
        break

    else:
        print("Invalid choice. Please choose a valid option.")

Choose an option:
1. Predict cyberbullying type for a single tweet
2. Analyze a dataset
3. Exit
Enter your choice:  3


Exiting the program. Goodbye!


In [12]:
import joblib

# Write the three fitted objects to files in the current working directory.
# 'model' is the trained classifier, 'tfidf' is the fitted TF-IDF vectorizer and
# 'label_encoder' is the fitted label encoder; all three must already exist in
# the kernel from the cells above.
# The return value of the last dump (a list with the written filename) is what
# the cell displays as its output.
joblib.dump(model, 'hack2_model.pkl')
joblib.dump(tfidf, 'hack2_tfidf_vectorizer.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')

['label_encoder.pkl']