In [23]:
import pandas as pd

# Load the dataset
data = pd.read_csv('toxicity_en.csv')

# Check the first few rows
print(data.head())

# Check for missing values
print(data.isnull().sum())

# Check the class distribution
print(data['is_toxic'].value_counts())


                                                text is_toxic
0  Elon Musk is a piece of shit, greedy capitalis...    Toxic
1  The senile credit card shrill from Delaware ne...    Toxic
2  He does that a lot -- makes everyone look good...    Toxic
3                                         F*ck Lizzo    Toxic
4  Epstein and trump were best buds!!! Pedophiles...    Toxic
text        0
is_toxic    0
dtype: int64
is_toxic
Toxic        501
Not Toxic    499
Name: count, dtype: int64


In [24]:
# Map labels to binary values
data['label'] = data['is_toxic'].map({'Toxic': 1, 'Not Toxic': 0})

# Basic text preprocessing
import re

def preprocess(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())  # Remove non-alphabetic characters and lowercase
    return text

# Apply preprocessing to the text column
data['text'] = data['text'].apply(preprocess)


In [25]:
from sklearn.model_selection import train_test_split

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['label'], test_size=0.2, random_state=42
)


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

# Fit and transform the training data, and transform the test data
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [27]:

# Step 2: Save the vectorizer to a file
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
print("TF-IDF Vectorizer saved.")

# Step 3: Load the vectorizer for future use
vectorizer = joblib.load('tfidf_vectorizer.pkl')
print("TF-IDF Vectorizer loaded.")

TF-IDF Vectorizer saved.
TF-IDF Vectorizer loaded.


In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Train the Logistic Regression model
log_reg = LogisticRegression()
log_reg.fit(X_train_tfidf, y_train)

# Predict and evaluate
y_pred = log_reg.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.86      0.79      0.82       104
           1       0.79      0.86      0.83        96

    accuracy                           0.82       200
   macro avg       0.83      0.83      0.82       200
weighted avg       0.83      0.82      0.82       200



In [29]:
from sklearn.ensemble import RandomForestClassifier

# Train the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, y_train)

# Predict and evaluate
y_pred_rf = rf_model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred_rf))


              precision    recall  f1-score   support

           0       0.72      0.78      0.75       104
           1       0.74      0.68      0.71        96

    accuracy                           0.73       200
   macro avg       0.73      0.73      0.73       200
weighted avg       0.73      0.73      0.73       200



In [30]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Initialize the tokenizer and fit on the training data
vocab_size = 5000  # Limit vocabulary size to 5000 unique words
max_len = 100  # Maximum length of each sequence

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Convert text to sequences and pad them to ensure uniform length
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')


In [31]:
import joblib

# Save Logistic Regression model (or any other trained model)
joblib.dump(log_reg, 'logistic_regression_model.pkl')


['logistic_regression_model.pkl']

In [32]:
import joblib

# Save the Random Forest model to a file
joblib.dump(rf_model, 'random_forest_model.pkl')



['random_forest_model.pkl']

In [33]:
import joblib

# Assuming `tokenizer` is already fitted on your training data
with open('tokenizer.pkl', 'wb') as f:
    joblib.dump(tokenizer, f)

print("Tokenizer saved successfully!")


Tokenizer saved successfully!


In [35]:
# Step 1: Import Libraries
import joblib  # For loading models
import numpy as np  # For handling arrays
from keras.models import load_model 
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Step 2: Load the Models
log_reg_model = joblib.load('logistic_regression_model.pkl')
print("Logistic Regression Model Loaded")

random_forest_model = joblib.load('random_forest_model.pkl')
print("Random Forest Model Loaded")

# Load the Tokenizer and Vectorizer
tokenizer = joblib.load('tokenizer.pkl')  # Load your tokenizer here
print("Tokenizer Loaded")

# Load the vectorizer used during model training (this should match what was used)
vectorizer = joblib.load('tfidf_vectorizer.pkl')  # Assuming you saved the vectorizer
print("Vectorizer Loaded")

# Step 4: Define a Function to Make Predictions
def predict(text):
    # Get features for Logistic Regression and Random Forest
    vectorized_features = vectorizer.transform([text]).toarray()  # Get the feature vector
    log_reg_prediction = log_reg_model.predict(vectorized_features)
    random_forest_prediction = random_forest_model.predict(vectorized_features)

    # Combine predictions for final output
    final_prediction = log_reg_prediction[0] or random_forest_prediction[0]

    return {
        "log_reg": log_reg_prediction[0],
        "random_forest": random_forest_prediction[0],
        "final_prediction": final_prediction
    }

# Step 5: Input Loop for User Interaction
while True:
    text_input = input("Enter your text (or 'exit' to quit): ")
    if text_input.lower() == 'exit':
        break

    predictions = predict(text_input)
    
    # Step 6: Display Predictions
    print("\nModel Predictions:")
    print(f"Logistic Regression: {'Toxic' if predictions['log_reg'] else 'Not Toxic'}")
    print(f"Random Forest: {'Toxic' if predictions['random_forest'] else 'Not Toxic'}")


Logistic Regression Model Loaded
Random Forest Model Loaded
Tokenizer Loaded
Vectorizer Loaded


Enter your text (or 'exit' to quit):  fuck



Model Predictions:
Logistic Regression: Toxic
Random Forest: Toxic


Enter your text (or 'exit' to quit):  Hello IBM



Model Predictions:
Logistic Regression: Not Toxic
Random Forest: Not Toxic


Enter your text (or 'exit' to quit):  exit


In [36]:
from sklearn2pmml import PMMLPipeline
from sklearn2pmml import sklearn2pmml

# Step 9: Convert Logistic Regression and Random Forest models to PMML
def convert_sklearn_models_to_pmml():
    # Logistic Regression PMML conversion
    log_reg_pipeline = PMMLPipeline([("classifier", log_reg_model)])  # Creating a pipeline for Logistic Regression
    sklearn2pmml(log_reg_pipeline, "logistic_regression_model.pmml", with_repr=True)
    print("Logistic Regression Model converted to PMML")

    # Random Forest PMML conversion
    rf_pipeline = PMMLPipeline([("classifier", random_forest_model)])  # Creating a pipeline for Random Forest
    sklearn2pmml(rf_pipeline, "random_forest_model.pmml", with_repr=True)
    print("Random Forest Model converted to PMML")

# Call the functions to convert the models
convert_sklearn_models_to_pmml()


Logistic Regression Model converted to PMML
Random Forest Model converted to PMML
