In [1]:
import pandas as pd
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
import joblib

In [5]:
#Reading the data
df = pd.read_csv('toxic_comments.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184354 entries, 0 to 184353
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   is_offensive  184354 non-null  int64 
 1   text          184350 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.8+ MB


In [62]:
#Checking for missing values
missing_in_offensive = df[df['is_offensive'].isna() & ~df['text'].isna()]
missing_in_text = df[~df['is_offensive'].isna() & df['text'].isna()]

print(f"Rows missing data in 'is_offensive' but have data in 'text':\n{missing_in_offensive}")
print(f"Rows missing data in 'text' but have data in 'is_offensive':\n{missing_in_text}")

Rows missing data in 'is_offensive' but have data in 'text':
Empty DataFrame
Columns: [is_offensive, text]
Index: []
Rows missing data in 'text' but have data in 'is_offensive':
        is_offensive text
102193             0  NaN
122526             0  NaN
154264             0  NaN
170295             0  NaN


In [63]:
#Droping rows with missing values
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 184350 entries, 0 to 184353
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   is_offensive  184350 non-null  int64 
 1   text          184350 non-null  object
dtypes: int64(1), object(1)
memory usage: 4.2+ MB


In [64]:
#Preprocessing the text
def preprocess_text(text):
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    stop_words = stopwords.words('english')
    review_without_stop_words = ' '.join([word for word in text.split() if word not in stop_words])
    
    stemmer = SnowballStemmer("english")
    review_stemmed = ' '.join([stemmer.stem(word) for word in review_without_stop_words.split()])
    return review_stemmed

In [65]:
df_list = []
for text in df['text']:
    df_list.append(text)

df_preprocessed = []
for text in df_list:
    df_preprocessed.append(preprocess_text(text))

  text = BeautifulSoup(text, 'html.parser').get_text()


In [66]:
#Creating a bag of words
vectorizer = CountVectorizer(max_features=10000)
#Fiting and transforming the data
feature = vectorizer.fit_transform(df_preprocessed).toarray()
print(feature.shape)

(184350, 10000)


In [67]:
#Spliting the data
X_train, X_test, y_train, y_test = train_test_split(feature, df['is_offensive'], test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

print("Train set:", X_train.shape, y_train.shape)
print("Validation set:", X_val.shape, y_val.shape)
print("Test set:", X_test.shape, y_test.shape)

Train set: (132732, 10000) (132732,)
Validation set: (14748, 10000) (14748,)
Test set: (36870, 10000) (36870,)


In [68]:
#Training the model
model = LinearSVC(dual=True, max_iter=10000)
model.fit(X_train, y_train)

validation_score = model.score(X_val, y_val)
print("Validation accuracy:", validation_score)



Validation accuracy: 0.949891510713317


In [69]:
#Testing the model
new_text = "chineese people are dirty and disgusting"
proccesed = preprocess_text(new_text)
new_features = vectorizer.transform([proccesed])
prediction = model.predict(new_features)
print(f"Probability of being profane: {prediction}")

Probability of being profane: [1]


In [70]:
# Method to classify sentences
def classify_sentences(sentences, vectorizer, model):
    text_preprocessed = []
    for text in sentences:
        text_preprocessed.append(preprocess_text(text))
    
    vectors = vectorizer.transform(text_preprocessed).toarray()
    predictions = model.predict(vectors)
    
    for sentence, prediction in zip(sentences, predictions):
        profanity = "Profanity detected" if prediction == 1 else "No profanity detected"
        print(f"Sentence: {sentence}")
        print(f"  - Predicted Profanity: {profanity} (Probability: {prediction})")

# Test Example sentences
input_sentences = [
    "Wow! What an incredible movie! The plot was engaging",
    "I absolutely loved this food!",
    "Great job on the project, everyone!",
    "Thank you for your hard work!",

    "I hate Europeans, they are idiots",
    "Americans are the worst, they are so stupid",
    "I hate white people, they are so dirty",
    "Are you mental insane or something?",
    "Go back to your country, mother fucker",
    "Americans are dirty and disgusting"
]

classify_sentences(input_sentences, vectorizer, model)

Sentence: Wow! What an incredible movie! The plot was engaging
  - Predicted Profanity: No profanity detected (Probability: 0)
Sentence: I absolutely loved this food!
  - Predicted Profanity: No profanity detected (Probability: 0)
Sentence: Great job on the project, everyone!
  - Predicted Profanity: No profanity detected (Probability: 0)
Sentence: Thank you for your hard work!
  - Predicted Profanity: No profanity detected (Probability: 0)
Sentence: I hate Europeans, they are idiots
  - Predicted Profanity: Profanity detected (Probability: 1)
Sentence: Americans are the worst, they are so stupid
  - Predicted Profanity: Profanity detected (Probability: 1)
Sentence: I hate white people, they are so dirty
  - Predicted Profanity: Profanity detected (Probability: 1)
Sentence: Are you mental insane or something?
  - Predicted Profanity: Profanity detected (Probability: 1)
Sentence: Go back to your country, mother fucker
  - Predicted Profanity: Profanity detected (Probability: 1)
Sentence

In [71]:
# Saving model to joblib file
model_folder = '../src/mlservice/service/utils'
joblib.dump(model, f'{model_folder}/linear_svc_model.joblib')
joblib.dump(vectorizer, f'{model_folder}/count_vectorizer.joblib')


['../../src/mlservice/service/ml_models/count_vectorizer.joblib']

In [72]:
# Testing joblib file
model = joblib.load(f'{model_folder}/linear_svc_model.joblib')
vectorizer = joblib.load(f'{model_folder}/count_vectorizer.joblib')
new_text = "chineese people are dirty and disgusting"
prep_text = preprocess_text(new_text)
new_feat = vectorizer.transform([prep_text])
prediction = model.predict(new_feat)
print("Prediction:", prediction)

Prediction: [1]
