<a href="https://colab.research.google.com/github/VisIonBlurred/profanity_check/blob/main/Profanity_Checker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report

In [6]:
# Fetch the NLTK resources this notebook relies on: the punkt tokenizer data,
# the stop-word lists (used through `stopwords.words`), and WordNet (used
# through `WordNetLemmatizer`).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [7]:
# Additional tokenizer resource.
# NOTE(review): presumably needed by `word_tokenize` on newer NLTK releases —
# confirm against the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [8]:
# Load the labelled corpus and drop rows containing missing values.
# The path is relative to the working directory, so the CSV must be present
# there (e.g. uploaded to the runtime) before this cell is run.
DATA_PATH = "English_profanity_words.csv"

# Columns the later cells read from the frame.
REQUIRED_COLUMNS = ("text", "is_offensive")

# Chained instead of `inplace=True` so re-running the cell always rebuilds
# `dataset` from the file rather than mutating an existing frame.
dataset = pd.read_csv(DATA_PATH).dropna()

# Fail early, with a clear message, if the file does not have the expected schema.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in dataset.columns]
if missing_columns:
    raise KeyError(f"{DATA_PATH} is missing required column(s): {missing_columns}")

FileNotFoundError: [Errno 2] No such file or directory: 'English_profanity_words.csv'

In [None]:
# Lazily populated cache of the English stop-word set, so the stop-word list
# is fetched once per kernel session instead of once per token.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the English stop words as a frozenset, loading them on first use."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """Normalise a raw string into a space-joined sequence of lemmatised tokens.

    Steps, in order:
      1. lower-case the text;
      2. delete every character that is not an ASCII letter or whitespace;
      3. tokenise with ``word_tokenize``;
      4. drop English stop words;
      5. lemmatise each remaining token with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ("" if none survive).
    """
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)

    # Set membership against a cached collection replaces re-evaluating
    # `stopwords.words("english")` for every single token.
    stop_words = _english_stopwords()
    lemmatizer = WordNetLemmatizer()

    tokens = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    ]
    return " ".join(tokens)

In [None]:
# Derive the `clean_text` column by running every raw `text` entry through
# `preprocess_text`.
dataset = dataset.assign(clean_text=dataset['text'].apply(preprocess_text))

In [None]:
# Target labels come straight from the `is_offensive` column.
y = dataset['is_offensive']

# TF-IDF features over 1- and 2-grams of the cleaned text, with
# `max_features` limiting the vocabulary to 10000 entries.
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X = vectorizer.fit_transform(dataset['clean_text'])

In [None]:
# Oversample with SMOTE. The sampler is seeded so the synthetic samples — and
# therefore every downstream score — are the same on each Restart & Run All.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [None]:
# Split the ORIGINAL (non-resampled) data first, keeping the class ratio in
# both parts via `stratify`. Splitting after oversampling would place synthetic
# points interpolated from training rows into the test set and inflate the
# reported metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Oversample the training portion only; the test set keeps real samples at
# their natural class balance.
X_train, y_train = smote.fit_resample(X_train, y_train)


In [None]:
# Single-step pipeline: a linear SVM wrapped in CalibratedClassifierCV.
# The step name 'classifier' is what the hyper-parameter grid keys refer to.
# LinearSVC is seeded so repeated runs fit the same model.
pipeline = Pipeline([
    ('classifier', CalibratedClassifierCV(LinearSVC(random_state=42)))
])

In [None]:
# Candidate hyper-parameters, addressed through the pipeline's 'classifier' step:
#   * `estimator__C` – C values for the wrapped estimator (this key was changed
#     from the earlier 'classifier__base_estimator__C' spelling);
#   * `cv`           – values for the classifier step's own `cv` setting.
param_grid = dict(
    classifier__estimator__C=[0.1, 1, 10],
    classifier__cv=[3, 5],
)

In [None]:
# Grid search over `param_grid`, scored with F1 using 3-fold cross-validation;
# verbose=2 requests progress logging during the fit.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    verbose=2,
)
grid_search.fit(X_train, y_train)

In [None]:
# Take the best estimator found by the search and score it on the held-out split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Text report of the test-set predictions against the true labels.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
def predict_custom_text(text, model, vectorizer):
    """Classify one raw string as "Offensive" or "Not Offensive".

    The text is cleaned with the notebook-level ``preprocess_text`` (the same
    function used to build the training features) rather than a private copy,
    so training-time and inference-time preprocessing cannot diverge.

    Parameters
    ----------
    text : str
        Raw input text.
    model : object
        Fitted classifier exposing ``predict``; a predicted label equal to 1
        is reported as offensive.
    vectorizer : object
        Fitted vectorizer exposing ``transform``; it receives a one-element
        list containing the cleaned text.

    Returns
    -------
    str
        "Offensive" if the first prediction equals 1, otherwise "Not Offensive".
    """
    processed_text = preprocess_text(text)

    # `transform` expects an iterable of documents, hence the one-element list.
    text_features = vectorizer.transform([processed_text])

    prediction = model.predict(text_features)

    return "Offensive" if prediction[0] == 1 else "Not Offensive"




In [None]:
# Try the tuned model on a sample sentence.
predict_custom_text(
    text="going on for way too long",
    model=best_model,
    vectorizer=vectorizer,
)