In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [2]:
import os

# Print the contents of the working directory so the relative paths used
# later in the notebook can be checked by eye.
current_directory = os.getcwd()
files = os.listdir(path=current_directory)
print(files)


['.ipynb_checkpoints', 'NLP', 'profanity_filter.ipynb', 'Profanity_filter_model.joblib', 'profanity_words']


In [3]:
# Load the labelled corpus; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='profanity_words/English_profanity_words.csv')

In [4]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,is_offensive,text
0,0,Then go to the village pump and suggest they c...
1,1,ANTI GREEK NATIONALIS -WIKIPEDIA \n\nHi Alexik...
2,1,Dis hoe wasnt dis violent on Lottery Ticket ðŸ˜‚ðŸ˜‚
3,0,It is better for Atabay not helping the banned...
4,0,"""is in CamelCase. """"SiCKO"""" is not CamelCase,..."


# Fetch the WordNet corpus; clean_text lemmatizes with WordNetLemmatizer.
nltk.download('wordnet')

In [5]:
 # Work on a 20,000-row subsample. The fixed seed makes the draw repeatable,
 # so a Restart & Run All trains and evaluates on the same rows every time.
 df = df.sample(n=20000, random_state=2)

In [6]:
# Summarise row count, dtypes and non-null counts of the subsample.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20000 entries, 145949 to 118262
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   is_offensive  20000 non-null  int64 
 1   text          20000 non-null  object
dtypes: int64(1), object(1)
memory usage: 468.8+ KB


Text Preprocessing
1. Remove special characters
2. Lowercasing
3. Tokenization
4. Remove stop words
5. Lemmatization

In [7]:
import nltk

# clean_text calls word_tokenize, which needs the Punkt tokenizer data, and
# WordNetLemmatizer, which needs WordNet. Fetch them here so the notebook runs
# on a machine that has never downloaded NLTK data.
# NOTE(review): the Punkt package id is 'punkt' on older NLTK releases and
# 'punkt_tab' on newer ones; both are requested -- confirm against the
# installed NLTK version.
for resource in ('punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource, quiet=True)

# Kept as the final expression so the cell still reports the stop-word
# download status.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
# Built once at definition time. Rebuilding the stop-word set and the
# lemmatizer inside the function repeats identical work for every row when
# clean_text is applied to a whole column.
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


def clean_text(text):
    """Normalise a raw string before vectorization.

    Steps, in order:
      1. delete every character that is not an ASCII letter or whitespace;
      2. lowercase;
      3. tokenize with ``word_tokenize``;
      4. drop English stop words;
      5. lemmatize each remaining token with ``WordNetLemmatizer``;
      6. join the tokens back together with single spaces.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The cleaned, space-separated tokens (empty string if nothing is left).
    """
    # Characters are deleted, not replaced by a space, so "don't" -> "dont".
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)

    tokens = word_tokenize(letters_only.lower())

    lemmas = [
        _LEMMATIZER.lemmatize(token)
        for token in tokens
        if token not in _STOP_WORDS
    ]

    # Join the tokens back into a single string.
    return ' '.join(lemmas)

In [9]:
# Cast every entry to str first so clean_text always receives a string, then
# normalise the column element by element.
df['text'] = df['text'].astype(str).map(clean_text)

In [10]:
# Convert the integer label column to booleans.
df['is_offensive'] = df['is_offensive'].astype('bool')

In [11]:
# Preview the frame after text cleaning and the boolean label cast.
df.head()

Unnamed: 0,is_offensive,text
145949,False,government openly poved guess could neutralize...
57024,False,child adult oriented think mention omsi childr...
120292,True,donnerkay surlyrevenant enraged lb mike brown ...
154052,True,lmfaoooooo im glad shorty aint twitter ig tryn...
85086,True,happy valentine day big booty bitch


Training the model

In [12]:
# Features are the cleaned text column; the target is the boolean label.
X, y = df['text'], df['is_offensive']

In [13]:
# Display the feature Series (cleaned text).
X

145949    government openly poved guess could neutralize...
57024     child adult oriented think mention omsi childr...
120292    donnerkay surlyrevenant enraged lb mike brown ...
154052    lmfaoooooo im glad shorty aint twitter ig tryn...
85086                   happy valentine day big booty bitch
                                ...                        
158636    well one came along defend sentence ill assume...
10111     reply posted clark peer review page think good...
35297                         ip note use endless ip number
45837     also im concerned block technical sense shorta...
118262    please add nonsense wikipedia considered vanda...
Name: text, Length: 20000, dtype: object

In [14]:
# Display the target Series (boolean offensive flag).
y


145949    False
57024     False
120292     True
154052     True
85086      True
          ...  
158636    False
10111     False
35297     False
45837     False
118262    False
Name: is_offensive, Length: 20000, dtype: bool

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [16]:
# Number of training examples.
X_train.shape

(16000,)

In [17]:
# Number of held-out examples.
X_test.shape


(4000,)

TF-IDF is used for vectorization.

In [18]:
# TF-IDF vectorizer with default settings; it is fitted on the training split
# in the next cell.
vectorizer = TfidfVectorizer()

In [19]:
# Fit the vocabulary and IDF weights on the training split only, then reuse
# them unchanged for the test split.
# The results are kept as sparse matrices: densifying 16000 x 44850 float64
# values needs roughly 5.7 GB for the training matrix alone, and the
# classifier trained below accepts sparse input directly.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [20]:
# (rows, vocabulary size) of the training matrix.
X_train_vectorized.shape

(16000, 44850)

In [21]:
# Same column count as the training matrix because the fitted vocabulary is reused.
X_test_vectorized.shape

(4000, 44850)

In [22]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

In [23]:
# Linear SVM with balanced class weights, solved in the primal (dual=False).
# It is wrapped in CalibratedClassifierCV, and the wrapper is the estimator
# that is fitted and used for every prediction below.
model = LinearSVC(
    class_weight="balanced",
    dual=False,
    tol=0.01,
    max_iter=100_000,
)
cclf = CalibratedClassifierCV(model)
cclf.fit(X_train_vectorized, y_train)


In [25]:
# Class predictions for the held-out split.
y_predict = cclf.predict(X_test_vectorized)

In [26]:
from sklearn.metrics import accuracy_score,confusion_matrix

# Store the score under its own name. Rebinding `accuracy_score` to the
# resulting float would shadow the imported function, and the cell would
# raise TypeError the second time it is run.
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

0.949


In [27]:
from joblib import dump

# The classifier only accepts vectors produced by the fitted TF-IDF
# vectorizer, so the vectorizer is saved next to it; the classifier file alone
# cannot be used for inference outside this kernel.
dump(vectorizer, 'Profanity_filter_vectorizer.joblib')

# Kept as the final expression so the cell output is unchanged.
dump(cclf, 'Profanity_filter_model.joblib')

['Profanity_filter_model.joblib']

In [28]:
from joblib import load

# Read the classifier back from the file written in the previous cell.
loaded_model = load(filename='Profanity_filter_model.joblib')

In [29]:
# Hand-written sentences used as a quick sanity check of the reloaded model.
text = ["Hey girl! If you donot complete a task by EOD I will kill you",
        "I already had my breakfast",
        "you are a dumbass",
        "She is such a bitch"]

In [30]:

# Apply the same clean_text preprocessing the training data went through
# before vectorizing; raw sentences would not match the text the vectorizer
# and classifier were fitted on.
text_vectorized = vectorizer.transform([clean_text(sentence) for sentence in text])

# `tx` holds only the predictions; the next cell displays it.
tx = loaded_model.predict(text_vectorized)


In [31]:
# Predicted labels, one per sentence in `text`.
tx

array([ True, False,  True,  True])