In [2]:
import pandas as pd
from sklearn.svm import SVC
import numpy as np
import string

In [3]:
# Read the training CSV and discard the 'id' column in one chained expression
df = pd.read_csv("train.csv").drop(columns='id')
# Zero-row preview: displays only the column headers
df.head(0)


Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate


In [4]:
# Deletion table built once: maps every character in string.punctuation to None
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def preprocess_text(text):
    """Lowercase ``text`` and strip ASCII punctuation from it.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        ``text`` converted to lowercase with every character listed in
        ``string.punctuation`` removed. All other characters (including
        whitespace and newlines) are kept unchanged.

    Raises
    ------
    AttributeError
        If ``text`` has no ``lower`` method (e.g. a float NaN).
    """
    # str.translate removes all punctuation in a single pass instead of
    # testing each character against string.punctuation in a Python loop.
    # Additional preprocessing steps can be added here
    return text.lower().translate(_PUNCTUATION_TABLE)

In [5]:
# Normalise every comment with preprocess_text (defined in an earlier cell)
df['comment_text'] = df['comment_text'].apply(preprocess_text)

# A row gets slang = 1 when at least one of the six label columns is truthy, else 0
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
has_any_label = df[label_columns].any(axis=1)
df['slang'] = has_any_label.astype(int)

# Features are the cleaned comments; the target is the binary 'slang' flag
X = df['comment_text']
y = df['slang']

# Show the first rows of both series as a quick sanity check
print("Preprocessed comment_text:")
print(X.head())
print("\nSlang labels:")
print(y.head())

Preprocessed comment_text:
0    explanation\nwhy the edits made under my usern...
1    daww he matches this background colour im seem...
2    hey man im really not trying to edit war its j...
3    \nmore\ni cant make any real suggestions on im...
4    you sir are my hero any chance you remember wh...
Name: comment_text, dtype: object

Slang labels:
0    0
1    0
2    0
3    0
4    0
Name: slang, dtype: int32


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
# Split the full feature/target series, holding out 40% of the rows for testing
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=1,
)
# Preview the first held-out comments
x_test.head()

24915     you are a fat geeky prick who has nothing to d...
75819     agent x2 basically thanks  with a little more ...
53891     why are my posts being deleted \n\ni have trie...
154159    \n\n controlled demolitions and common sense  ...
13040       i do not understand your reply  blaxthos  t  c 
Name: comment_text, dtype: object

In [7]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# Restrict the experiment to the first `sample_size` rows of X and y
sample_size = 20000  # Adjust sample size as needed
x_sample = X[:sample_size]
y_sample = y[:sample_size]

# Hold out 20% of the sampled rows for the final evaluation
x_train, x_test, y_train, y_test = train_test_split(
    x_sample,
    y_sample,
    test_size=0.2,
    random_state=42,
)

# Fit the TF-IDF vocabulary on the training split, then reuse it for the test split
tfidf_vectorizer = TfidfVectorizer()
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
x_test_tfidf = tfidf_vectorizer.transform(x_test)

# Encode the labels; the encoder is fitted on the training labels only
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Single-step pipeline; the step name "svm" is the prefix used by the grid keys below
svm_pipeline = Pipeline(steps=[("svm", SVC())])

# Candidate hyper-parameters: three values of C combined with two kernels
param_grid = {
    "svm__C": [0.1, 1, 10],
    "svm__kernel": ["linear", "rbf"],
}

# 3-fold cross-validated grid search, with n_jobs=-1 for parallel execution
grid_search = GridSearchCV(
    estimator=svm_pipeline,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
)
grid_search.fit(x_train_tfidf, y_train_encoded)

# Report the winning configuration and its cross-validation score
print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Score the search result on the held-out split (printed multiplied by 100)
test_score = grid_search.score(x_test_tfidf, y_test_encoded)
print("Test set score:", test_score * 100)




Best parameters found: {'svm__C': 1, 'svm__kernel': 'linear'}
Best cross-validation score: 0.9463125073926043
Test set score: 94.8


In [8]:
from sklearn.metrics import classification_report

# Predict labels for the held-out TF-IDF features with the fitted grid search
y_pred = grid_search.predict(x_test_tfidf)

# Original label value for each encoded class index
# (computed here but not passed to the report, which uses target_names=None)
encoded_class_ids = range(len(label_encoder.classes_))
class_names = label_encoder.inverse_transform(encoded_class_ids)

# Per-class precision / recall / F1 as a formatted text table
report = classification_report(
    y_test_encoded,
    y_pred,
    target_names=None,
)

print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      3583
           1       0.94      0.53      0.68       417

    accuracy                           0.95      4000
   macro avg       0.95      0.76      0.83      4000
weighted avg       0.95      0.95      0.94      4000



In [12]:
# Input-side preprocessing; relies on preprocess_text from an earlier cell
def preprocess_input(text):
    """Return ``text`` after passing it through ``preprocess_text``.

    Delegating to the same function that cleaned the training comments keeps
    user input in the same form as the training data.
    """
    return preprocess_text(text)

# Turn one raw comment into a feature row for the classifier
def vectorize_input(input_string, tfidf_vectorizer):
    """Preprocess ``input_string`` and transform it with ``tfidf_vectorizer``.

    Parameters
    ----------
    input_string : str
        Raw comment text.
    tfidf_vectorizer
        Object exposing ``transform``; called with a one-element list
        holding the preprocessed comment.

    Returns
    -------
    Whatever ``tfidf_vectorizer.transform`` returns for that single document.
    """
    cleaned = preprocess_input(input_string)
    # transform is given a list of documents, so wrap the single comment
    return tfidf_vectorizer.transform([cleaned])

# Ask the user for a comment to classify
input_string = input("Enter a comment: ")

# Convert the comment into the feature representation the model was trained on
input_vector = vectorize_input(input_string, tfidf_vectorizer)

# predict() returns one label per input row; only one row was supplied
predicted_label = grid_search.predict(input_vector)[0]

# A predicted label of 1 is reported as toxic
is_toxic = predicted_label == 1
if is_toxic:
    verdict = "Toxic"
else:
    verdict = "Not Toxic"

# Echo the input together with the verdict
print("Input string:", input_string)
print("Predicted toxicity:", verdict)


Input string: i hate nigger
Predicted toxicity: Toxic


In [15]:
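# Disabled baseline, kept for reference: an SVC with default settings, no grid search.
# NOTE: accuracy_score is not imported anywhere in this notebook; before
# uncommenting, add `from sklearn.metrics import accuracy_score`.

# svm_classifier = SVC()

# # Train the SVM classifier
# svm_classifier.fit(x_train_tfidf, y_train)

# # Make predictions on the test data
# y_pred = svm_classifier.predict(x_test_tfidf)

# # Calculate the accuracy
# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy:", accuracy)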