<a href="https://colab.research.google.com/github/Shenoda7/Accordion/blob/main/NLPpractical.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import kagglehub
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Download dataset from Kaggle
path = kagglehub.dataset_download("fizzbuzz/cleaned-toxic-comments")

# Print the path where the dataset is saved
print("Path to dataset files:", path)

# Load the training and testing datasets
train_data = pd.read_csv(f"{path}/train_preprocessed.csv")
test_data = pd.read_csv(f"{path}/test_preprocessed.csv")

# Display dataset structure to ensure correct loading
print("Training Data Info:")
print(train_data.info())
print("Testing Data Info:")
print(test_data.info())

# Step 2: Data preprocessing
def preprocess_text(text):
    if isinstance(text, str):  # Ensure the text is a string
        text = text.lower()  # Convert to lowercase
        text = re.sub(r'\W', ' ', text)  # Remove special characters
        text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
        return text
    return ""  # Return empty string if text is not a string

# Apply preprocessing to the 'comment_text' column (which contains the text of the comments)
train_data['cleaned_comment'] = train_data['comment_text'].apply(preprocess_text)
test_data['cleaned_comment'] = test_data['comment_text'].apply(preprocess_text)

# Step 3: Define features and target for training and testing sets
X_train = train_data['cleaned_comment']
y_train = train_data['toxic']  # Assuming 'toxic' is the target column
X_test = test_data['cleaned_comment']
y_test = test_data['toxic']

# Handle missing values (NaN) in target columns
y_train = y_train.fillna(0)  # Fill NaN in training target with 0 (Non-toxic)
y_test = y_test.fillna(0)  # Fill NaN in testing target with 0 (Non-toxic)

# Step 4: Text vectorization (TF-IDF)
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Step 5: Model training
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Step 6: Model evaluation
y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Step 7: Test with new comments
def predict_toxicity(comment):
    cleaned_comment = preprocess_text(comment)
    vectorized_comment = tfidf.transform([cleaned_comment])
    prediction = model.predict(vectorized_comment)
    return "Toxic" if prediction[0] == 1 else "Non-Toxic"

# Test examples
example_comment_1 = "You are a wonderful pig!"
print(f"Comment: '{example_comment_1}' -> Prediction: {predict_toxicity(example_comment_1)}")

example_comment_2 = "You are so ass."
print(f"Comment: '{example_comment_2}' -> Prediction: {predict_toxicity(example_comment_2)}")


Path to dataset files: /root/.cache/kagglehub/datasets/fizzbuzz/cleaned-toxic-comments/versions/1
Training Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   comment_text   159571 non-null  object 
 1   id             159571 non-null  object 
 2   identity_hate  159571 non-null  float64
 3   insult         159571 non-null  float64
 4   obscene        159571 non-null  float64
 5   set            159571 non-null  object 
 6   severe_toxic   159571 non-null  float64
 7   threat         159571 non-null  float64
 8   toxic          159571 non-null  float64
 9   toxicity       159571 non-null  float64
dtypes: float64(7), object(3)
memory usage: 12.2+ MB
None
Testing Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153164 entries, 0 to 153163
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      0.82      0.90    153164
         1.0       0.00      0.00      0.00         0

    accuracy                           0.82    153164
   macro avg       0.50      0.41      0.45    153164
weighted avg       1.00      0.82      0.90    153164

Comment: 'You are a wonderful pig!' -> Prediction: Toxic
Comment: 'You are so ass.' -> Prediction: Toxic


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
