# Initial Steps
- Install Dependencies: 
- Make sure to install the necessary Python libraries:

In [1]:
pip install pandas scikit-learn tensorflow nltk joblib

Note: you may need to restart the kernel to use updated packages.


# Read CSV files

In [2]:
import pandas as pd

# Load the Jigsaw Toxic Comment Classification Dataset (Kaggle)
jigsaw_df = pd.read_csv('../jigsaw_toxic_comment_dataset_train.csv')

# Load the Hate Speech and Offensive Language Dataset
hate_speech_df = pd.read_csv('../hate_speech_and_offensive_language_labeled_data.csv')

print(jigsaw_df.head())
print(hate_speech_df.head())


                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  
   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      

# Combine the datasets
- Hate Speech Dataset: Label "hate speech" and "offensive language" as inappropriate
- Jigsaw Dataset: Create "Inappropriate" label based on toxic, severe_toxic, obscene, threat, insult, identity_hate


In [3]:
# Combine the datasets
# Jigsaw Dataset: Create "Inappropriate" label based on toxic, severe_toxic, obscene, threat, insult, identity_hate
jigsaw_df['inappropriate'] = jigsaw_df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].max(axis=1)
jigsaw_df['inappropriate'] = jigsaw_df['inappropriate'].apply(lambda x: 1 if x > 0 else 0)
jigsaw_df = jigsaw_df[['comment_text', 'inappropriate']]

# Hate Speech Dataset: Label "hate speech" and "offensive language" as inappropriate
hate_speech_df['inappropriate'] = hate_speech_df['class'].apply(lambda x: 1 if x == 0 or x == 1 else 0)
hate_speech_df = hate_speech_df[['tweet', 'inappropriate']].rename(columns={'tweet': 'comment_text'})

# Combine both datasets
combined_df = pd.concat([jigsaw_df, hate_speech_df], ignore_index=True)


In [4]:
print(combined_df.head())
# Assuming your DataFrame is called 'df' and the class label column is 'label'
inappropriate_rows = combined_df[combined_df['inappropriate'] == 1]

# Print the filtered rows
print(inappropriate_rows)


                                        comment_text  inappropriate
0  Explanation\nWhy the edits made under my usern...              0
1  D'aww! He matches this background colour I'm s...              0
2  Hey man, I'm really not trying to edit war. It...              0
3  "\nMore\nI can't make any real suggestions on ...              0
4  You, sir, are my hero. Any chance you remember...              0
                                             comment_text  inappropriate
6            COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK              1
12      Hey... what is it..\n@ | talk .\nWhat is it......              1
16      Bye! \n\nDon't look, come or think of comming ...              1
42      You are gay or antisemmitian? \n\nArchangel WH...              1
43               FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!              1
...                                                   ...            ...
184347                                 you're all niggers              1
184348  

# Text Cleaning

Text cleaning is crucial to remove unwanted characters, symbols, and patterns that may not contribute to the model's performance. This step also reduces noise in the data.

1. Lowercasing: Convert all text to lowercase to avoid treating the same words differently based on capitalization.
2. Remove Special Characters: Remove punctuation, special symbols, and numbers as they may not add value for the model.
3. Remove URLs and Emails: Links and emails are often not useful and should be removed.
4. Remove Stopwords: Stopwords (e.g., "the", "is", "in") are common words that can be removed as they don't carry significant meaning.
5. Expand Contractions: Convert contractions like "don't" to "do not" to standardize the language.

In [5]:
import re
import string
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Dictionary of common contractions
contractions = {
    "aren't": "are not", "can't": "cannot", "couldn't": "could not", "didn't": "did not", 
    "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", 
    "haven't": "have not", "he'd": "he would", "he'll": "he will", "he's": "he is", 
    "i'd": "I would", "i'll": "I will", "i'm": "I am", "i've": "I have", "isn't": "is not", 
    "it's": "it is", "let's": "let us", "mightn't": "might not", "mustn't": "must not", 
    "shan't": "shall not", "she'd": "she would", "she'll": "she will", "she's": "she is", 
    "shouldn't": "should not", "that's": "that is", "there's": "there is", "they'd": "they would", 
    "they'll": "they will", "they're": "they are", "they've": "they have", "we'd": "we would", 
    "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", 
    "what're": "what are", "what's": "what is", "what've": "what have", "where's": "where is", 
    "who'd": "who would", "who'll": "who will", "who're": "who are", "who's": "who is", 
    "who've": "who have", "won't": "will not", "wouldn't": "would not", "you'd": "you would", 
    "you'll": "you will", "you're": "you are", "you've": "you have"
}

# Function to expand contractions
def expand_contractions(text, contractions_dict):
    pattern = re.compile(r'\b(' + '|'.join(contractions_dict.keys()) + r')\b')
    return pattern.sub(lambda x: contractions_dict[x.group()], text)

def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Expand contractions
    text = expand_contractions(text, contractions)
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove email addresses
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b', '', text)
    
    # Remove emojis
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"  # other symbols
                           u"\U000024C2-\U0001F251" 
                           "]+", flags=re.UNICODE)
    
    # Remove special characters, numbers, and punctuation
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    
    return text

# Apply the cleaning function to the 'comment_text' column
combined_df['cleaned_comment'] = combined_df['comment_text'].apply(clean_text)

# View the cleaned data
print(combined_df[['comment_text', 'cleaned_comment']].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saiaravindpunnam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                        comment_text  \
0  Explanation\nWhy the edits made under my usern...   
1  D'aww! He matches this background colour I'm s...   
2  Hey man, I'm really not trying to edit war. It...   
3  "\nMore\nI can't make any real suggestions on ...   
4  You, sir, are my hero. Any chance you remember...   

                                     cleaned_comment  
0  explanation edits made username hardcore metal...  
1  daww matches background colour I seemingly stu...  
2  hey man I really trying edit war guy constantl...  
3  cannot make real suggestions improvement wonde...  
4                      sir hero chance remember page  


# Text Normalization and Lemmatization
- Normalization standardizes the text data by reducing words to their base forms. 
- Lemmatization ensures that words are reduced to their dictionary form (e.g., "running" becomes "run").
- This helps improve the model’s generalization.

In [6]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

# Apply lemmatization
combined_df['lemmatized_comment'] = combined_df['cleaned_comment'].apply(lemmatize_text)

# View the lemmatized data
print(combined_df[['cleaned_comment', 'lemmatized_comment', 'inappropriate']].head())


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/saiaravindpunnam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                     cleaned_comment  \
0  explanation edits made username hardcore metal...   
1  daww matches background colour I seemingly stu...   
2  hey man I really trying edit war guy constantl...   
3  cannot make real suggestions improvement wonde...   
4                      sir hero chance remember page   

                                  lemmatized_comment  inappropriate  
0  explanation edits made username hardcore metal...              0  
1  daww match background colour I seemingly stuck...              0  
2  hey man I really trying edit war guy constantl...              0  
3  cannot make real suggestion improvement wonder...              0  
4                      sir hero chance remember page              0  


# Text Vectorization (TF-IDF (basic) or Word Embeddings (More contextual :TODO ))
TF-IDF: Convert the text data into numerical form using TF-IDF (Term Frequency-Inverse Document Frequency), which gives more importance to rare words.
## Word Embeddings (Advanced Option): 
- Instead of TF-IDF, We can use pre-trained word embeddings like Word2Vec, GloVe, or BERT. For example, using pre-trained BERT embeddings will allow the model to capture deeper semantic relationships.
- For this, We can utilize Hugging Face's Transformers library to get the BERT embeddings for each comment.

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features=10000, stop_words='english')

# Fit and transform the lemmatized text
X_tfidf = tfidf.fit_transform(combined_df['lemmatized_comment'])

# Check the shape of the TF-IDF matrix
print(X_tfidf.shape)


(184354, 10000)


# Splitting the Dataset
- Split the data into training and validation sets.

In [8]:
from sklearn.model_selection import train_test_split

# Split data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_tfidf, combined_df['inappropriate'], test_size=0.2, random_state=2)

print(f"Training set shape: {X_train.shape}, Validation set shape: {X_val.shape}")


Training set shape: (147483, 10000), Validation set shape: (36871, 10000)


# Building a Simple Logistic Regression Model (Binary Classification Model)
- Here, we’ll build a simple logistic regression model to classify comments as toxic or not.

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Initialize the model
model = LogisticRegression(max_iter=1000, class_weight='balanced')

# Train the model
model.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = model.predict(X_val)

# Evaluate the model
print(classification_report(y_val, y_pred, target_names=['appropriate', 'inappropriate']))


               precision    recall  f1-score   support

  appropriate       0.97      0.96      0.97     29495
inappropriate       0.85      0.89      0.87      7376

     accuracy                           0.95     36871
    macro avg       0.91      0.93      0.92     36871
 weighted avg       0.95      0.95      0.95     36871



## Metrics Breakdown:

1. **Precision**:
   - **Appropriate**: 0.97
     - This means that out of all the content classified as "appropriate", 97% was correctly classified.
   - **Inappropriate**: 0.85
     - This means that out of all the content classified as "inappropriate", 85% was correctly classified.

2. **Recall**:
   - **Appropriate**: 0.96
     - This means that out of all the actual "appropriate" content, 96% was correctly identified by the model.
   - **Inappropriate**: 0.89
     - This means that out of all the actual "inappropriate" content, 89% was correctly identified by the model.

3. **F1-Score**:
   - **Appropriate**: 0.97
     - The harmonic mean of precision and recall for the "appropriate" class, indicating a balanced performance.
   - **Inappropriate**: 0.87
     - The harmonic mean of precision and recall for the "inappropriate" class, indicating a good balance between precision and recall.

4. **Support**:
   - **Appropriate**: 29,495
     - The number of actual "appropriate" samples in the dataset.
   - **Inappropriate**: 7,376
     - The number of actual "inappropriate" samples in the dataset.

5. **Accuracy**:
   - Overall accuracy: 0.95
     - The proportion of correctly classified samples (both "appropriate" and "inappropriate") out of the total samples.

6. **Macro Average**:
   - Average performance across all classes, treating each class equally:
     - Precision: 0.91
     - Recall: 0.93
     - F1-Score: 0.92

7. **Weighted Average**:
   - Average performance across all classes, weighted by the number of instances in each class:
     - Precision: 0.95
     - Recall: 0.95
     - F1-Score: 0.95

### Interpretation:

- **High Precision for "Appropriate"**: Indicates that the model is very good at identifying content as "appropriate" when it actually is.
- **Good Recall for "Inappropriate"**: Shows that the model is effective at identifying a large portion of the actual "inappropriate" content.
- **Balanced Performance**: The F1-scores suggest a good balance between precision and recall for both classes.

Overall, the model performs well in distinguishing between "appropriate" and "inappropriate" content, with particularly strong performance in identifying "appropriate" content and good performance in identifying "inappropriate" content.

## Grid Search
- **Grid Search** is a technique used for hyperparameter tuning in machine learning. 
- The goal is to find the best combination of hyperparameters for a given model to optimize its performance.
- Grid Search is a powerful tool for optimizing machine learning models by systematically evaluating hyperparameter combinations to find the best settings for your model.
- It enhances model performance and ensures more reliable results by using cross-validation to assess each set of hyperparameters.

### How Grid Search Works:

1. **Define Hyperparameter Space:**
   - Specify a set of hyperparameters we want to tune and the range of values for each hyperparameter. For example, We want to tune the regularization strength (`C`) and the `max_df` parameter in a TF-IDF vectorizer.

2. **Create a Grid:**
   - Grid Search creates a Cartesian product of the hyperparameter values specified. This means it evaluates all possible combinations of the given hyperparameter values.

3. **Cross-Validation:**
   - For each combination of hyperparameters, Grid Search performs cross-validation. This involves splitting the training data into multiple folds and training the model on different subsets while validating it on the remaining fold. This helps in assessing the performance of each hyperparameter combination more reliably.

4. **Evaluate Performance:**
   - The model is evaluated based on a specified scoring metric (e.g., accuracy, F1-score, etc.). Grid Search computes the average performance metric across all cross-validation folds for each hyperparameter combination.

5. **Select the Best Hyperparameters:**
   - The combination of hyperparameters that achieves the best performance (according to the chosen metric) is selected as the optimal set of hyperparameters.

6. **Train the Final Model:**
   - The model is retrained on the full training dataset using the best hyperparameters found through Grid Search.

### Benefits of Grid Search:

- **Systematic Search:**
  - It exhaustively searches through all possible hyperparameter combinations, ensuring that the best possible set of hyperparameters is found within the specified grid.

- **Cross-Validation:**
  - By using cross-validation, Grid Search provides a more reliable estimate of model performance and reduces the risk of overfitting.


In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import joblib

# Split data into train and validation sets
X_train_raw, X_val_raw, y_train_raw, y_val_raw = train_test_split(combined_df['lemmatized_comment'], combined_df['inappropriate'], test_size=0.2, random_state=2)

# Define the pipeline with TF-IDF vectorizer and Logistic Regression
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),  # TF-IDF vectorizer
    ('clf', LogisticRegression(class_weight='balanced'))  # Classifier
])

# Define the hyperparameters to tune
parameters = {
    'tfidf__max_df': [0.75, 0.85],
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'clf__C': [0.1, 1, 10],
}

# Initialize Grid Search with cross-validation
grid_search = GridSearchCV(pipeline, parameters, cv=5, scoring='f1', n_jobs=-1)

# Fit Grid Search to the data
grid_search.fit(X_train_raw, y_train_raw)

# Retrieve the best model and parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Print the best parameters
print("Best Parameters:", best_params)

# Save the best model
joblib.dump(best_model, 'best_model_grid_search.pkl')

# Make predictions and evaluate the best model
y_pred_raw = best_model.predict(X_val_raw)
print("Best Model Performance:")
print(classification_report(y_val_raw, y_pred_raw, target_names=['appropriate', 'inappropriate']))




Best Parameters: {'clf__C': 10, 'tfidf__max_df': 0.75, 'tfidf__ngram_range': (1, 2)}
Best Model Performance:
               precision    recall  f1-score   support

  appropriate       0.97      0.97      0.97     29495
inappropriate       0.88      0.89      0.89      7376

     accuracy                           0.95     36871
    macro avg       0.93      0.93      0.93     36871
 weighted avg       0.95      0.95      0.95     36871



In [14]:
import joblib

# Load the entire pipeline
best_model = joblib.load('best_model_grid_search.pkl')

print(best_model.named_steps['tfidf'].idf_)

# Assuming best_model is your fitted pipeline from GridSearchCV

# Define test texts
test_texts = ["This is fine", "You are a f*cking idiot", "I love this!", "You b*tch", "fucking bitch"]

# Make predictions using the best pipeline model
y_pred_test = best_model.predict(test_texts)

# Print predictions
for text, prediction in zip(test_texts, y_pred_test):
    print(f'Text: "{text}" - Prediction: {"inappropriate" if prediction == 1 else "appropriate"}')



[ 9.957036   12.20832779 12.20832779 ... 12.20832779 12.20832779
 12.20832779]
Text: "This is fine" - Prediction: appropriate
Text: "You are a f*cking idiot" - Prediction: inappropriate
Text: "I love this!" - Prediction: appropriate
Text: "You b*tch" - Prediction: appropriate
Text: "fucking bitch" - Prediction: inappropriate


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Assuming you have your classifier and test data ready
# Let's assume 'y_val' are the true labels and 'y_pred_proba' are predicted probabilities for class 1 (inappropriate class)

# Get predicted probabilities for the positive class (inappropriate content)
y_pred_proba = best_model.predict_proba(X_val_raw)[:, 1]

# Compute ROC curve and ROC area
fpr, tpr, thresholds = roc_curve(y_val_raw, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (AUC = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()


: 