In [None]:

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
# Fetch the NLTK resources this notebook relies on:
#   - 'punkt' / 'punkt_tab': tokenizer models used by word_tokenize. Recent
#     NLTK releases load 'punkt_tab' instead of the older pickled 'punkt'
#     package, so both are requested to work across versions.
#   - 'vader_lexicon': lexicon used by SentimentIntensityAnalyzer.
#   - 'stopwords': word lists behind stopwords.words('english').
for resource in ('punkt', 'punkt_tab', 'vader_lexicon', 'stopwords'):
    nltk.download(resource)
    

In [None]:

# Read the hateXplain CSV into the working DataFrame used by every later cell
DATASET_PATH = '/mnt/data/hateXplain.csv'
df = pd.read_csv(DATASET_PATH)

# Preview the first rows (rendered as the cell output)
df.head()
    

In [None]:

# Step 1: Data Cleaning
# Report the number of missing values per column. The counts are printed
# explicitly: a bare expression is only rendered when it is the last statement
# of a cell, and here it is followed by the drop below.
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Drop every row that has a missing value in any column. `df` is rebound
# rather than mutated with inplace=True, so the step reads as a plain
# input -> output transformation.
df = df.dropna()
    

In [None]:

# Step 2: Basic Text Preprocessing
# Lowercasing, removing special characters, tokenization
# Cache for the English stopword set; filled on the first preprocess_text call.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """Normalize a comment for word-level analysis.

    Steps, in order: lowercase the text, drop every character that is neither
    alphanumeric nor whitespace, tokenize with ``word_tokenize``, and remove
    English stopwords.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (an empty string when no
        token survives).
    """
    text = text.lower()
    text = ''.join(c for c in text if c.isalnum() or c.isspace())
    # The stopword list is fetched once and kept as a set: calling
    # stopwords.words('english') inside the filter would rebuild the list for
    # every single token.
    stop_words = _english_stopwords()
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(tokens)

# Run the preprocessing over the 'comment' column; the cleaned text replaces
# the original values in that same column.
cleaned_comments = df['comment'].apply(preprocess_text)
df['comment'] = cleaned_comments
    

In [None]:

# Step 3: Sentiment Analysis using VADER
# Create the analyzer once at cell level; get_sentiment below reads this
# shared instance for every comment it scores.
sia = SentimentIntensityAnalyzer()

# Function to compute sentiment scores
def get_sentiment(text):
    """Return the result of ``sia.polarity_scores(text)`` unchanged.

    The cell below reads the 'neg', 'neu', 'pos' and 'compound' entries of the
    returned mapping.
    """
    return sia.polarity_scores(text)

# Score every comment and keep the full score dict in the 'sentiment' column.
# NOTE(review): df['comment'] has already been lowercased and stripped of
# punctuation and stopwords by the preprocessing step; VADER is documented to
# use capitalization, punctuation and negation words as cues, so these scores
# may differ from scores on the raw text -- confirm this is intended.
df['sentiment'] = df['comment'].apply(get_sentiment)

# Unpack each score component into its own 'sentiment_<component>' column.
for component in ('neg', 'neu', 'pos', 'compound'):
    df[f'sentiment_{component}'] = df['sentiment'].apply(
        lambda scores, key=component: scores[key]
    )
    

In [None]:

# Step 4: Visualization
# 4.1 Number of rows per value of the 'label' column
label_count_ax = sns.countplot(data=df, x='label')
label_count_ax.set_title('Frequency of Hate Types')
plt.show()
    

In [None]:

# 4.2 Word count per comment, compared across labels.
# Counts are whitespace-separated tokens of the already-preprocessed text, so
# removed stopwords are not included in the length.
word_counts = [len(comment.split()) for comment in df['comment']]
df['comment_length'] = word_counts

length_ax = sns.boxplot(data=df, x='label', y='comment_length')
length_ax.set_title('Text Length Distribution by Hate Type')
plt.show()
    

In [None]:

# 4.3 Most common words for each hate type
hate_types = df['label'].unique()

for hate_type in hate_types:
    # Comments belonging to the current label only
    hate_comments = df.loc[df['label'] == hate_type, 'comment']

    # max_features=10 keeps the ten most frequent terms of this subset
    vectorizer = CountVectorizer(stop_words='english', max_features=10)
    word_count = vectorizer.fit_transform(hate_comments)

    # Column sums of the document-term matrix give the total count per term
    word_freq = np.asarray(word_count.sum(axis=0)).ravel()
    words = vectorizer.get_feature_names_out()

    # The vectorizer returns its vocabulary in alphabetical order, so sort by
    # count to make the chart read from most to least frequent.
    word_freq_df = (
        pd.DataFrame({'word': words, 'count': word_freq})
        .sort_values('count', ascending=False)
    )

    fig, ax = plt.subplots(figsize=(8, 5))
    sns.barplot(x='count', y='word', data=word_freq_df, ax=ax)
    ax.set_title(f'Most Common Words in {hate_type} Comments')
    plt.show()
    

In [None]:

# 4.4 Distribution of the VADER compound score for each label
compound_ax = sns.boxplot(data=df, x='label', y='sentiment_compound')
compound_ax.set_title('Sentiment Compound Scores by Hate Type')
plt.show()
    