In [10]:
# For text preprocessing
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# For sentiment analysis with TextBlob
from textblob import TextBlob

# For Bag of Words and TF-IDF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# For Naive Bayes classification
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Optional - for VADER sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the NLTK resources used in this notebook. Each call is a no-op
# (reports "already up-to-date") when the package is already installed.
#   punkt / punkt_tab -> tokenizer models used by word_tokenize
#   stopwords         -> stop-word lists used during cleaning
#   vader_lexicon     -> lexicon for SentimentIntensityAnalyzer
#   wordnet           -> data for WordNetLemmatizer
# NOTE: recent NLTK releases load 'punkt_tab' instead of the pickled 'punkt'
# models in word_tokenize, so both are fetched to work across versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/sayo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/sayo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/sayo/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sayo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
# Load the text file located in the 'data' folder.
# NOTE(review): this is an absolute, machine-specific path; consider moving it
# to a configurable location so the notebook runs on other machines.
try:
    with open('/Users/sayo/personal_projects/Usafe_bot/data/hate_crime.txt', 'r', encoding='utf-8') as file:
        text = file.read()
except FileNotFoundError:
    print("The file 'hate_crime.txt' was not found in the 'data' folder. Please check the path.")
    # Re-raise so later cells never run with an undefined (or stale) `text`.
    raise
except (OSError, UnicodeDecodeError) as e:
    print("An error occurred:", e)
    # Same reasoning: stop here instead of silently continuing without data.
    raise
else:
    # Only reported when the read above actually succeeded.
    print("Text loaded successfully!")

Text loaded successfully!


In [12]:
# Shared cleaning resources: the English stop-word set and a WordNet lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Splitting on '== Heading ==' with a capture group interleaves the pieces as
# [preamble, heading_1, body_1, heading_2, body_2, ...]
sections = re.split(r'== (.*?) ==', text)

# Maps each section heading to its cleaned body text
processed_sections = {}

# Odd positions hold headings, the following even positions hold their bodies;
# the preamble at position 0 is not processed.
for raw_heading, content in zip(sections[1::2], sections[2::2]):
    heading = raw_heading.strip()

    # Tokenize, keep alphabetic tokens only, lowercase them,
    # drop stop words, then lemmatize what remains.
    tokens = word_tokenize(content)
    alpha_words = (token.lower() for token in tokens if token.isalpha())
    cleaned_tokens = [
        lemmatizer.lemmatize(token)
        for token in alpha_words
        if token not in stop_words
    ]

    # Store the cleaned body as one space-separated string
    processed_sections[heading] = ' '.join(cleaned_tokens)

# Preview every processed section (first 200 characters of its content)
for heading, content in processed_sections.items():
    print(f"Heading: {heading}")
    print(f"Content: {content[:200]}...")
    print("\n---\n")

Heading: History
Content: term hate crime came common usage united state often used retrospectively order describe event occurred prior era roman persecution christian nazi slaughter jew hate crime committed individual well go...

---

Heading: Psychological effects
Content: hate crime significant psychological consequence direct victim others group well moreover victim hate crime often experience sense victimization go beyond initial crime creating heightened sense vulne...

---

Heading: Motivation
Content: sociologist jack mcdevitt jack levin study motif hate crime found four motif reported accounted percent hate crime overall united state perpetrator engage hate crime excitement drama often greater pur...

---

Heading: Risk management for hate-crime offenders
Content: compared type offending relatively little research directed towards management offender however risk management offender important consideration forensic psychology public safety order decrease potent...

---

Headin