In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Fetch the NLTK resources this module relies on: tokenizer models for
# word_tokenize and the stopword lists. NLTK >= 3.8.2 loads the 'punkt_tab'
# resource instead of the pickled 'punkt' models, so both are requested to
# keep word_tokenize working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

class FoulLanguageDetector:
    """Foul-language text classifier built on TF-IDF and logistic regression.

    Intended call order: ``prepare_data`` -> ``train`` -> ``evaluate`` and/or
    ``predict``. ``evaluate`` and ``predict`` reuse the vectorizer and model
    fitted by ``train``.
    """

    # Compiled once at class creation rather than on every preprocess call.
    # The URL pattern needs no separate "https" alternative: the "http"
    # alternative already matches every string starting with "https".
    _URL_RE = re.compile(r'http\S+|www\S+')
    _MENTION_RE = re.compile(r'@\w+')
    _HASHTAG_RE = re.compile(r'#\w+')
    _NON_ALPHA_RE = re.compile(r'[^a-zA-Z\s]')

    def __init__(self):
        """Create the unfitted vectorizer/model pair and load English stopwords."""
        self.vectorizer = TfidfVectorizer(max_features=5000)
        self.model = LogisticRegression(random_state=42)
        self.stop_words = set(stopwords.words('english'))

    @classmethod
    def _strip_noise(cls, text) -> str:
        """Lowercase *text* and delete URLs, @mentions, #hashtags and non-letters."""
        cleaned = str(text).lower()
        # Order matters: URLs, mentions and hashtags must go before the
        # non-letter pass strips the ':', '/', '@' and '#' that identify them.
        for pattern in (cls._URL_RE, cls._MENTION_RE, cls._HASHTAG_RE,
                        cls._NON_ALPHA_RE):
            cleaned = pattern.sub('', cleaned)
        return cleaned

    def preprocess_text(self, text) -> str:
        """Normalize raw text into a space-joined string of non-stopword tokens.

        Args:
            text: Input text; any object is accepted and converted with ``str``.

        Returns:
            The lowercased text with URLs, mentions, hashtags, digits and
            punctuation removed, tokenized with ``word_tokenize`` and with
            English stopwords filtered out.
        """
        tokens = word_tokenize(self._strip_noise(text))
        kept = [token for token in tokens if token not in self.stop_words]
        return ' '.join(kept)

    def prepare_data(self, data_path):
        """Load a labelled CSV, preprocess its text and split it 80/20.

        Args:
            data_path: Path (or file-like object) passed to ``pd.read_csv``.
                The file must provide 'text' and 'label' columns.

        Returns:
            ``(X_train, X_test, y_train, y_test)`` as produced by
            ``train_test_split`` with ``test_size=0.2`` and ``random_state=42``;
            the X parts hold the preprocessed text.

        Raises:
            ValueError: If the 'text' or 'label' column is missing.
        """
        df = pd.read_csv(data_path)

        if 'text' not in df.columns or 'label' not in df.columns:
            raise ValueError("DataFrame must contain 'text' and 'label' columns")

        # Rows with a missing text would otherwise be trained on as the literal
        # word "nan" (str(NaN)), and a missing label makes model fitting fail.
        df = df.dropna(subset=['text', 'label'])

        processed = df['text'].apply(self.preprocess_text).rename('processed_text')

        X_train, X_test, y_train, y_test = train_test_split(
            processed,
            df['label'],
            test_size=0.2,
            random_state=42,
        )
        return X_train, X_test, y_train, y_test

    def train(self, X_train, y_train):
        """Fit the TF-IDF vocabulary on *X_train* and train the classifier.

        Args:
            X_train: Iterable of preprocessed text documents.
            y_train: Labels aligned with *X_train*.
        """
        X_train_vectorized = self.vectorizer.fit_transform(X_train)
        self.model.fit(X_train_vectorized, y_train)

    def evaluate(self, X_test, y_test):
        """Print a classification report and confusion matrix for held-out data.

        Args:
            X_test: Iterable of preprocessed text documents.
            y_test: True labels aligned with *X_test*.
        """
        # transform (not fit_transform): reuse the vocabulary learned in train().
        X_test_vectorized = self.vectorizer.transform(X_test)
        y_pred = self.model.predict(X_test_vectorized)

        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))

        print("\nConfusion Matrix:")
        print(confusion_matrix(y_test, y_pred))

    def predict(self, text):
        """Classify a single raw text.

        Args:
            text: Raw, unprocessed text; it is run through ``preprocess_text``.

        Returns:
            A dict with 'is_foul' (truthiness of the predicted label) and
            'confidence' (the highest class probability, as a float).
        """
        processed_text = self.preprocess_text(text)
        text_vectorized = self.vectorizer.transform([processed_text])

        prediction = self.model.predict(text_vectorized)
        probability = self.model.predict_proba(text_vectorized)

        return {
            # NOTE(review): assumes numeric/boolean labels where 0 means "not
            # foul"; non-empty string labels would always be truthy — confirm
            # against the training data.
            'is_foul': bool(prediction[0]),
            'confidence': float(max(probability[0])),
        }

# Example usage
def main():
    """Run the demo pipeline: load 'twitter_data.csv', train, evaluate, predict.

    Any exception raised by the pipeline steps is caught and reported as a
    single printed message.
    """
    detector = FoulLanguageDetector()

    try:
        train_texts, test_texts, train_labels, test_labels = detector.prepare_data(
            'twitter_data.csv'
        )

        # Fit on the training split, then report metrics on the held-out split.
        detector.train(train_texts, train_labels)
        detector.evaluate(test_texts, test_labels)

        # Classify one hand-written example to show the prediction format.
        outcome = detector.predict("This is a sample tweet to test the model")
        print("\nPrediction for sample text:")
        print(f"Is foul: {outcome['is_foul']}")
        print(f"Confidence: {outcome['confidence']:.2f}")
    except Exception as err:
        # Script-level boundary: show a short message rather than a traceback.
        print(f"An error occurred: {str(err)}")

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

ModuleNotFoundError: No module named 'nltk'