In [41]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# 1️⃣ Initialize NLTK Resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# 2️⃣ Load Data & Verify Structure
try:
    df = pd.read_csv("cleaned_chat.csv")  # Replace with your dataset path
    print("Columns found:", df.columns.tolist())
except FileNotFoundError:
    print("❌ Error: File not found. Check the file path.")
    exit()

# 3️⃣ Preprocessing Function (Define FIRST)
def preprocess_text(text):
    """Clean and lemmatize text data."""
    if not isinstance(text, str) or pd.isna(text):
        return "empty"
    
    text = text.lower()
    text = re.sub(r'\d+|[^\w\s]', '', text)  # Remove numbers & punctuation
    text = text.strip()
    
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    
    words = [
        lemmatizer.lemmatize(w) 
        for w in words 
        if w not in stop_words and len(w) > 2
    ]
    
    return " ".join(words) if words else "empty"

# 4️⃣ Apply Preprocessing Safely
if 'Message' not in df.columns:
    print("❌ Error: 'Message' column missing.")
    exit()

df['Message'] = df['Message'].fillna('').astype(str)  # Handle NaN/non-text
df['Cleaned_Message'] = df['Message'].apply(preprocess_text)

# 5️⃣ Labeling with Keyword Matching
flirty_keywords = ["love", "kiss", "miss you", "cute", "babe", "baby", "😍", "😘", "❤️"]
df['Label'] = df['Cleaned_Message'].apply(
    lambda x: 1 if any(word in x for word in flirty_keywords) else 0
)

# 6️⃣ Remove Empty Messages
df = df[df['Cleaned_Message'] != "empty"]

# 7️⃣ TF-IDF Vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Cleaned_Message'])
y = df['Label']

# 8️⃣ Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 9️⃣ Train Logistic Regression
model = LogisticRegression(max_iter=1000)  # Increase iterations for convergence
model.fit(X_train, y_train)

# 🔟 Evaluate Model
y_pred = model.predict(X_test)
print("\n✅ Model Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))

# 🎯 Prediction Function
def predict_flirty(text):
    """Predict if a new message is flirty."""
    processed_text = preprocess_text(text)
    if processed_text == "empty":
        return "Invalid input 🚫"
    vectorized_text = vectorizer.transform([processed_text])
    prediction = model.predict(vectorized_text)
    return "Flirty 💕" if prediction[0] == 1 else "Not Flirty 😐"

# Test
test_message = "Hey babe, I miss you 😘"
print("\n🧐 Prediction for:", test_message)
print("🔥 Result:", predict_flirty(test_message))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JOSE\anaconda3\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\JOSE\anaconda3\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\JOSE\anaconda3\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Columns found: ['DateTime', 'Sender', 'Message']

✅ Model Accuracy: 0.9975589910496339

📊 Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1226
           1       0.00      0.00      0.00         3

    accuracy                           1.00      1229
   macro avg       0.50      0.50      0.50      1229
weighted avg       1.00      1.00      1.00      1229


🧐 Prediction for: Hey babe, I miss you 😘
🔥 Result: Not Flirty 😐


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
