# In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import matplotlib.pyplot as plt
import string
import seaborn as sns

# Download the NLTK resources used for tokenization, stop-word filtering and
# lemmatization. Recent NLTK releases look up 'punkt_tab' for word_tokenize
# while older ones use 'punkt', so both are requested.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Load datasets
data_mental = pd.read_csv('Mental_Health_FAQ.csv')
data_alz = pd.read_csv('full_Chat_data.csv')

# Combine datasets and drop only the rows that have no question text.
# A bare dropna() would also discard every row coming from a file that lacks
# a column the other file has, because concat fills such columns with NaN.
# 'Questions' is the only column the rest of this script reads.
data = pd.concat([data_alz, data_mental], ignore_index=True).dropna(subset=['Questions'])

# Clean and preprocess: lowercase, then delete every ASCII punctuation
# character. A translation table is used rather than the regex class
# f"[{string.punctuation}]": inside that class the backslash escapes the
# following ']' and is therefore never removed itself.
punctuation_table = str.maketrans('', '', string.punctuation)
data['Questions'] = data['Questions'].str.lower().str.translate(punctuation_table)

# NOTE(review): the label column is a copy of the cleaned question text, so
# almost every sample ends up being its own class. Presumably this makes the
# 5-fold grid search and the test metrics meaningless (or makes the stratified
# split fail outright) -- confirm and substitute real intent labels.
data['Intent'] = data['Questions']  # Replace with actual intent labels if available

# Split data (fixed seed so the 80/20 split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    data['Questions'], data['Intent'], test_size=0.2, random_state=42
)

# Preprocessing function
def preprocess_text(text):
    """Normalize one document: tokenize, lowercase, drop stop words, lemmatize.

    Only purely alphabetic tokens are kept. Stop words are removed before
    lemmatization as well as after it, because the WordNet lemmatizer turns
    some stop words into non-words (e.g. "was" -> "wa", "has" -> "ha") that
    would otherwise slip past a filter applied only to the lemmas.

    Args:
        text: Raw document. Non-string values (e.g. None or NaN) and blank
            strings are treated as empty documents.

    Returns:
        The surviving lemmas joined by single spaces, or "" if none survive.
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    lemmatizer, stop_words = _text_resources()
    lemmas = []
    for token in nltk.word_tokenize(text):
        if not token.isalpha():
            continue
        word = token.lower()
        if word in stop_words:
            continue
        lemma = lemmatizer.lemmatize(word)
        # A lemma can itself be a stop word, so check again after lemmatizing.
        if lemma not in stop_words:
            lemmas.append(lemma)
    return " ".join(lemmas)


# Cache for the NLTK objects shared by every preprocess_text call.
_TEXT_RESOURCES = {}


def _text_resources():
    """Return the shared (lemmatizer, stop-word set), building them on first use."""
    # Building these per document would re-read the stop-word corpus each time.
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _TEXT_RESOURCES['lemmatizer'], _TEXT_RESOURCES['stop_words']

# Vectorization function
def vectorize_text(X_train, X_test, vectorizer_type='tfidf'):
    """Preprocess both splits and convert them into feature matrices.

    The vectorizer is fitted on the training documents only and then applied
    to the test documents.

    Args:
        X_train: Iterable of raw training documents.
        X_test: Iterable of raw test documents.
        vectorizer_type: ``'tfidf'`` selects ``TfidfVectorizer``; any other
            value selects ``CountVectorizer``.

    Returns:
        Tuple of (training matrix, test matrix, fitted vectorizer).
    """
    if vectorizer_type == 'tfidf':
        vectorizer = TfidfVectorizer()
    else:
        vectorizer = CountVectorizer()

    train_docs = list(map(preprocess_text, X_train))
    test_docs = list(map(preprocess_text, X_test))

    train_matrix = vectorizer.fit_transform(train_docs)
    test_matrix = vectorizer.transform(test_docs)
    return train_matrix, test_matrix, vectorizer

# Model training function
def train_model(X_train_vec, y_train, model_type='svm'):
    """Tune a classifier with a 5-fold, accuracy-scored grid search.

    Args:
        X_train_vec: Vectorized training documents.
        y_train: Training labels.
        model_type: ``'svm'`` tunes ``C`` of a ``LinearSVC``; any other value
            tunes ``n_estimators`` of a ``RandomForestClassifier``.

    Returns:
        The best estimator found by the search. The winning parameters are
        printed as a side effect.
    """
    use_svm = model_type == 'svm'
    estimator = LinearSVC() if use_svm else RandomForestClassifier()
    search_space = {'C': [0.1, 1, 10]} if use_svm else {'n_estimators': [100, 150]}

    search = GridSearchCV(estimator, search_space, cv=5, scoring='accuracy')
    search.fit(X_train_vec, y_train)

    print("Best Parameters:", search.best_params_)
    return search.best_estimator_

# Evaluation function
def evaluate_model(model, X_test_vec, y_test):
    """Score a fitted model on the test split and print the results.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test_vec: Vectorized test documents.
        y_test: True labels for the test documents.

    Returns:
        The text of the classification report. The accuracy (two decimals)
        and the report are also printed.
    """
    predicted = model.predict(X_test_vec)
    report_text = classification_report(y_test, predicted)
    acc = accuracy_score(y_test, predicted)

    print(f"\nAccuracy: {acc:.2f}\n")
    print(report_text)
    return report_text

# Visualize metrics
def plot_metrics(report_text):
    """Bar-chart the macro and weighted averages of a classification report.

    Args:
        report_text: Plain-text report as returned by ``classification_report``,
            containing a ``macro avg`` and a ``weighted avg`` row.

    Raises:
        ValueError: If either summary row is missing or does not carry three
            numeric scores.
    """
    # Parse first so a malformed report fails before any figure is created.
    macro = _average_scores(report_text, 'macro avg')
    weighted = _average_scores(report_text, 'weighted avg')

    metrics = ['Precision', 'Recall', 'F1-Score']
    df = pd.DataFrame({'Metric': metrics, 'Macro Avg': macro, 'Weighted Avg': weighted})
    df.set_index('Metric', inplace=True)
    df.plot(kind='bar', figsize=(8, 5), colormap='Set2', title='Model Evaluation Metrics')
    plt.ylabel('Score')
    plt.ylim(0, 1)
    plt.grid(axis='y')
    plt.tight_layout()
    plt.show()


def _average_scores(report_text, label):
    """Return [precision, recall, f1] from the report row that starts with *label*."""
    # The summary rows sit at the bottom of the report, so search from the end.
    for line in reversed(report_text.splitlines()):
        row = line.strip()
        if not row.startswith(label):
            continue
        # Cut the label off before splitting: labels such as "macro avg"
        # contain a space, so indexing the split row by position would pick
        # up the word "avg" instead of the first score.
        fields = row[len(label):].split()
        if len(fields) < 3:
            raise ValueError(f"{label!r} row has fewer than three scores: {line!r}")
        return [float(value) for value in fields[:3]]
    raise ValueError(f"no {label!r} row found in classification report")

# Run pipeline: TF-IDF features -> grid-searched linear SVM -> report -> chart
X_train_vec, X_test_vec, vectorizer = vectorize_text(
    X_train, X_test, vectorizer_type='tfidf'
)
model = train_model(X_train_vec, y_train, model_type='svm')
report = evaluate_model(model, X_test_vec, y_test)
plot_metrics(report)
