In [None]:
import pandas as pd
import numpy as np
import requests
import os
import zipfile
from datasets import load_dataset
import json
from sklearn.model_selection import train_test_split
import re
import warnings
warnings.filterwarnings('ignore')

In [None]:
def _locate_twitter_csv(raw_dir, filename='twcs.csv'):
    """Return the path of ``filename`` under ``raw_dir``, or None if absent.

    The flat location ``raw_dir/filename`` is preferred; otherwise the
    directory tree is searched.
    """
    flat_path = f'{raw_dir}/{filename}'
    if os.path.exists(flat_path):
        return flat_path
    # NOTE(review): the archive may nest the CSV in a subfolder — TODO confirm
    # against the actual zip layout. Searching keeps both layouts working.
    for root, _dirs, files in os.walk(raw_dir):
        if filename in files:
            return os.path.join(root, filename)
    return None


def download_twitter_dataset():
    """Download (if needed) and load the "Customer Support on Twitter" CSV.

    The archive is fetched and unpacked into ``../data/raw`` only when
    ``twcs.csv`` is not already present there, so re-running the cell reuses
    the extracted copy instead of downloading again.

    Returns:
        pandas.DataFrame: the contents of ``twcs.csv``.

    Raises:
        requests.HTTPError: if the download answers with an error status.
        zipfile.BadZipFile: if the downloaded payload is not a zip archive.
        FileNotFoundError: if ``twcs.csv`` is missing after extraction.
    """
    twitter_url = "https://www.kaggle.com/api/v1/datasets/download/thoughtvector/customer-support-on-twitter"
    raw_dir = '../data/raw'
    zip_path = '../data/raw/twitter_data.zip'

    os.makedirs(raw_dir, exist_ok=True)

    # The cache check looks for the very file that is read below; checking a
    # different filename would force a fresh download on every run.
    csv_path = _locate_twitter_csv(raw_dir)
    if csv_path is None:
        # Stream to disk so the archive is never held fully in memory, and fail
        # loudly on HTTP errors instead of saving an error page as a "zip".
        with requests.get(twitter_url, stream=True, timeout=60) as response:
            response.raise_for_status()
            with open(zip_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    f.write(chunk)

        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(raw_dir)

        csv_path = _locate_twitter_csv(raw_dir)
        if csv_path is None:
            raise FileNotFoundError(
                f"twcs.csv was not found under {raw_dir} after extracting {zip_path}"
            )

    return pd.read_csv(csv_path)

def load_bitext_dataset():
    """Fetch the Bitext customer-support dataset and return its ``train`` split.

    Returns:
        pandas.DataFrame: a frame built from the ``train`` split.
    """
    bitext = load_dataset("bitext/Bitext-customer-support-llm-chatbot-training-dataset")
    train_split = bitext['train']
    return pd.DataFrame(train_split)

def create_synthetic_data():
    """Build a small table of hand-written tickets with randomly drawn labels.

    Each of the ten fixed texts gets a sequential ``SYNTH_NNN`` id, plus a
    category, a priority and an ``estimated_hours`` value (uniform between 0.5
    and 48.0) drawn from NumPy's global random state.

    Returns:
        pandas.DataFrame: columns ``ticket_id``, ``text``, ``category``,
        ``priority`` and ``estimated_hours``; one row per text.
    """
    label_pool = ['billing', 'technical', 'general_inquiry', 'complaint', 'compliment', 'account']
    urgency_pool = ['high', 'medium', 'low']

    sample_texts = [
        "My internet connection keeps dropping every few minutes",
        "I was charged twice for my monthly subscription",
        "How do I reset my password?",
        "Your service is terrible, I want a refund",
        "Great customer service, thank you for the help",
        "I need to update my billing address",
        "The app crashes when I try to log in",
        "Can you explain the new pricing structure?",
        "My account has been suspended without notice",
        "Unable to access my dashboard",
    ]

    # The dict literal evaluates its values top to bottom, so for every text
    # the draws happen in the order category -> priority -> hours.
    rows = [
        {
            'ticket_id': f'SYNTH_{position:03d}',
            'text': message,
            'category': np.random.choice(label_pool),
            'priority': np.random.choice(urgency_pool),
            'estimated_hours': np.random.uniform(0.5, 48.0),
        }
        for position, message in enumerate(sample_texts, start=1)
    ]

    return pd.DataFrame(rows)

# Seed NumPy's global RNG before any labels are drawn. The synthetic generator
# and the standardize_* helpers all pull categories, priorities and hour
# estimates from np.random, so without a seed the saved dataset would differ on
# every Restart & Run All.
np.random.seed(42)

# Load the three raw sources: Twitter support tweets, the Bitext dataset and a
# handful of hand-written synthetic tickets.
twitter_df = download_twitter_dataset()
bitext_df = load_bitext_dataset()
synthetic_df = create_synthetic_data()

In [None]:
def clean_text(text):
    """Normalise a raw message into lowercase alphanumeric words.

    Missing values (anything ``pd.isna`` reports as NA) become an empty
    string. Otherwise the value is converted to ``str``; URLs starting with
    ``http``, ``@mentions`` and ``#hashtags`` are dropped; every remaining
    character that is not an ASCII letter, digit or whitespace is removed;
    whitespace runs collapse to one space; and the result is stripped and
    lowercased.

    Args:
        text: the raw value to clean (any scalar).

    Returns:
        str: the cleaned text, possibly empty.
    """
    if pd.isna(text):
        return ""

    # Order matters: URLs, mentions and hashtags must go while their marker
    # characters (':', '/', '@', '#') are still present, i.e. before the
    # punctuation-stripping pass.
    substitutions = (
        (r'http\S+', ''),
        (r'@\w+', ''),
        (r'#\w+', ''),
        (r'[^a-zA-Z0-9\s]', ''),
        (r'\s+', ' '),
    )

    cleaned = str(text)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip().lower()

def standardize_twitter_data(df):
    """Convert raw tweets into the common ticket schema.

    Rows with a missing ``text`` are dropped, the text is passed through
    ``clean_text``, and rows whose cleaned text is 10 characters or shorter
    are discarded. Every surviving row gets a ``TW_<index>`` id, the fixed
    category ``general_inquiry``, and a priority and ``estimated_hours`` value
    (uniform between 1 and 24) drawn from NumPy's global random state.

    Args:
        df: frame with at least a ``text`` column.

    Returns:
        pandas.DataFrame: columns ``ticket_id``, ``text``, ``category``,
        ``priority`` and ``estimated_hours``.
    """
    output_columns = ['ticket_id', 'text', 'category', 'priority', 'estimated_hours']

    has_text = df['text'].notna()
    tweets = df.loc[has_text].copy()
    tweets['text'] = tweets['text'].apply(clean_text)

    long_enough = tweets['text'].str.len() > 10
    tweets = tweets[long_enough]
    n_rows = len(tweets)

    # Keyword arguments are evaluated left to right, so priorities are drawn
    # before the hour estimates.
    tickets = tweets.assign(
        ticket_id='TW_' + tweets.index.astype(str),
        category='general_inquiry',
        priority=np.random.choice(['high', 'medium', 'low'], size=n_rows),
        estimated_hours=np.random.uniform(1, 24, size=n_rows),
    )

    return tickets[output_columns]

def standardize_bitext_data(df):
    """Convert Bitext rows into the common ticket schema.

    The ``instruction`` column is cleaned with ``clean_text`` and rows whose
    cleaned text is 10 characters or shorter are dropped. Each remaining row
    gets a ``BT_<index>`` id. Its category is translated into the notebook's
    standard labels through ``category_mapping``, matched case-insensitively;
    unknown or missing categories become ``general_inquiry``. Priority and
    ``estimated_hours`` (uniform between 0.5 and 12) are drawn from NumPy's
    global random state.

    Args:
        df: frame with an ``instruction`` column and, optionally, a
            ``category`` column.

    Returns:
        pandas.DataFrame: columns ``ticket_id``, ``text``, ``category``,
        ``priority`` and ``estimated_hours``.
    """
    df_clean = df.copy()
    df_clean['text'] = df_clean['instruction'].apply(clean_text)
    # Explicit copy so the column assignments below act on an independent frame.
    df_clean = df_clean[df_clean['text'].str.len() > 10].copy()

    df_clean['ticket_id'] = 'BT_' + df_clean.index.astype(str)

    category_mapping = {
        'account': 'account',
        'billing': 'billing',
        'technical': 'technical',
        'general': 'general_inquiry'
    }
    default_category = 'general_inquiry'

    if 'category' in df_clean.columns:
        # Normalise case and whitespace before the lookup so values such as
        # "ACCOUNT" match the lowercase keys; anything without a mapping
        # (including NaN, which becomes the string "nan") takes the default.
        # NOTE(review): extend category_mapping if more upstream categories
        # should map to a specific standard label.
        normalized = df_clean['category'].astype(str).str.strip().str.lower()
        df_clean['category'] = normalized.map(category_mapping).fillna(default_category)
    else:
        df_clean['category'] = default_category

    df_clean['priority'] = np.random.choice(['high', 'medium', 'low'], size=len(df_clean))
    df_clean['estimated_hours'] = np.random.uniform(0.5, 12, size=len(df_clean))

    return df_clean[['ticket_id', 'text', 'category', 'priority', 'estimated_hours']]

# Number of leading rows taken from each external source.
TWITTER_SAMPLE_SIZE = 1000
BITEXT_SAMPLE_SIZE = 500

twitter_standardized = standardize_twitter_data(twitter_df.head(TWITTER_SAMPLE_SIZE))
bitext_standardized = standardize_bitext_data(bitext_df.head(BITEXT_SAMPLE_SIZE))

# The synthetic frame already has the target columns, but its text is still
# raw. Run it through the same cleaner as the other sources so all text is
# lowercase and punctuation-free and the text-based de-duplication below
# compares like with like.
synthetic_standardized = synthetic_df.copy()
synthetic_standardized['text'] = synthetic_standardized['text'].apply(clean_text)

combined_df = pd.concat([
    twitter_standardized,
    bitext_standardized,
    synthetic_standardized
], ignore_index=True)

# Keep the first occurrence of each distinct text and renumber the rows.
combined_df = combined_df.drop_duplicates(subset=['text']).reset_index(drop=True)

In [None]:
def _split_by_category(frame, test_size, random_state=42):
    """Split ``frame`` in two, stratifying on ``category`` when possible.

    A stratified split is attempted first. If scikit-learn rejects it with a
    ``ValueError`` (for example because a category has too few rows), a plain
    random split with the same ``test_size`` and ``random_state`` is used
    instead and a notice is printed. Errors unrelated to stratification
    surface again from the fallback call.
    """
    try:
        return train_test_split(
            frame,
            test_size=test_size,
            random_state=random_state,
            stratify=frame['category'],
        )
    except ValueError as err:
        # print() rather than warnings.warn(): the notebook silences warnings.
        print(f"Stratified split not possible ({err}); falling back to a random split.")
        return train_test_split(frame, test_size=test_size, random_state=random_state)


# 70% train, then the remaining 30% is halved into validation and test.
train_df, temp_df = _split_by_category(combined_df, test_size=0.3)
val_df, test_df = _split_by_category(temp_df, test_size=0.5)

os.makedirs('../data/processed', exist_ok=True)

train_df.to_csv('../data/processed/train_data.csv', index=False)
val_df.to_csv('../data/processed/val_data.csv', index=False)
test_df.to_csv('../data/processed/test_data.csv', index=False)
combined_df.to_csv('../data/processed/full_dataset.csv', index=False)

# Counts are converted to plain ints so json.dump never sees NumPy integers.
dataset_stats = {
    'total_samples': len(combined_df),
    'train_samples': len(train_df),
    'val_samples': len(val_df),
    'test_samples': len(test_df),
    'categories': {
        label: int(count)
        for label, count in combined_df['category'].value_counts().items()
    },
    'priority_distribution': {
        label: int(count)
        for label, count in combined_df['priority'].value_counts().items()
    }
}

with open('../data/processed/dataset_stats.json', 'w') as f:
    json.dump(dataset_stats, f, indent=2)