In [None]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Basic machine learning imports
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

# For handling imbalanced data
from imblearn.over_sampling import SMOTE

# For text processing
import re
import time


In [None]:
def load_and_explore_data(filepath="customer_support_tickets.csv"):
    """
    Load the support ticket CSV and print an initial exploration report.

    The report covers the row/column counts, memory usage, column names,
    the first three rows, missing values per column and the number of
    fully duplicated rows.

    Args:
        filepath: Path to the CSV file; it is read as UTF-8.

    Returns:
        The loaded DataFrame, or None when the file cannot be found or any
        other error occurs while loading/exploring (the error is printed
        instead of being raised).
    """
    print("\nSTEP 1: Loading and exploring the data")
    print("-" * 50)

    try:
        # Load the CSV file
        df = pd.read_csv(filepath, encoding='utf-8')
        total_rows = len(df)

        def percent_of_rows(count):
            # A header-only CSV has zero rows; report 0.0% instead of
            # dividing by zero (which would print "nan%" or raise).
            return count / total_rows * 100 if total_rows else 0.0

        print(f"Successfully loaded dataset with {total_rows:,} tickets")

        # Show basic information about the dataset
        print(f"Dataset shape: {df.shape[0]} rows x {df.shape[1]} columns")
        print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

        # Display column names
        print(f"\nColumns in dataset: {list(df.columns)}")

        # Show first few rows to understand the data structure
        print("\nFirst 3 rows of data:")
        print(df.head(3))

        # Check for missing values; only columns with gaps are listed
        print("\nMissing values per column:")
        missing_data = df.isnull().sum()
        for col, missing in missing_data.items():
            if missing > 0:
                print(f"  {col}: {missing} missing ({percent_of_rows(missing):.1f}%)")

        if missing_data.sum() == 0:
            print("  No missing values found - excellent data quality")

        # Check for duplicates (rows identical across every column)
        duplicates = df.duplicated().sum()
        print(f"\nDuplicate records: {duplicates} ({percent_of_rows(duplicates):.1f}%)")

        return df

    except FileNotFoundError:
        print(f"Error: Could not find file '{filepath}'")
        print("Please make sure the CSV file is in the same directory as this notebook")
        return None
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and let the
        # caller decide what to do with a None result.
        print(f"Error loading data: {str(e)}")
        return None

# Load the data from the default CSV path and print the exploration report.
# NOTE: df is None when loading failed (missing file or read error).
df = load_and_explore_data()