**DATA PROCESSING**

In [7]:
import pandas as pd
import re
import random 
import uuid
from langdetect import detect, DetectorFactory

DetectorFactory.seed = 0

# ✅ 1. قراءة ملف txt → كل سطر يحتوي على: text, label
with open("raw_data.txt", encoding="utf-16") as file:
    lines = file.readlines()
lines = lines[1:]


# تحويل إلى list of dicts
data = []
for line in lines:
    if ',' in line:
        text_part, label_part = line.strip().split(',', 1)
        text = text_part.strip()
        label = label_part.strip()
        data.append({'text': text, 'label': int(label)})

df_raw = pd.DataFrame(data)

# ✅ 2. دالة تقسيم لو فيه أكتر من شكوى داخل نفس الـ text باستخدام أرقام أو علامات واضحة
def split_complaints(text):
    parts = re.split(r'\s*\d+\s*[-.).:]\s*', text)
    return [p.strip() for p in parts if p.strip()]

label_to_category = {
    0: "Internet",
    1: "Payment",
    2: "Customer Support",
    3: "Cancellation"
}

records = []
for _, row in df_raw.iterrows():
    label = row['label']
    text = row['text']
    complaints = split_complaints(text)
    for complaint in complaints:
        records.append({'label': label, 'text': complaint})

df = pd.DataFrame(records)

# ✅ 3. تجهيز الأعمدة الاحترافية
def generate_subject(text):
    return ' '.join(text.split()[:6]) + "..."

def detect_language(text):
    try:
        return detect(text)
    except:
        return "unknown"

df["ticket_id"] = ["TK-" + str(i+1).zfill(5) for i in range(len(df))]
df["subject"] = df["text"].apply(generate_subject)
df["category"] = df["label"].map(label_to_category)
df["language"] = df["text"].apply(detect_language)
df["user_id"] = ["USR-" + str(i+1).zfill(5) for i in range(len(df))]
df["user_email"] = ["user" + str(i+1) + "@example.com" for i in range(len(df))]
df["password"] = ""

final_df = df[["ticket_id", "subject", "text", "category", "language", "user_id", "user_email", "password"]]

# ✅ 4. تصدير الـ CSV النهائي
final_df.to_csv("support_tickets_cleaned.csv", index=False, encoding="utf-8-sig")

print("✅ تم تجهيز الملف → support_tickets_cleaned.csv")


ValueError: invalid literal for int() with base 10: 'but still no internet service.,0'

In [10]:
import pandas as pd
import csv
import re
from langdetect import detect, DetectorFactory

DetectorFactory.seed = 0  # لتثبيت نتائج كشف اللغة

# ===============================
# إعداد التصنيفات النهائية
label_to_category = {
    0: "Internet",
    1: "Payment",
    2: "Customer Support",
    3: "Cancellation"
}

# ===============================
# دالة تقسيم الشكوى لو فيها أرقام → ممكن تلغيه لو متأكدة أن كل سطر فيه شكوى واحدة
def split_complaints(text):
    parts = re.split(r'\s*\d+\s*[-.).:]\s*', text)
    return [p.strip() for p in parts if p.strip()]

# ===============================
# قراءة ملف الـ txt بشكل احترافي باستخدام csv.reader
data = []
with open("raw_data.txt", encoding="utf-16") as file:
    reader = csv.reader(file, delimiter=',', quotechar='"')
    next(reader)  # تجاهل الـ header
    
    for row in reader:
        if len(row) >= 2:
            text = ','.join(row[:-1]).strip()  # النص الكامل ← لو فيه فواصل داخل النص
            label = row[-1].strip()
            try:
                label = int(label)
                complaints = split_complaints(text)
                for complaint in complaints:
                    data.append({'text': complaint, 'label': label})
            except ValueError:
                print(f"❗ مشكلة في السطر → {row}")

df = pd.DataFrame(data)

# ===============================
# تجهيز الأعمدة الإضافية
def generate_subject(text):
    return ' '.join(text.split()[:6]) + "..."

def detect_language(text):
    try:
        return detect(text)
    except:
        return "unknown"

df["ticket_id"] = ["TK-" + str(i+1).zfill(5) for i in range(len(df))]
df["subject"] = df["text"].apply(generate_subject)
df["category"] = df["label"].map(label_to_category)
df["language"] = df["text"].apply(detect_language)
df["user_id"] = ["USR-" + str(i+1).zfill(5) for i in range(len(df))]
df["user_email"] = ["user" + str(i+1) + "@example.com" for i in range(len(df))]
df["password"] = ""

final_df = df[["ticket_id", "subject", "text", "category", "language", "user_id", "user_email", "password"]]

# ===============================
# تصدير الملف النهائي
final_df.to_csv("support_tickets_cleaned.csv", index=False, encoding="utf-8-sig")

print("✅ تم تجهيز الملف بشكل كامل → support_tickets_cleaned.csv")


✅ تم تجهيز الملف بشكل كامل → support_tickets_cleaned.csv
