In [None]:
# Install the third-party packages this notebook relies on. %pip (rather than
# !pip) targets the environment of the running kernel.
# NOTE(review): versions are unpinned, and wordcloud is not imported in any of
# the cells visible here -- confirm it is still needed.
%pip install nltk pandas matplotlib seaborn wordcloud  
# NOTE(review): the shebang below is inert inside a notebook cell; it is only a comment here.
#!/usr/bin/env python3
  

In [None]:
# importing relevant Libraries

from distutils.command.install import install
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import pip
import seaborn as sns 
import os 
import re 
import nltk


In [None]:
# Read the raw CSV straight from the project repository into a DataFrame.

url = (
    "https://raw.githubusercontent.com/ThriveInternship/Thrive_Internship_ML_A/refs/heads/group-d/customer_support_tickets_dirty.csv"
)
df = pd.read_csv(url)

print("Dataset loaded successfully")


In [None]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)


In [None]:
# Preview the last five rows of the freshly loaded data.
df.tail(n=5)

In [None]:
# basic dataset exploration: schema, missing values and duplicate rows

print("Column info:")
# df.info() prints its report itself and returns None, so call it directly;
# wrapping it in print() would append a stray "None" to the output.
df.info()
print("\nMissing values per column:")
print(df.isnull().sum())    # count missing values
# .sum must be *called*; without the parentheses the bound method object is
# printed instead of the number of duplicate rows.
print("\nDuplicate rows:", df.duplicated().sum())  # count duplicate rows

In [None]:
# Handle missing values: keep rows with missing text (using a placeholder),
# but drop rows that have no label at all.
df['text'] = df['text'].fillna('unknown')     # fill missing text entries
df = df.dropna(subset=['label'])              # drop rows without labels

print("Missing values handled successfully.")
# df.info() prints its report itself and returns None, so it is called
# directly rather than wrapped in print(), which would add a stray "None".
df.info()

In [None]:
#text cleaning
def clean_text(t):
    """Normalise one raw text string.

    The following are deleted, in this order: HTML-style tags, digits, runs of
    '!' / '#', and any remaining character that is neither a word character
    nor whitespace. The result is then trimmed at both ends and lower-cased.

    Internal whitespace is left untouched (removed tokens can leave double
    spaces), and underscores survive because the regex word class includes them.

    Args:
        t: the text to clean; must be a str.

    Returns:
        The cleaned, lower-cased string.
    """
    removal_patterns = (
        r'<.*?>',     # HTML tags
        r'\d+',       # numbers
        r'[!#]+',     # repeated symbols
        r'[^\w\s]',   # punctuation
    )
    cleaned = t
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.strip().lower()

# Run every entry of the text column through clean_text, replacing the raw text.
df['text'] = df['text'].map(clean_text)

In [None]:
# Drop rows that are exact duplicates across all columns and report how many went.
before = len(df)
df = df.drop_duplicates()
after = len(df)

print(f"Removed {before - after } duplicate rows.")

In [None]:
# Standardize agent names. Whitespace must be stripped *before* capitalizing:
# str.capitalize() only upper-cases the first character, so a value with a
# leading space (" john") would otherwise come out as "john".
df['agent_name'] = df['agent_name'].str.strip().str.capitalize()

In [None]:
# Summary statistics for every column; include='all' also covers the non-numeric ones.
df.describe(include='all')

In [None]:
# Bar chart of row counts per label, most frequent label first.
label_order = df['label'].value_counts().index
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='label', order=label_order, ax=ax)
ax.set_title("Class Distribution")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Count rows per label value, before the label spellings are standardized.
df['label'].value_counts()

In [None]:
# Standardize the categorical labels: lower-case, trim whitespace, then map the
# known spelling variants onto account / billing / technical / other.
# Labels that are not listed in the mapping are left unchanged.
label_aliases = {
    'accnt': 'account',
    'account': 'account',
    ' account': 'account',
    'billing': 'billing',
    'billng': 'billing',
    'tech': 'technical',
    'tech-support': 'technical',
    'technical': 'technical',
    'other': 'other',
    'othr': 'other',
}
df['label'] = (
    df['label']
    .str.lower()
    .str.strip()
    .replace(label_aliases)
)

print("Labels standardized successfully.")
print(df['label'].value_counts())  # class counts after standardization



In [None]:
# Same bar chart as before, now drawn from the standardized labels.
label_order = df['label'].value_counts().index
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='label', order=label_order, ax=ax)
ax.set_title("Cleaned Class Distribution")
plt.xticks(rotation=45)
plt.show()

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# The stop-word list and the WordNet database ship separately from the nltk
# package. Fetch them first so this cell does not raise LookupError on a fresh
# environment; nltk.download skips resources that are already up to date.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource, quiet=True)

stop = set(stopwords.words('english'))
lemma = WordNetLemmatizer()

# Drop English stop words and lemmatize the remaining whitespace-separated
# tokens, storing the result in a new column so the 'text' column is kept.
df['text_cleaned'] = df['text'].apply(lambda t: ' '.join(
    lemma.lemmatize(w) for w in t.split() if w not in stop
))
df.head()  # display first few rows with cleaned text

In [None]:
# save the cleaned data to the data folder as a csv file.
output_path = os.path.join('..', 'data', 'customer_support_tickets_cleaned.csv')
# to_csv does not create missing directories, so make sure ../data exists first.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)

In [None]:
# Final check: column dtypes and non-null counts of the cleaned DataFrame.
df.info()