In [2]:
import pandas as pd
import re
import spacy
from sklearn.preprocessing import LabelEncoder

# Load the raw feedback dataset; the path is relative to the directory the
# notebook is run from.
df = pd.read_csv("../data/feedback.csv")
# Preview the first rows (the recorded output shows `text` and `label` columns).
df.head()


Unnamed: 0,text,label
0,The app crashes every time I upload an image.,technical_issue
1,I really love the new dashboard layout!,praise
2,Please add dark mode to the app.,feature_request
3,I'm very disappointed with your customer service.,complaint
4,How do I change my subscription plan?,question


In [3]:
# Load the spaCy English model (`en_core_web_sm`). The resulting `nlp` object
# is the module-level pipeline that tokenize() calls on each cleaned string.
# NOTE(review): presumably the model package must be installed beforehand — confirm
# the environment setup documents this.
nlp = spacy.load("en_core_web_sm")

In [5]:
# Function to clean and preprocess text
def clean_text(text):
    """Normalize a raw feedback string before tokenization.

    Steps, in order:
      1. lowercase the text;
      2. delete apostrophes (straight and curly) so a contraction stays a
         single word ("I'm" -> "im");
      3. replace every other character that is not a letter or whitespace
         with a space, so words separated only by punctuation are not fused
         together ("slow,buggy" -> "slow buggy");
      4. collapse runs of whitespace and trim both ends.

    Args:
        text: The raw text. Non-string values (e.g. ``None`` or the NaN that
            pandas produces for an empty CSV field) are treated as missing.

    Returns:
        The cleaned string; ``""`` when ``text`` is not a string or contains
        no letters.
    """
    # Missing values arrive as float NaN / None; `.lower()` would raise on them.
    if not isinstance(text, str):
        return ""
    text = text.lower()  # lowercase
    text = re.sub(r"['’]", '', text)  # drop apostrophes, keep contraction as one word
    # Substitute a space (not the empty string) so "a,b" or "a-b" stay two words.
    text = re.sub(r'[^a-z\s]', ' ', text)  # remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra whitespace
    return text

# Clean every feedback message element-wise and keep the result alongside the raw text.
df['clean_text'] = df['text'].map(clean_text)
# Side-by-side preview of raw vs. cleaned text.
df.loc[:, ['text', 'clean_text']].head()

Unnamed: 0,text,clean_text
0,The app crashes every time I upload an image.,the app crashes every time i upload an image
1,I really love the new dashboard layout!,i really love the new dashboard layout
2,Please add dark mode to the app.,please add dark mode to the app
3,I'm very disappointed with your customer service.,im very disappointed with your customer service
4,How do I change my subscription plan?,how do i change my subscription plan


In [10]:
def tokenize(text):
    """Split `text` into tokens with the module-level spaCy pipeline `nlp`.

    Returns a list with the text of each token, skipping any token that
    spaCy flags as a stop word or as punctuation.
    """
    kept = []
    for token in nlp(text):
        # Skip tokens that carry no content for classification.
        if token.is_stop or token.is_punct:
            continue
        kept.append(token.text)
    return kept

In [11]:
# Tokenize each cleaned message; every cell of `tokens` holds a list of strings.
df['tokens'] = df['clean_text'].map(tokenize)
# Preview cleaned text next to its tokens.
df.loc[:, ['clean_text', 'tokens']].head()

Unnamed: 0,clean_text,tokens
0,the app crashes every time i upload an image,"[app, crashes, time, upload, image]"
1,i really love the new dashboard layout,"[love, new, dashboard, layout]"
2,please add dark mode to the app,"[add, dark, mode, app]"
3,im very disappointed with your customer service,"[m, disappointed, customer, service]"
4,how do i change my subscription plan,"[change, subscription, plan]"


In [13]:
# Turn the string labels into integer codes (e.g., complaint → 0, praise → 2;
# the full mapping is printed below).
le = LabelEncoder()
le.fit(df['label'])
df['label_encoded'] = le.transform(df['label'])

# Keep a label -> code lookup for reference when interpreting the codes later.
label_map = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}
print("Label Mapping:", label_map)


Label Mapping: {'complaint': np.int64(0), 'feature_request': np.int64(1), 'praise': np.int64(2), 'question': np.int64(3), 'technical_issue': np.int64(4)}


In [14]:
# Persist the processed frame (raw text, cleaned text, tokens, encoded label)
# without writing the index as a column.
# NOTE(review): `tokens` holds Python lists, which CSV will presumably store as their
# string form — confirm downstream readers parse them back or re-tokenize.
df.to_csv("../data/feedback_cleaned.csv", index=False)