## Step 1: Set Up Python Packages

In [32]:
import pandas as pd

import re

from nltk.tokenize import word_tokenize
import nltk
# Fetch the 'punkt' resource into the local nltk_data directory.
# NOTE(review): presumably this is the tokenizer data word_tokenize relies on
# in the tokenization step; newer NLTK releases may look for 'punkt_tab'
# instead -- confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Step 2: Loading and Examining the Dataset

In [33]:
# Load the qualitative dataset into a DataFrame.
# pandas is already imported as `pd` in the setup cell at the top of the
# notebook, so it is not re-imported here.
file_path = 'Qualitative data.csv'
data = pd.read_csv(file_path)

In [34]:
# Preview the first five rows of the dataset
data.head(n=5)

Unnamed: 0,ID,Social Media Activity,Questionnaire Completions,Product Feedback,Notes From Previous Conversations
0,1,Active,Completed,Positive,Interested in new features
1,2,Inactive,Not Completed,Neutral,Asked about pricing options
2,3,Active,Completed,Negative,Expressed concerns about usability
3,4,Active,Completed,Positive,Recommended product to friends
4,5,Inactive,Not Completed,Positive,Asked for additional product information


In [35]:
# Count the missing entries in each column
data.isna().sum()

ID                                   0
Social Media Activity                0
Questionnaire Completions            0
Product Feedback                     0
Notes From Previous Conversations    0
dtype: int64

In [36]:
# Summarise each column's non-null count and dtype, plus overall memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   ID                                 100 non-null    int64 
 1   Social Media Activity              100 non-null    object
 2   Questionnaire Completions          100 non-null    object
 3   Product Feedback                   100 non-null    object
 4   Notes From Previous Conversations  100 non-null    object
dtypes: int64(1), object(4)
memory usage: 4.0+ KB


## Step 3: Handling Missing Data and Noise

In [37]:
# Handling missing data: replace every missing value with the 'Missing'
# placeholder. The result is reassigned instead of using inplace=True, so the
# step does not mutate the frame behind earlier cells' displayed outputs.
data = data.fillna('Missing')

In [38]:
# Characters that survive cleaning: ASCII letters, whitespace, and emoji.
# The emoji ranges cover the main pictograph blocks (U+1F300-U+1FAFF) plus
# Miscellaneous Symbols and Dingbats (U+2600-U+27BF); keeping only the
# Emoticons block (U+1F600-U+1F64F) would strip common emoji such as
# U+1F680 (rocket) or U+1F44D (thumbs up).
_NOISE_PATTERN = re.compile(
    r'[^a-zA-Z\s\U0001F300-\U0001FAFF\u2600-\u27BF]'
)


def clean_noise(text):
    """Remove noise characters from a piece of text.

    Digits, punctuation, and any other character that is not an ASCII
    letter, whitespace, or an emoji in the ranges above are deleted.

    Note that matched characters are removed outright rather than replaced
    with a space, so "user-friendly" becomes "userfriendly". Non-ASCII
    letters (e.g. accented characters) are also removed.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The text with every noise character removed.
    """
    return _NOISE_PATTERN.sub('', text)

In [39]:
# Run every conversation note through clean_noise and keep the result
# in a new 'Cleaned_Text' column
data['Cleaned_Text'] = data['Notes From Previous Conversations'].map(clean_noise)

In [40]:
# Preview the first five cleaned notes
data['Cleaned_Text'].head(n=5)

0                  Interested in new features
1                 Asked about pricing options
2          Expressed concerns about usability
3              Recommended product to friends
4    Asked for additional product information
Name: Cleaned_Text, dtype: object

## Step 4: Normalization and Tokenization

In [41]:
# Normalise the cleaned notes by converting every entry to lowercase
data['Normalized_Text'] = (
    data['Cleaned_Text']
    .str.lower()
)

In [42]:
# Split each normalised note into a list of word tokens
data['Tokens'] = data['Normalized_Text'].map(word_tokenize)

In [43]:
# Preview the token lists for the first five notes
data['Tokens'].head(n=5)

0                   [interested, in, new, features]
1                  [asked, about, pricing, options]
2           [expressed, concerns, about, usability]
3               [recommended, product, to, friends]
4    [asked, for, additional, product, information]
Name: Tokens, dtype: object