# Part 1: Data Handling and Preprocessing
## Customer Feedback Analysis System


In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Fetch the NLTK resources used by the tokenizer, stop-word filter and lemmatizer.
nltk.download('punkt')
# NLTK >= 3.8.2 loads the Punkt models from 'punkt_tab' rather than the pickled
# 'punkt' package. Without it word_tokenize raises LookupError and tokenize_text
# would fall back to plain whitespace splitting.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\supra\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\supra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\supra\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\supra\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [6]:
# Load the raw feedback export. UTF-8 is tried first; Latin-1 is used only when
# the file is not valid UTF-8. Any other failure (missing file, permissions, ...)
# is allowed to surface instead of being hidden by a bare `except:`.
CSV_PATH = 'Customer_Feedback.csv'

try:
    df = pd.read_csv(CSV_PATH, encoding='utf-8', on_bad_lines='skip')
except UnicodeDecodeError:
    # Latin-1 assigns a character to every byte value, so this read cannot fail
    # on decoding. Malformed rows are skipped exactly as in the UTF-8 attempt.
    df = pd.read_csv(CSV_PATH, encoding='latin-1', on_bad_lines='skip')

print(f"Initial dataset shape: {df.shape}")
print(f"\nColumn names: {df.columns.tolist()}")
print(f"\nFirst few rows:")
df.head()


Initial dataset shape: (280, 7)

Column names: ['Review Title', 'Customer name', 'Rating', 'Date', 'Category', 'Comments', 'Useful']

First few rows:


Unnamed: 0,Review Title,Customer name,Rating,Date,Category,Comments,Useful
0,Another Midrange killer Smartphone by Xiaomi,Rishikumar Thakur,4.0 out of 5 stars,on 1 October 2018,Display,Another Midrange killer Smartphone by Xiaomi\n...,
1,vry small size mobile,Raza ji,3.0 out of 5 stars,on 15 September 2018,Others,All ok but vry small size mobile,7 people found this helpful
2,Full display not working in all application.,Vaibhav Patel,3.0 out of 5 stars,on 18 September 2018,Others,Quite good,7 people found this helpful
3,Value for Money,Amazon Customer,5.0 out of 5 stars,on 28 September 2018,Display,Redmi has always have been the the king of bud...,2 people found this helpful
4,Not worth for the money,Sudhakaran Wadakkancheri,2.0 out of 5 stars,on 18 September 2018,Others,worst product from MI. I am a hardcore fan of ...,6 people found this helpful


In [7]:
# Quick data-quality overview of the raw frame: null counts per column,
# column dtypes, and the number of fully duplicated rows.
null_counts = df.isnull().sum()
duplicate_count = df.duplicated().sum()

print("Missing values per column:")
print(null_counts)
print(f"\nData types:")
print(df.dtypes)
print(f"\nDuplicate rows: {duplicate_count}")


Missing values per column:
Review Title       0
Customer name      0
Rating             0
Date               0
Category           0
Comments           0
Useful           170
dtype: int64

Data types:
Review Title     object
Customer name    object
Rating           object
Date             object
Category         object
Comments         object
Useful           object
dtype: object

Duplicate rows: 21


## Data Cleaning Pipeline


In [8]:
# Work on a copy so the raw frame `df` is left untouched, then drop rows that
# are exact duplicates across all columns and report how many were removed.
rows_before = len(df)
df_clean = df.copy()

print(f"Before removing duplicates: {rows_before} rows")
df_clean = df_clean.drop_duplicates()
rows_after = len(df_clean)
print(f"After removing duplicates: {rows_after} rows")
print(f"Removed {rows_before - rows_after} duplicate entries")


Before removing duplicates: 280 rows
After removing duplicates: 259 rows
Removed 21 duplicate entries


In [9]:
# Replace missing values in the free-text columns with empty strings.
# The result is assigned back to the frame instead of calling
# `df_clean[col].fillna('', inplace=True)`: that form modifies a column
# selection in place, which pandas deprecates and which leaves the parent
# frame unchanged when Copy-on-Write is enabled.
df_clean = df_clean.fillna({'Comments': '', 'Review Title': '', 'Useful': ''})

print("Missing values after filling:")
print(df_clean.isnull().sum())


Missing values after filling:
Review Title     0
Customer name    0
Rating           0
Date             0
Category         0
Comments         0
Useful           0
dtype: int64


In [10]:
def extract_numeric_rating(rating_str):
    """Pull the first number out of a rating value such as "4.0 out of 5 stars".

    Args:
        rating_str: Raw rating value; it is converted with ``str()`` before
            the search, so non-string inputs are accepted.

    Returns:
        The first integer or decimal found, as a float, or ``np.nan`` when the
        value is missing or contains no digits.
    """
    if pd.isna(rating_str):
        return np.nan

    found = re.search(r'(\d+\.?\d*)', str(rating_str))
    if found is None:
        return np.nan
    return float(found.group(1))

# Derive a float rating column from the raw rating strings.
numeric_ratings = df_clean['Rating'].apply(extract_numeric_rating)
df_clean['Rating_Numeric'] = numeric_ratings


In [11]:
def parse_date(date_str):
    """Parse a review date such as "on 1 October 2018" into a Timestamp.

    Surrounding whitespace is ignored and only a *leading* "on " prefix is
    removed (the previous implementation deleted every "on " substring and
    failed on padded input).

    Args:
        date_str: Raw date value; converted with ``str()`` before parsing.

    Returns:
        A ``pandas.Timestamp`` for values matching ``%d %B %Y`` (after the
        optional prefix), otherwise ``pd.NaT``.
    """
    if pd.isna(date_str) or date_str == '':
        return pd.NaT

    text = str(date_str).strip()
    prefix = 'on '
    if text.startswith(prefix):
        text = text[len(prefix):].lstrip()

    try:
        # errors='coerce' already maps unparseable text to NaT; the narrow
        # except only guards against unexpected input types.
        return pd.to_datetime(text, format='%d %B %Y', errors='coerce')
    except (ValueError, TypeError):
        return pd.NaT

# Convert the raw date strings into datetime values (NaT where parsing fails).
parsed_dates = df_clean['Date'].apply(parse_date)
df_clean['Date_Parsed'] = parsed_dates


In [12]:
def extract_helpful_count(useful_str):
    """Extract the helpful-vote count from text like "7 people found this helpful".

    Handles digit counts with thousands separators ("1,234 people ..."), the
    singular digit form ("1 person ...") and the spelled-out form
    ("One person ..."), all case-insensitively. Previously "1,234 people"
    was read as 234 and "1 person" as 0.

    Args:
        useful_str: Raw value of the helpfulness field; may be missing or empty.

    Returns:
        The vote count as an int, or 0 when the value is missing, empty, or
        does not match a known pattern.
    """
    if pd.isna(useful_str) or useful_str == '':
        return 0

    text = str(useful_str)

    # Allow commas inside the number so "1,234" is captured whole.
    numeric = re.search(
        r'(\d[\d,]*)\s*(?:people?|person)\s*found\s*this\s*helpful',
        text,
        flags=re.IGNORECASE,
    )
    if numeric:
        return int(numeric.group(1).replace(',', ''))

    if re.search(r'One\s*person\s*found\s*this\s*helpful', text, flags=re.IGNORECASE):
        return 1
    return 0

# Turn the free-text helpfulness field into an integer vote count.
helpful_counts = df_clean['Useful'].apply(extract_helpful_count)
df_clean['Helpful_Count'] = helpful_counts


## Text Preprocessing Functions


In [13]:
def clean_text(text):
    """Normalize raw review text while keeping case and basic punctuation.

    Steps, in order: strip URLs, strip ``@mentions`` and ``#`` signs, replace
    every character other than ASCII letters, digits, whitespace and ``.,!?``
    with a space, collapse whitespace runs, and trim both ends.

    Args:
        text: Raw text; converted with ``str()`` before cleaning.

    Returns:
        The cleaned string, or ``''`` for missing or empty input.
    """
    if pd.isna(text) or text == '':
        return ''

    # URL removal is the only step that passes regex flags, so it stays separate.
    result = re.sub(r'http\S+|www\S+|https\S+', '', str(text), flags=re.MULTILINE)

    substitutions = (
        (r'\@\w+|\#', ''),
        (r'[^A-Za-z0-9\s.,!?]', ' '),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)

    return result.strip()


In [14]:
def tokenize_text(text):
    """Lower-case `text` and split it into tokens.

    Uses NLTK's ``word_tokenize``. If the tokenizer data is not installed
    (NLTK raises ``LookupError``), falls back to whitespace splitting. Other
    exceptions propagate instead of being swallowed by a bare ``except:``.

    Args:
        text: String to tokenize; ``None`` and ``''`` are accepted.

    Returns:
        A list of lower-cased tokens; ``[]`` for empty or missing input.
    """
    if not text:
        return []

    lowered = text.lower()
    try:
        return word_tokenize(lowered)
    except LookupError:
        # Tokenizer resources are missing: degrade to a plain whitespace split.
        return lowered.split()


In [15]:
# Cache of stop-word sets keyed by language, so the corpus is read only once
# instead of on every call (this function is applied to every row).
_STOP_WORDS_BY_LANGUAGE = {}


def _default_stop_words(language='english'):
    """Return the NLTK stop-word set for `language`, loading it on first use."""
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_BY_LANGUAGE[language]


def remove_stopwords(tokens, stop_words=None):
    """Drop stop words and tokens of two characters or fewer.

    Args:
        tokens: Iterable of (already lower-cased) string tokens.
        stop_words: Optional collection of words to remove. Defaults to the
            NLTK English stop-word list, which is loaded once and cached.

    Returns:
        A list of the remaining tokens in their original order.
    """
    if stop_words is None:
        stop_words = _default_stop_words()
    return [token for token in tokens if token not in stop_words and len(token) > 2]


In [16]:
def lemmatize_tokens(tokens):
    """Lemmatize every token with a ``WordNetLemmatizer``.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A list with the lemmatizer's output for each token, in input order.
    """
    wordnet = WordNetLemmatizer()
    return list(map(wordnet.lemmatize, tokens))


In [17]:
def preprocess_pipeline(text):
    """Run the full text preprocessing chain on one piece of text.

    Applies, in order: ``clean_text``, ``tokenize_text``, ``remove_stopwords``
    and ``lemmatize_tokens``.

    Args:
        text: Raw text for a single review field.

    Returns:
        The processed tokens joined by single spaces.
    """
    tokens = tokenize_text(clean_text(text))
    for step in (remove_stopwords, lemmatize_tokens):
        tokens = step(tokens)
    return ' '.join(tokens)


## Apply Preprocessing to Dataset


In [18]:
print("Preprocessing review titles...")
df_clean['Review_Title_Clean'] = df_clean['Review Title'].apply(clean_text)
df_clean['Review_Title_Processed'] = df_clean['Review Title'].apply(preprocess_pipeline)

print("Preprocessing comments...")
df_clean['Comments_Clean'] = df_clean['Comments'].apply(clean_text)
df_clean['Comments_Processed'] = df_clean['Comments'].apply(preprocess_pipeline)

print("Preprocessing completed!")


Preprocessing review titles...
Preprocessing comments...
Preprocessing completed!


In [19]:
# Join title and comment (separated by one space) for both the cleaned and the
# fully processed variants.
combined_sources = {
    'Combined_Text': ('Review_Title_Clean', 'Comments_Clean'),
    'Combined_Text_Processed': ('Review_Title_Processed', 'Comments_Processed'),
}
for target_col, (title_col, body_col) in combined_sources.items():
    df_clean[target_col] = df_clean[title_col] + ' ' + df_clean[body_col]


In [20]:
# Size features on the combined cleaned text: character count and
# whitespace-separated word count.
combined_text = df_clean['Combined_Text']
df_clean['Text_Length'] = combined_text.map(len)
df_clean['Word_Count'] = combined_text.map(lambda text: len(text.split()))


## Final Data Quality Checks


In [21]:
# Final sanity report on the cleaned frame: shape, column list, remaining
# nulls, and how the numeric ratings are distributed.
final_shape = df_clean.shape
column_names = list(df_clean.columns)
remaining_nulls = df_clean.isnull().sum()
rating_counts = df_clean['Rating_Numeric'].value_counts().sort_index()

print(f"Final dataset shape: {final_shape}")
print(f"\nColumns in cleaned dataset:")
print(column_names)
print(f"\nMissing values:")
print(remaining_nulls)
print(f"\nRating distribution:")
print(rating_counts)


Final dataset shape: (259, 18)

Columns in cleaned dataset:
['Review Title', 'Customer name', 'Rating', 'Date', 'Category', 'Comments', 'Useful', 'Rating_Numeric', 'Date_Parsed', 'Helpful_Count', 'Review_Title_Clean', 'Review_Title_Processed', 'Comments_Clean', 'Comments_Processed', 'Combined_Text', 'Combined_Text_Processed', 'Text_Length', 'Word_Count']

Missing values:
Review Title               0
Customer name              0
Rating                     0
Date                       0
Category                   0
Comments                   0
Useful                     0
Rating_Numeric             0
Date_Parsed                0
Helpful_Count              0
Review_Title_Clean         0
Review_Title_Processed     0
Comments_Clean             0
Comments_Processed         0
Combined_Text              0
Combined_Text_Processed    0
Text_Length                0
Word_Count                 0
dtype: int64

Rating distribution:
Rating_Numeric
1.0     42
2.0      6
3.0     23
4.0     44
5.0    144

In [22]:
# Show the first review at each preprocessing stage (first 200 characters).
first_review = df_clean.iloc[0]

print("Sample of cleaned data:")
for heading, column in (
    ("\nOriginal Comment:", 'Comments'),
    ("\nCleaned Comment:", 'Comments_Clean'),
    ("\nProcessed Comment:", 'Comments_Processed'),
):
    print(heading)
    print(first_review[column][:200])


Sample of cleaned data:

Original Comment:
Another Midrange killer Smartphone by Xiaomi

Major Highlights:
 The Redmi 6 Pro sports a 5.84-inch full-HD+ display with a notch
 Powered by the Qualcomm Snapdragon 625 SoC
 The phone is priced at

Cleaned Comment:
Another Midrange killer Smartphone by Xiaomi Major Highlights The Redmi 6 Pro sports a 5.84 inch full HD display with a notch Powered by the Qualcomm Snapdragon 625 SoC The phone is priced at Rs. 10,9

Processed Comment:
another midrange killer smartphone xiaomi major highlight redmi pro sport 5.84 inch full display notch powered qualcomm snapdragon 625 soc phone priced 10,999 3gb ram variant start point battery 4000 


In [23]:
# Columns to export, grouped as: identifiers and rating, date/category/raw
# text, cleaned text, processed text, size features.
export_columns = [
    'Review Title', 'Customer name', 'Rating', 'Rating_Numeric',
    'Date', 'Date_Parsed', 'Category', 'Comments', 'Useful', 'Helpful_Count',
    'Review_Title_Clean', 'Comments_Clean', 'Combined_Text',
    'Review_Title_Processed', 'Comments_Processed', 'Combined_Text_Processed',
    'Text_Length', 'Word_Count',
]
df_final = df_clean[export_columns]

# Write without the index so the CSV holds only the listed columns.
output_path = 'cleaned_customer_feedback.csv'
df_final.to_csv(output_path, index=False)
print("\nCleaned dataset saved as 'cleaned_customer_feedback.csv'")
print(f"Total records: {len(df_final)}")



Cleaned dataset saved as 'cleaned_customer_feedback.csv'
Total records: 259


## Data Summary Statistics


In [24]:
# Print a closing summary: record counts before and after cleaning, followed by
# descriptive statistics for ratings, categories and text-size features.
divider = "=" * 60
original_count = len(df)
final_count = len(df_final)

print(divider)
print("DATA PREPROCESSING SUMMARY")
print(divider)
print(f"\nOriginal dataset size: {original_count} records")
print(f"Cleaned dataset size: {final_count} records")
print(f"Records removed: {original_count - final_count}")
print(f"\nRating Statistics:")
print(df_final['Rating_Numeric'].describe())
print(f"\nCategory Distribution:")
print(df_final['Category'].value_counts())
for heading, column in (
    ("\nText Length Statistics:", 'Text_Length'),
    ("\nWord Count Statistics:", 'Word_Count'),
):
    print(heading)
    print(df_final[column].describe())
print(divider)


DATA PREPROCESSING SUMMARY

Original dataset size: 280 records
Cleaned dataset size: 259 records
Records removed: 21

Rating Statistics:
count    259.000000
mean       3.934363
std        1.483611
min        1.000000
25%        3.000000
50%        5.000000
75%        5.000000
max        5.000000
Name: Rating_Numeric, dtype: float64

Category Distribution:
Category
Others      168
Display      32
Battery      27
Camera       26
Delivery      6
Name: count, dtype: int64

Text Length Statistics:
count     259.000000
mean      152.926641
std       401.347573
min         5.000000
25%        22.000000
50%        50.000000
75%       117.500000
max      4579.000000
Name: Text_Length, dtype: float64

Word Count Statistics:
count    259.000000
mean      28.146718
std       75.911883
min        2.000000
25%        4.000000
50%        9.000000
75%       21.500000
max      860.000000
Name: Word_Count, dtype: float64
