# Part 1: Data Handling and Preprocessing
## Customer Feedback Analysis System


In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')


In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\supra\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\supra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\supra\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\supra\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
try:
    df = pd.read_csv('Customer_Feedback.csv', encoding='utf-8', on_bad_lines='skip')
except:
    try:
        df = pd.read_csv('Customer_Feedback.csv', encoding='latin-1')
    except:
        df = pd.read_csv('Customer_Feedback.csv')

print(f"Initial dataset shape: {df.shape}")
print(f"\nColumn names: {df.columns.tolist()}")
print(f"\nFirst few rows:")
df.head()


UnicodeDecodeError: 'utf-8' codec can't decode byte 0x95 in position 238: invalid start byte

In [None]:
print("Missing values per column:")
print(df.isnull().sum())
print(f"\nData types:")
print(df.dtypes)
print(f"\nDuplicate rows: {df.duplicated().sum()}")


## Data Cleaning Pipeline


In [None]:
df_clean = df.copy()

print(f"Before removing duplicates: {len(df_clean)} rows")
df_clean = df_clean.drop_duplicates()
print(f"After removing duplicates: {len(df_clean)} rows")
print(f"Removed {len(df) - len(df_clean)} duplicate entries")


In [None]:
df_clean['Comments'].fillna('', inplace=True)
df_clean['Review Title'].fillna('', inplace=True)
df_clean['Useful'].fillna('', inplace=True)

print("Missing values after filling:")
print(df_clean.isnull().sum())


In [None]:
def extract_numeric_rating(rating_str):
    if pd.isna(rating_str):
        return np.nan
    match = re.search(r'(\d+\.?\d*)', str(rating_str))
    return float(match.group(1)) if match else np.nan

df_clean['Rating_Numeric'] = df_clean['Rating'].apply(extract_numeric_rating)


In [None]:
def parse_date(date_str):
    if pd.isna(date_str) or date_str == '':
        return pd.NaT
    try:
        date_str = str(date_str).replace('on ', '')
        return pd.to_datetime(date_str, format='%d %B %Y', errors='coerce')
    except:
        return pd.NaT

df_clean['Date_Parsed'] = df_clean['Date'].apply(parse_date)


In [None]:
def extract_helpful_count(useful_str):
    if pd.isna(useful_str) or useful_str == '':
        return 0
    match = re.search(r'(\d+)\s*people?\s*found\s*this\s*helpful', str(useful_str))
    if match:
        return int(match.group(1))
    match = re.search(r'One\s*person\s*found\s*this\s*helpful', str(useful_str))
    if match:
        return 1
    return 0

df_clean['Helpful_Count'] = df_clean['Useful'].apply(extract_helpful_count)


## Text Preprocessing Functions


In [None]:
def clean_text(text):
    if pd.isna(text) or text == '':
        return ''
    
    text = str(text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'[^A-Za-z0-9\s.,!?]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    
    return text


In [None]:
def tokenize_text(text):
    if not text or text == '':
        return []
    try:
        tokens = word_tokenize(text.lower())
        return tokens
    except:
        return text.lower().split()


In [None]:
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words and len(token) > 2]
    return filtered_tokens


In [None]:
def lemmatize_tokens(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized


In [None]:
def preprocess_pipeline(text):
    cleaned = clean_text(text)
    tokens = tokenize_text(cleaned)
    filtered = remove_stopwords(tokens)
    lemmatized = lemmatize_tokens(filtered)
    return ' '.join(lemmatized)


## Apply Preprocessing to Dataset


In [None]:
print("Preprocessing review titles...")
df_clean['Review_Title_Clean'] = df_clean['Review Title'].apply(clean_text)
df_clean['Review_Title_Processed'] = df_clean['Review Title'].apply(preprocess_pipeline)

print("Preprocessing comments...")
df_clean['Comments_Clean'] = df_clean['Comments'].apply(clean_text)
df_clean['Comments_Processed'] = df_clean['Comments'].apply(preprocess_pipeline)

print("Preprocessing completed!")


In [None]:
df_clean['Combined_Text'] = df_clean['Review_Title_Clean'] + ' ' + df_clean['Comments_Clean']
df_clean['Combined_Text_Processed'] = df_clean['Review_Title_Processed'] + ' ' + df_clean['Comments_Processed']


In [None]:
df_clean['Text_Length'] = df_clean['Combined_Text'].apply(len)
df_clean['Word_Count'] = df_clean['Combined_Text'].apply(lambda x: len(x.split()))


## Final Data Quality Checks


In [None]:
print(f"Final dataset shape: {df_clean.shape}")
print(f"\nColumns in cleaned dataset:")
print(df_clean.columns.tolist())
print(f"\nMissing values:")
print(df_clean.isnull().sum())
print(f"\nRating distribution:")
print(df_clean['Rating_Numeric'].value_counts().sort_index())


In [None]:
print("Sample of cleaned data:")
print("\nOriginal Comment:")
print(df_clean.iloc[0]['Comments'][:200])
print("\nCleaned Comment:")
print(df_clean.iloc[0]['Comments_Clean'][:200])
print("\nProcessed Comment:")
print(df_clean.iloc[0]['Comments_Processed'][:200])


In [None]:
df_final = df_clean[[
    'Review Title', 'Customer name', 'Rating', 'Rating_Numeric',
    'Date', 'Date_Parsed', 'Category', 'Comments', 'Useful', 'Helpful_Count',
    'Review_Title_Clean', 'Comments_Clean', 'Combined_Text',
    'Review_Title_Processed', 'Comments_Processed', 'Combined_Text_Processed',
    'Text_Length', 'Word_Count'
]]

df_final.to_csv('cleaned_customer_feedback.csv', index=False)
print("\nCleaned dataset saved as 'cleaned_customer_feedback.csv'")
print(f"Total records: {len(df_final)}")


## Data Summary Statistics


In [None]:
print("=" * 60)
print("DATA PREPROCESSING SUMMARY")
print("=" * 60)
print(f"\nOriginal dataset size: {len(df)} records")
print(f"Cleaned dataset size: {len(df_final)} records")
print(f"Records removed: {len(df) - len(df_final)}")
print(f"\nRating Statistics:")
print(df_final['Rating_Numeric'].describe())
print(f"\nCategory Distribution:")
print(df_final['Category'].value_counts())
print(f"\nText Length Statistics:")
print(df_final['Text_Length'].describe())
print(f"\nWord Count Statistics:")
print(df_final['Word_Count'].describe())
print("=" * 60)


: 