# Week 8 - Data Cleansing and Transformation (Solo Submission)
**Rishabh Raman | Visionary Analysts | Data Science**

In [None]:
# --- Imports ---
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from scipy import stats
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.impute import KNNImputer
nlp = spacy.load("en_core_web_sm")

In [None]:
# --- Load Dataset ---
# Placeholder filename: point this at the actual dataset before running.
DATASET_PATH = 'your_dataset.csv'
df = pd.read_csv(DATASET_PATH)
# Preview the first rows as the cell's output.
df.head()

In [None]:
# --- Missing Value Imputation ---
# Technique 1: Mean -- every missing age is replaced by the column average.
age_mean = df['age'].mean()
df['age'] = df['age'].fillna(age_mean)

# Technique 2: KNN -- bmi and salary are imputed together, each gap filled
# from the 3 nearest rows.
knn_columns = ['bmi', 'salary']
imputer = KNNImputer(n_neighbors=3)
df[knn_columns] = imputer.fit_transform(df[knn_columns])
# REVIEW COMMENT: Mean imputation is simple but shrinks the column's variance;
# KNN fills gaps from similar rows and so better preserves relationships
# between features.

In [None]:
# --- Outlier Handling ---
# Technique 1: IQR method -- cap income at the 1.5*IQR fences instead of
# dropping rows, so no observations are lost at this step.
IQR_MULTIPLIER = 1.5
Q1 = df['income'].quantile(0.25)
Q3 = df['income'].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - IQR_MULTIPLIER * IQR
upper = Q3 + IQR_MULTIPLIER * IQR
# Series.clip does the two-sided capping in one call; missing incomes stay missing.
df['income'] = df['income'].clip(lower=lower, upper=upper)

# Technique 2: Z-score -- drop rows whose expenses lie 3 or more standard
# deviations from the mean.
Z_THRESHOLD = 3
# nan_policy='omit' keeps one missing expense from turning every z-score into
# NaN (the default 'propagate' would make the filter below discard all rows).
z = np.abs(stats.zscore(df['expenses'], nan_policy='omit'))
# Remove only rows confirmed as outliers: a NaN z-score (missing expense, or a
# constant column) compares False and the row is kept. .copy() makes the result
# an independent frame so later column assignments do not raise
# SettingWithCopyWarning.
df = df[~(z >= Z_THRESHOLD)].copy()
# REVIEW COMMENT: Z-score suits roughly normal data; IQR is more robust for
# skewed distributions.

In [None]:
# --- Text Cleaning ---
# Patterns are compiled once because clean_text runs on every row.
_TAG_RE = re.compile(r'<[^>]*>')       # HTML-like tags, including ones spanning newlines
_PUNCT_RE = re.compile(r'[^\w\s]|_')   # punctuation; "_" is listed because \w includes it
_DIGIT_RE = re.compile(r'\d+')         # runs of digits


def clean_text(text):
    """Normalize raw review text for downstream NLP.

    Steps: replace HTML-like tags with a space, strip punctuation (including
    underscores) and digits, lowercase, and collapse whitespace runs to single
    spaces with no leading or trailing space.

    Args:
        text: The raw text. ``None`` and float NaN are treated as missing;
            any other non-string value is converted with ``str()``.

    Returns:
        The cleaned string, or ``''`` for missing input.
    """
    # Missing values (None / NaN) become an empty document instead of raising.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Tags become a space rather than '' so "good<br>movie" does not fuse
    # into "goodmovie".
    text = _TAG_RE.sub(' ', str(text))
    # Punctuation is deleted outright so contractions stay one token
    # ("don't" -> "dont").
    text = _PUNCT_RE.sub('', text)
    text = _DIGIT_RE.sub('', text)
    # split()/join collapses the whitespace left behind by the removals.
    return ' '.join(text.lower().split())

# Fill missing reviews with '' before the str cast; otherwise astype(str)
# turns NaN into the literal text "nan", which would survive cleaning and show
# up as a spurious token.
df['cleaned_review'] = df['review'].fillna('').astype(str).apply(clean_text)

In [None]:
# Lemmatization
def _join_lemmas(doc):
    """Return the space-joined lemmas of a processed doc.

    Stop words are dropped, and so are whitespace-only tokens, which would
    otherwise put stray spaces or newlines into the result.
    """
    return ' '.join(t.lemma_ for t in doc if not (t.is_stop or t.is_space))


def lemmatize(text):
    """Lemmatize one string with the notebook-level ``nlp`` pipeline.

    Args:
        text: The text to process.

    Returns:
        The lemmas of all non-stop-word, non-whitespace tokens, joined by
        single spaces.
    """
    return _join_lemmas(nlp(text))


# Stream the whole column through nlp.pipe so spaCy processes it in batches
# instead of one full pipeline call per row. The parser and NER are skipped for
# this pass because only lemma_/is_stop/is_space are read here.
# NOTE(review): assumes the loaded pipeline's lemmatizer does not depend on the
# parser or NER (the stock en_core_web_sm layout) -- confirm if the pipeline is
# customized.
df['lemmatized_review'] = [
    _join_lemmas(doc)
    for doc in nlp.pipe(df['cleaned_review'], disable=['parser', 'ner'])
]

In [None]:
# Featurization
# Both vectorizers are fitted on the same lemmatized corpus with the same
# vocabulary cap, so their matrices are directly comparable.
MAX_FEATURES = 100
corpus = df['lemmatized_review']

# TF-IDF weighted document-term matrix.
tfidf = TfidfVectorizer(max_features=MAX_FEATURES)
X_tfidf = tfidf.fit_transform(corpus)

# Raw term-count document-term matrix.
cv = CountVectorizer(max_features=MAX_FEATURES)
X_cv = cv.fit_transform(corpus)
# REVIEW COMMENT: TF-IDF down-weights terms that occur in many documents, so
# rarer, more distinctive terms stand out; raw counts weight every occurrence
# equally.