# STEP 1. Project Setup and Data Aquisition

### 1.1 Importing Necessary Libraries

In [16]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import nltk 
import os  
import warnings 

### 1.2 Configure Settings 

In [17]:
# Ignore warning for cleaner output (use with caution, understanding the warning first)
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)
pd.set_option('display.max_colwidth',80)
pd.set_option('display.float_format',lambda x: '%.3f' %x)
sns.set_style('whitegrid')
print('Libraries imported and configured.')


Libraries imported and configured.


### 1.3 Define Constants and File Paths

In [18]:
DATA_DIR = '../data/'
RAW_DATA_FILE =os.path.join(DATA_DIR,'Womens Clothing E-Commerce Reviews.csv')
print(f'Raw data file path:{RAW_DATA_FILE}')

Raw data file path:../data/Womens Clothing E-Commerce Reviews.csv


In [19]:
# Data Loading functions 
def load_data(file_path, index_col =0): 
    '''  
    Loads data from a CSV file.
    Args: 
        file_path (str): The path to the CSV file.
        index_col (int or None): Column to use as the row labels.
    Returns:
        pandas.DataFrame: Loaded DataFrame or None if file not found.
    '''
    try:
        df = pd.read_csv(file_path,index_col=index_col)
        print(f'Data loaded successfully from {file_path}. Shape :{df.shape}')
        return df 
    except FileNotFoundError: 
        print(f'Error: File not found at {file_path}')
        return None 
    except Exception as e: 
        print(f'An error occured while loading data: {e}')
        return None



In [20]:
#load the raw dataset 
df_raw = load_data(RAW_DATA_FILE)

Data loaded successfully from ../data/Womens Clothing E-Commerce Reviews.csv. Shape :(23486, 10)


### 1.5 Initial Data Inspection 

In [22]:
if df_raw is not None: 
    print('\n -- First 5 Rows of Raw Data --')
    print(df_raw.head())
    print('\n --- Raw data Information---')
    df_raw.info(verbose=True,show_counts=True)
    


 -- First 5 Rows of Raw Data --
   Clothing ID  Age                    Title  \
0          767   33                      NaN   
1         1080   34                      NaN   
2         1077   60  Some major design flaws   
3         1049   50         My favorite buy!   
4          847   47         Flattering shirt   

                                                                       Review Text  \
0                            Absolutely wonderful - silky and sexy and comfortable   
1  Love this dress!  it's sooo pretty.  i happened to find it in a store, and i...   
2  I had such high hopes for this dress and really wanted it to work for me. i ...   
3  I love, love, love this jumpsuit. it's fun, flirty, and fabulous! every time...   
4  This shirt is very flattering to all due to the adjustable front tie. it is ...   

   Rating  Recommended IND  Positive Feedback Count   Division Name  \
0       4                1                        0       Initmates   
1       5          

In [23]:
print('\n --- Summary statistics (Numerical columns)---')
df_raw.describe().T # transpose for better readeability


 --- Summary statistics (Numerical columns)---


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Clothing ID,23486.0,918.119,203.299,0.0,861.0,936.0,1078.0,1205.0
Age,23486.0,43.199,12.28,18.0,34.0,41.0,52.0,99.0
Rating,23486.0,4.196,1.11,1.0,4.0,5.0,5.0,5.0
Recommended IND,23486.0,0.822,0.382,0.0,1.0,1.0,1.0,1.0
Positive Feedback Count,23486.0,2.536,5.702,0.0,0.0,1.0,3.0,122.0


In [25]:
print("\n--- Summary Statistics (Object/Categorical Columns) ---")
print(df_raw.describe(include=['object', 'category']).T)


--- Summary Statistics (Object/Categorical Columns) ---
                 count unique  \
Title            19676  13993   
Review Text      22641  22634   
Division Name    23472      3   
Department Name  23472      6   
Class Name       23472     20   

                                                                                             top  \
Title                                                                                   Love it!   
Review Text      Perfect fit and i've gotten so many compliments. i buy all my suits from her...   
Division Name                                                                            General   
Department Name                                                                             Tops   
Class Name                                                                               Dresses   

                  freq  
Title              136  
Review Text          3  
Division Name    13850  
Department Name  10468  
Class Name        

In [26]:
print('\n --- Missing Values per Columns (Raw Data)')
missing_values = df_raw.isnull().sum()  
missing_percent = (missing_values/len(df_raw))*100 
missing_df  = pd.DataFrame({'count':missing_values,'Percentage':missing_percent})
missing_df =missing_df[missing_df['count']>0].sort_values(by='Percentage',ascending=False)

if not missing_df.empty: 
    print(missing_df)   
else: 
    print('No Missing values found in the raw dataset.')


 --- Missing Values per Columns (Raw Data)
                 count  Percentage
Title             3810      16.222
Review Text        845       3.598
Division Name       14       0.060
Department Name     14       0.060
Class Name          14       0.060


In [27]:
print('\n ---Duplicate Rows Check (Raw Data)---')
num_duplicates  =  df_raw.duplicated().sum()
print(f'Number of duplicate rows: {num_duplicates}')
if num_duplicates>0: 
    df_raw = df_raw.drop_duplicates(keep='first').reset_index(drop=True)    
    print(f'Dropped duplicates. New shape : {df_raw.shape}')
else: 
    print('Data loading failed. Cannot proceed with inspection.')


 ---Duplicate Rows Check (Raw Data)---
Number of duplicate rows: 21
Dropped duplicates. New shape : (23465, 10)


### 1.5 Preliminary Data Selection & Cleaning (For NLP focus)   

In [28]:
''' 
1. For our NLP project, the 'Review text' is crucial,
2. We will also likely use 'rating', 'title', and potentially 'Recommended IND'. 
3. Other columns like 'Department Name', 'Class Name' can be used for segmentation later. 
'''
if df_raw is not None: 
    TEXT_COLUMN = 'Review Text'
    TITLE_COLUMN = 'Title' # Also often useful
    RATING_COLUMN = 'Rating'

    if TEXT_COLUMN not in df_raw.columns:
        print(f"Critical Error: The primary text column '{TEXT_COLUMN}' is not found in the dataset!")
        print(f"Available columns are: {df_raw.columns.tolist()}")
        # Potentially stop execution or ask user to specify
    else:
        # Handle missing review texts
        print(f"\nNumber of missing values in '{TEXT_COLUMN}' before handling: {df_raw[TEXT_COLUMN].isnull().sum()}")
        
        # Create a working copy for NLP tasks
        df_nlp = df_raw.copy()
        
        #  Fill NaN in 'Title' with an empty string if you plan to combine it or use it
        if TITLE_COLUMN in df_nlp.columns:
            df_nlp[TITLE_COLUMN].fillna('', inplace=True)
            print(f"Missing values in '{TITLE_COLUMN}' after filling with empty string: {df_nlp[TITLE_COLUMN].isnull().sum()}")

        # Ensure text columns are of string type
        df_nlp[TEXT_COLUMN] = df_nlp[TEXT_COLUMN].astype(str)
        if TITLE_COLUMN in df_nlp.columns:
            df_nlp[TITLE_COLUMN] = df_nlp[TITLE_COLUMN].astype(str)

        print(f"\n--- Data Types of Key Columns after initial processing ('df_nlp') ---")
        if RATING_COLUMN in df_nlp.columns:
            print(df_nlp[[TEXT_COLUMN, TITLE_COLUMN, RATING_COLUMN]].info())
        else:
            print(df_nlp[[TEXT_COLUMN, TITLE_COLUMN]].info())
            
        print(f"\n--- First 5 rows of 'df_nlp' (our main working DataFrame for now) ---")
        print(df_nlp.head())
else:
    print("Skipping preliminary data selection due to data loading issues.")


Number of missing values in 'Review Text' before handling: 825
Missing values in 'Title' after filling with empty string: 0

--- Data Types of Key Columns after initial processing ('df_nlp') ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23465 entries, 0 to 23464
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Review Text  23465 non-null  object
 1   Title        23465 non-null  object
 2   Rating       23465 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 550.1+ KB
None

--- First 5 rows of 'df_nlp' (our main working DataFrame for now) ---
   Clothing ID  Age                    Title  \
0          767   33                            
1         1080   34                            
2         1077   60  Some major design flaws   
3         1049   50         My favorite buy!   
4          847   47         Flattering shirt   

                                                                       Rev

In [29]:
df_nlp.to_csv(os.path.join(DATA_DIR, 'reviews_nlp_ready_step1.csv'), index=False)
print("df_nlp saved to 'reviews_nlp_ready_step1.csv'")

df_nlp saved to 'reviews_nlp_ready_step1.csv'
