In [8]:
import os
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK resources used by this notebook, each exactly once.
# (Previously 'punkt' and 'stopwords' were requested twice per run.)
for _resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(_resource)

def _english_stop_words():
    """Return the NLTK English stopword set, building it on first use only.

    The set is memoised on the function object so that applying
    ``clean_text`` to many rows does not rebuild it for every row.
    """
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def clean_text(text):
    """Normalise one review into lowercase, space-separated content words.

    Args:
        text: Raw review text. Any non-string value (e.g. a missing cell)
            is treated as having no content.

    Returns:
        str: The surviving tokens joined by single spaces, or "" when
        ``text`` is not a string or no token survives filtering.
    """
    if not isinstance(text, str):
        return ""

    # Replace every non-letter with a space instead of deleting it, so words
    # separated only by punctuation or digits ("good,bad") are not fused
    # into a single bogus token ("goodbad").
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Lowercase after stripping so only ASCII letters are ever case-folded.
    text = text.lower()

    # Tokenize and drop stopwords (set is built once, not per call).
    stop_words = _english_stop_words()
    return ' '.join(word for word in word_tokenize(text) if word not in stop_words)

def preprocess_data(input_file, output_file):
    """Clean the scraped reviews CSV and write the result to a new CSV.

    Adds a ``cleaned_review`` column (``clean_text`` applied to ``review``)
    and rewrites ``date`` as ``YYYY-MM-DD`` strings.

    Args:
        input_file: Path of the raw CSV to read. It must contain ``review``
            and ``date`` columns.
        output_file: Path of the CSV to write. Its parent directory is
            created if it does not exist.
    """
    # Create the directory the output actually goes to, rather than a
    # hard-coded 'data' folder that may not match ``output_file``.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Load the file the caller asked for; the parameter used to be ignored
    # in favour of a machine-specific absolute path.
    df = pd.read_csv(input_file)

    # Clean review text
    df['cleaned_review'] = df['review'].apply(clean_text)

    # Ensure proper date format
    df['date'] = pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d')

    # Save cleaned data
    df.to_csv(output_file, index=False)
    print(f"Preprocessed data saved to {output_file}")

if __name__ == "__main__":
    # Raw scraped reviews in, cleaned reviews out (paths relative to the working directory).
    preprocess_data('data/bank_reviews_raw.csv', 'data/bank_reviews_clean.csv')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Preprocessed data saved to data/bank_reviews_clean.csv


In [11]:
import pandas as pd

# Columns whose share of missing values exceeds this percentage are flagged.
MISSING_THRESHOLD_PCT = 5

# Load your dataset
df = pd.read_csv(r"D:\Project\banking-app-reviews-analysis\data\bank_reviews_raw.csv")

# Calculate total number of rows
total_rows = len(df)

# Calculate missing values per column
missing_counts = df.isnull().sum()

# Calculate missing percentages
missing_percentages = (missing_counts / total_rows) * 100

# Print missing percentages
print("Missing data (%):")
print(missing_percentages)

# Identify columns exceeding the threshold
above_5_percent = missing_percentages[missing_percentages > MISSING_THRESHOLD_PCT]

if total_rows == 0:
    # With no rows every percentage is 0/0 = NaN, and NaN never compares
    # greater than the threshold, so without this branch an empty file
    # would be reported as fully healthy.
    print("\n⚠️ Dataset has no rows; missing-data percentages are undefined.")
elif not above_5_percent.empty:
    print(f"\n❌ Columns with more than {MISSING_THRESHOLD_PCT}% missing data:")
    print(above_5_percent)
else:
    print(f"\n✅ All columns have {MISSING_THRESHOLD_PCT}% or less missing data.")


Missing data (%):
review    0.0
rating    0.0
date      0.0
bank      0.0
source    0.0
dtype: float64

✅ All columns have 5% or less missing data.
