In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from fuzzywuzzy import process

# Load dataset # Update with correct path
df = pd.read_csv("Tours_and_Travels.csv")

# Display basic info
df.info()

# Handle missing values
# Fill missing Review_Text with "No review provided"
df['Review_Text'].fillna("No review provided", inplace=True)

# Impute missing numerical values
num_imputer = SimpleImputer(strategy='median')
df['Customer_Age'] = num_imputer.fit_transform(df[['Customer_Age']])
df['Rating'] = df['Rating'].fillna(df['Rating'].mode()[0])  # Fill with mode

# Detect and remove duplicates
df.drop_duplicates(inplace=True)

# Handle inconsistent data
# Ensure Rating values are between 1-5
df['Rating'] = np.clip(df['Rating'], 1, 5)

# Standardize Tour_Package names using fuzzy matching
def correct_tour_name(name, choices):
    return process.extractOne(name, choices)[0] if pd.notna(name) else name

tour_names = df['Tour_Package'].dropna().unique()
df['Tour_Package'] = df['Tour_Package'].apply(lambda x: correct_tour_name(x, tour_names))

# Identify and handle outliers
# Boxplot for detecting anomalies
plt.figure(figsize=(10, 5))
sns.boxplot(x=df['Package_Price'])
plt.show()

# Removing extreme outliers using IQR
Q1 = df['Package_Price'].quantile(0.25)
Q3 = df['Package_Price'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df = df[(df['Package_Price'] >= lower_bound) & (df['Package_Price'] <= upper_bound)]

# Prepare data for analysis
# Convert categorical data into numerical format
encoder = LabelEncoder()
df['Tour_Package'] = encoder.fit_transform(df['Tour_Package'])

# Save cleaned data
df.to_csv("Cleaned_Travel_Reviews.csv", index=False)
print("Data cleaning complete. File saved as 'Cleaned_Travel_Reviews.csv'")

ModuleNotFoundError: No module named 'fuzzywuzzy'